Merge branch 'main' into gaudi_backend_pa

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Wang, Yi A 2025-04-10 18:20:28 -07:00
commit 610dd200e5
61 changed files with 3105 additions and 3396 deletions

Cargo.lock (generated)

@@ -4650,7 +4650,7 @@ dependencies = [
[[package]]
name = "text-generation-backends-trtllm"
version = "3.2.1-dev0"
version = "3.2.3-dev0"
dependencies = [
"async-trait",
"clap 4.5.32",
@@ -4671,7 +4671,7 @@ dependencies = [
[[package]]
name = "text-generation-benchmark"
version = "3.2.1-dev0"
version = "3.2.3-dev0"
dependencies = [
"average",
"clap 4.5.32",
@@ -4691,7 +4691,7 @@ dependencies = [
[[package]]
name = "text-generation-client"
version = "3.2.1-dev0"
version = "3.2.3-dev0"
dependencies = [
"async-trait",
"base64 0.22.1",
@@ -4709,7 +4709,7 @@ dependencies = [
[[package]]
name = "text-generation-launcher"
version = "3.2.1-dev0"
version = "3.2.3-dev0"
dependencies = [
"clap 4.5.32",
"ctrlc",
@@ -4730,7 +4730,7 @@ dependencies = [
[[package]]
name = "text-generation-router"
version = "3.2.1-dev0"
version = "3.2.3-dev0"
dependencies = [
"anyhow",
"async-stream",
@@ -4782,7 +4782,7 @@ dependencies = [
[[package]]
name = "text-generation-router-llamacpp"
version = "3.2.1-dev0"
version = "3.2.3-dev0"
dependencies = [
"async-trait",
"bindgen 0.71.1",
@@ -4800,7 +4800,7 @@ dependencies = [
[[package]]
name = "text-generation-router-v2"
version = "3.2.1-dev0"
version = "3.2.3-dev0"
dependencies = [
"async-stream",
"async-trait",
@@ -4849,7 +4849,7 @@ dependencies = [
[[package]]
name = "text-generation-router-v3"
version = "3.2.1-dev0"
version = "3.2.3-dev0"
dependencies = [
"async-stream",
"async-trait",

Cargo.toml

@@ -21,7 +21,7 @@ default-members = [
resolver = "2"
[workspace.package]
version = "3.2.1-dev0"
version = "3.2.3-dev0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

Dockerfile

@@ -65,7 +65,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
ENV PATH="$PATH:/root/.local/bin"
RUN uv python install ${PYTHON_VERSION}
RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} pip setuptools packaging
RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} torchvision pip setuptools packaging
ENV VIRTUAL_ENV=/usr/src/.venv/
ENV PATH="$PATH:/usr/src/.venv/bin/"
@@ -165,8 +165,9 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
git \
&& rm -rf /var/lib/apt/lists/*
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="$PATH:/root/.local/bin"
# RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# ENV PATH="$PATH:/root/.local/bin"
COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
# Install flash-attention dependencies
# RUN pip install einops --no-cache-dir
@@ -183,12 +184,12 @@ COPY server server
COPY server/Makefile server/Makefile
ENV HF_KERNELS_CACHE=/kernels
RUN cd server && \
uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project --active && \
uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --extra torch --no-install-project --active && \
make gen-server-raw && \
kernels download .
RUN cd server && \
uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \
uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --extra torch --active --python=${PYTHON_VERSION} && \
uv pip install nvidia-nccl-cu12==2.25.1 && \
pwd && \
text-generation-server --help

Dockerfile_amd

@@ -41,303 +41,244 @@ COPY backends backends
COPY launcher launcher
RUN cargo build --profile release-opt --frozen
# Text Generation Inference base image for RoCm
FROM rocm/dev-ubuntu-22.04:6.2 AS base
FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS base
ARG HIPBLASLT_BRANCH="4d40e36"
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG LEGACY_HIPBLASLT_OPTION=
ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="b7d29fb"
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG AITER_BRANCH="21d47a9"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ARG PYTHON_VERSION=3.11
RUN mkdir -p /app
WORKDIR /app
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
ccache \
curl \
git \
make \
libmsgpack-dev \
libssl-dev \
llvm-dev \
g++ \
# Needed to build VLLM & flash.
rocthrust-dev \
hipsparse-dev \
hipblas-dev \
hipcub-dev \
rocblas-dev \
hiprand-dev \
hipfft-dev \
rocrand-dev \
miopen-hip-dev \
hipsolver-dev \
rccl-dev \
cmake \
python3.11-venv && \
rm -rf /var/lib/apt/lists/*
build-essential \
ca-certificates \
ccache \
curl \
git \
ninja-build \
cmake \
software-properties-common \
python3.11-dev \
python3.11-venv && \
rm -rf /var/lib/apt/lists/*
# Keep in sync with `server/pyproject.toml`
ARG MAMBA_VERSION=23.1.0-1
ARG PYTHON_VERSION='3.11.10'
# Automatically set by buildx
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH
COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
ENV PATH="$PATH:/root/.local/bin"
RUN uv python install ${PYTHON_VERSION}
RUN uv venv --python ${PYTHON_VERSION} && uv pip install pip setuptools packaging
ENV VIRTUAL_ENV=/usr/src/.venv/
ENV PATH="$PATH:/usr/src/.venv/bin/"
ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
RUN . .venv/bin/activate && pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
# TGI seems to require libssl.so.1.1 instead of libssl.so.3, so we can't use Ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
# Install mamba
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MAMBA_ARCH=aarch64 ;; \
*) MAMBA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
bash ~/mambaforge.sh -b -p /opt/conda && \
mamba init && \
rm ~/mambaforge.sh
# RUN conda install intel::mkl-static intel::mkl-include
# Install pytorch
# On arm64 we exit with an error code
RUN case ${TARGETPLATFORM} in \
"linux/arm64") exit 1 ;; \
*) /opt/conda/bin/conda update -y conda && \
/opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
esac && \
/opt/conda/bin/conda clean -ya
# Install flash-attention, torch dependencies
RUN python3 -m pip install --upgrade pip uv && pip install numpy einops ninja joblib msgpack cmake --no-cache-dir && rm -rf /var/lib/apt/lists/*
RUN conda install mkl=2021
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/lib/python3.11/site-packages/torch/lib:/opt/conda/lib/
ARG COMMON_WORKDIR=/
WORKDIR ${COMMON_WORKDIR}
# Install HIPBLASLt
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH="e6da924"
RUN git clone https://github.com/ROCm/hipBLASLt.git \
&& cd hipBLASLt \
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
ARG LEGACY_HIPBLASLT_OPTION
RUN git clone https://github.com/ROCm/hipBLAS-common.git
RUN . .venv/bin/activate && cd hipBLAS-common \
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
&& mkdir build \
&& cd build \
&& cmake .. \
&& make package \
&& dpkg -i ./*.deb
RUN git clone https://github.com/ROCm/hipBLASLt
RUN . .venv/bin/activate && cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \
&& SCCACHE_IDLE_TIMEOUT=1800 ./install.sh --architecture ${PYTORCH_ROCM_ARCH} --legacy_hipblas_direct \
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
&& cd build/release \
&& make package
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
FROM scratch AS export_hipblaslt
ARG COMMON_WORKDIR
COPY --from=build_hipblaslt ${COMMON_WORKDIR}/hipBLASLt/build/release/*.deb /
# RCCL build stages
FROM base AS build_rccl
ARG RCCL_BRANCH="rocm-6.2.0"
RUN git clone https://github.com/ROCm/rccl \
&& cd rccl \
ARG RCCL_BRANCH
ARG RCCL_REPO
RUN git clone ${RCCL_REPO}
RUN . .venv/bin/activate && cd rccl \
&& git checkout ${RCCL_BRANCH} \
&& ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
FROM scratch AS export_rccl
ARG COMMON_WORKDIR
COPY --from=build_rccl ${COMMON_WORKDIR}/rccl/build/release/*.deb /
RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
# Triton build stages
FROM base AS build_triton
ARG TRITON_BRANCH="e192dba"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
RUN python3 -m pip install ninja cmake wheel pybind11 && git clone ${TRITON_REPO} \
&& cd triton \
ARG TRITON_BRANCH
ARG TRITON_REPO
RUN git clone ${TRITON_REPO}
RUN . .venv/bin/activate && cd triton \
&& git checkout ${TRITON_BRANCH} \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_triton
ARG COMMON_WORKDIR
COPY --from=build_triton ${COMMON_WORKDIR}/triton/python/dist/*.whl /
RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
# # AMD-SMI build stages
FROM base AS build_amdsmi
RUN cd /opt/rocm/share/amd_smi \
RUN . .venv/bin/activate && cd /opt/rocm/share/amd_smi \
&& pip wheel . --wheel-dir=dist
FROM scratch AS export_amdsmi
COPY --from=build_amdsmi /opt/rocm/share/amd_smi/dist/*.whl /
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN git clone ${PYTORCH_REPO} pytorch
RUN . .venv/bin/activate && cd pytorch && git checkout ${PYTORCH_BRANCH} && \
pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN . .venv/bin/activate && cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN git clone ${FA_REPO}
RUN . .venv/bin/activate && cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \
&& cp /app/flash-attention/dist/*.whl /app/install
FROM base as build_pytorch
FROM base AS final
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
dpkg -i /install/*deb \
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
dpkg -i /install/*deb \
&& sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
&& sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
. .venv/bin/activate && \
pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
. .venv/bin/activate && \
pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
. .venv/bin/activate && \
pip install /install/*.whl
RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \
if ls /install/*.deb; then \
dpkg -i /install/*.deb \
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \
fi
ARG AITER_REPO
ARG AITER_BRANCH
RUN git clone --recursive ${AITER_REPO}
RUN . .venv/bin/activate && cd aiter \
&& git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
ARG BUILD_ENVIRONMENT=pytorch-linux-jammy-rocm6.2-py3.11
ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# A commit to fix the output scaling factor issue in _scaled_mm
# Not yet in 2.5.0-rc1
ARG PYTORCH_BRANCH="cedc116"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
RUN git clone ${PYTORCH_REPO} pytorch \
&& cd pytorch && git checkout ${PYTORCH_BRANCH} && git submodule update --init --recursive \
&& pip install -r requirements.txt --no-cache-dir \
&& python tools/amd_build/build_amd.py \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch as export_pytorch
ARG COMMON_WORKDIR
COPY --from=build_pytorch ${COMMON_WORKDIR}/pytorch/dist/*.whl /
FROM base AS install_deps
ARG COMMON_WORKDIR
# Install hipblaslt
RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \
if ls /install/*.deb; then \
dpkg -i /install/*.deb \
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \
fi
RUN --mount=type=bind,from=export_rccl,src=/,target=/install \
if ls /install/*.deb; then \
dpkg -i /install/*.deb \
# RCCL needs to be installed twice
&& dpkg -i /install/*.deb \
&& sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
&& sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status; \
fi
RUN --mount=type=bind,from=export_triton,src=/,target=/install \
if ls /install/*.whl; then \
# Preemptively uninstall to prevent pip same-version no-installs
pip uninstall -y triton \
&& pip install /install/*.whl; \
fi
RUN --mount=type=bind,from=export_amdsmi,src=/,target=/install \
# Preemptively uninstall to prevent pip same-version no-installs
pip uninstall -y amdsmi \
&& pip install /install/*.whl;
RUN --mount=type=bind,from=export_pytorch,src=/,target=/install \
if ls /install/*.whl; then \
# Preemptively uninstall to prevent pip same-version no-installs
pip uninstall -y torch torchvision \
&& pip install /install/*.whl; \
fi
FROM install_deps AS kernel-builder
RUN rm -rf /var/lib/apt/lists/*
FROM final AS kernel-builder
# # Build vllm kernels
FROM kernel-builder AS vllm-builder
WORKDIR /usr/src
COPY server/Makefile-vllm Makefile
RUN pip install setuptools_scm
RUN . .venv/bin/activate && pip install setuptools_scm
# Build specific version of vllm
RUN make build-vllm-rocm
# Build Flash Attention v2 kernels
FROM kernel-builder AS flash-att-v2-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att-v2 Makefile
# Build specific version of flash attention v2
RUN make build-flash-attention-v2-rocm
RUN . .venv/bin/activate && make build-vllm-rocm
# Build Transformers CUDA kernels (gpt-neox and bloom)
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
RUN python setup.py build
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
# Build exllama kernels
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .
RUN python setup.py build
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
# Build exllama v2 kernels
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .
RUN python setup.py build
RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
FROM kernel-builder AS marlin-kernels
WORKDIR /usr/src
ENV MARLIN_KERNELS_BRANCH=v0.3.6
ENV VLLM_TARGET_DEVICE=rocm
RUN git clone https://github.com/danieldk/marlin-kernels.git && \
RUN . .venv/bin/activate && git clone https://github.com/danieldk/marlin-kernels.git && \
cd marlin-kernels && \
git checkout ${MARLIN_KERNELS_BRANCH} && \
python setup.py install
python3 setup.py bdist_wheel --dist-dir=dist
FROM kernel-builder AS moe-kernels
WORKDIR /usr/src
ENV MOE_KERNELS_BRANCH=v0.8.2
ENV VLLM_TARGET_DEVICE=rocm
RUN git clone https://github.com/danieldk/moe-kernels.git && \
RUN . .venv/bin/activate && git clone https://github.com/danieldk/moe-kernels.git && \
cd moe-kernels && \
git checkout ${MOE_KERNELS_BRANCH} && \
python setup.py install
python3 setup.py bdist_wheel --dist-dir=dist
FROM install_deps AS base-copy
FROM final AS base-copy
# Text Generation Inference base env
ENV HF_HOME=/data \
HF_HUB_ENABLE_HF_TRANSFER=1 \
PORT=80
# Copy builds artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from marlin kernels
COPY --from=marlin-kernels /usr/src/marlin-kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from moe kernels
COPY --from=moe-kernels /usr/src/moe-kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
ENV VIRTUAL_ENV=/app/.venv/
ENV PATH="$PATH:/app/.venv/bin/"
# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
ENV UV_SYSTEM_PYTHON=1
RUN cd server && \
pip install -U pip uv && \
uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --no-install-project && \
. ./.venv/bin/activate && \
uv pip install grpcio-tools mypy-protobuf && \
uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir && \
make gen-server-raw
RUN cd server && \
uv sync --frozen --extra gen --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines && \
. ./.venv/bin/activate && \
pwd && \
text-generation-server --help
RUN --mount=type=bind,from=vllm-builder,src=/app/vllm/dist,target=/install \
uv pip install /install/*.whl
RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
uv pip install /install/*.whl
RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
uv pip install /install/*.whl
RUN --mount=type=bind,from=exllama-kernels-builder,src=/app/dist,target=/install \
uv pip install /install/*.whl
RUN --mount=type=bind,from=exllamav2-kernels-builder,src=/app/dist,target=/install \
uv pip install /install/*.whl
RUN --mount=type=bind,from=marlin-kernels,src=/app/marlin-kernels/dist,target=/install \
uv pip install /install/*.whl
RUN --mount=type=bind,from=moe-kernels,src=/app/moe-kernels/dist,target=/install \
uv pip install /install/*.whl
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
# AWS Sagemaker compatible image
FROM base AS sagemaker
@@ -368,4 +309,6 @@ COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh
ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib"
ENV PYTHONPATH=/app/.venv/lib/python3.11/site-packages
# CMD ["--json-output"]

README.md

@@ -84,7 +84,7 @@ model=HuggingFaceH4/zephyr-7b-beta
volume=$PWD/data
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model
```
And then you can make requests like
@@ -121,7 +121,7 @@ curl localhost:8080/v1/chat/completions \
**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. To run the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`; note that CPU is not the intended platform for this project, so performance might be subpar.
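For reference, the CPU-only variant of the command above would look roughly like this (a sketch reusing the `$model` and `$volume` variables defined earlier):
```bash
# No GPUs available: drop --gpus all and disable the custom CUDA kernels
docker run --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:3.2.3 \
    --model-id $model --disable-custom-kernels
```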
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1-rocm --model-id $model` instead of the command above.
**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3-rocm --model-id $model` instead of the command above.
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
```
@@ -152,7 +152,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
token=<your cli READ token>
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model
ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model
```
### A note on Shared Memory (shm)

backends/gaudi/Makefile

@@ -1,6 +1,6 @@
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
root_dir := "${mkfile_dir}/../.."
root_dir := ${mkfile_dir}/../..
HABANA_VERSION := 1.20.0
PYTORCH_VERSION := 2.6.0
@@ -47,3 +47,16 @@ local-dev-install: install-dependencies
make install-server && \
make install-router && \
make install-launcher'
# To run the integration tests, you first need to build the image (make -C backends/gaudi image)
run-integration-tests:
uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
DOCKER_VOLUME=${root_dir}/data \
HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
# This captures the expected outputs for the integration tests, offering an easy way to add more models to them
capture-expected-outputs-for-integration-tests:
DOCKER_VOLUME=${root_dir}/data \
HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests/capture_expected_outputs.py

backends/gaudi/README.md

@@ -96,3 +96,47 @@ curl 127.0.0.1:8080/generate \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-H 'Content-Type: application/json'
```
### Integration tests
To run the integration tests, you first need to build the image:
```bash
make -C backends/gaudi image
```
Then run the integration tests with:
```bash
make -C backends/gaudi run-integration-tests
```
To capture the expected outputs for the integration tests, you can run the following command:
```bash
make -C backends/gaudi capture-expected-outputs-for-integration-tests
```
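Each capture run updates `server/integration-tests/expected_outputs.json`, keyed by test name. A minimal sketch of the entry shape that the capture script writes, with the literal output strings elided:
```python
# Illustrative shape of one expected_outputs.json entry; the output
# strings below are placeholders, not captured values.
entry = {
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",
    "input": "What is Deep Learning?",
    "greedy_output": "...",  # single greedy request, 32 new tokens
    "batch_outputs": ["...", "...", "...", "..."],  # four identical requests
    "args": ["--max-input-tokens", "512", "--max-total-tokens", "1024"],
}
```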
#### How the integration tests work
The integration tests work as follows:
1. Start a TGI server in a container, using a command similar to:
```bash
docker run --runtime=habana --ipc=host --cap-add=sys_nice \
-p 8080:80 -v $volume:/data \
-e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
tgi-gaudi --model-id $model \
--max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 4 --max-batch-prefill-tokens 2048
```
2. Send a `/generate` request to the server, similar to:
```bash
curl 127.0.0.1:8080/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-H 'Content-Type: application/json'
```
3. Check the output of the server against the expected output:
```python
assert curl_output == expected_output
```
This is then repeated for a set of models and configurations.
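Note that the batch comparison is tolerant of small decoding differences rather than a strict string equality: `test_model.py` bounds the Levenshtein distance between generated and expected outputs. A minimal sketch of that check (the `outputs_match` helper is illustrative, not part of the test suite):
```python
from Levenshtein import distance as levenshtein_distance

def outputs_match(generated: str, expected: str) -> bool:
    # Allow at most two substitutions or additions, as in test_model.py
    return levenshtein_distance(generated, expected) < 3
```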

backends/gaudi/server/integration-tests/capture_expected_outputs.py

@@ -0,0 +1,85 @@
import json
import os
from typing import Dict, Any, Generator
import pytest
from test_model import TEST_CONFIGS
UNKNOWN_CONFIGS = {
name: config
for name, config in TEST_CONFIGS.items()
if config["expected_greedy_output"] == "unknown"
or config["expected_batch_output"] == "unknown"
}
@pytest.fixture(scope="module", params=UNKNOWN_CONFIGS.keys())
def test_config(request) -> Dict[str, Any]:
"""Fixture that provides model configurations for testing."""
test_config = UNKNOWN_CONFIGS[request.param]
test_config["test_name"] = request.param
return test_config
@pytest.fixture(scope="module")
def test_name(test_config):
yield test_config["test_name"]
@pytest.fixture(scope="module")
def tgi_service(launcher, test_config, test_name) -> Generator:
"""Fixture that provides a TGI service for testing."""
with launcher(test_config["model_id"], test_name) as service:
yield service
@pytest.mark.asyncio
async def test_capture_expected_outputs(tgi_service, test_config, test_name):
"""Test that captures expected outputs for models with unknown outputs."""
print(f"Testing {test_name} with {test_config['model_id']}")
# Wait for service to be ready
await tgi_service.health(1000)
client = tgi_service.client
# Test single request (greedy)
print("Testing single request...")
response = await client.generate(
test_config["input"],
max_new_tokens=32,
)
greedy_output = response.generated_text
# Test multiple requests (batch)
print("Testing batch requests...")
responses = []
for _ in range(4):
response = await client.generate(
test_config["input"],
max_new_tokens=32,
)
responses.append(response.generated_text)
# Store results in a JSON file
output_file = "server/integration-tests/expected_outputs.json"
results = {}
# Try to load existing results if file exists
if os.path.exists(output_file):
with open(output_file, "r") as f:
results = json.load(f)
# Update results for this model
results[test_name] = {
"model_id": test_config["model_id"],
"input": test_config["input"],
"greedy_output": greedy_output,
"batch_outputs": responses,
"args": test_config["args"],
}
# Save updated results
with open(output_file, "w") as f:
json.dump(results, f, indent=2)
print(f"\nResults for {test_name} saved to {output_file}")

backends/gaudi/server/integration-tests/conftest.py

@@ -0,0 +1,292 @@
import asyncio
import contextlib
import os
import shlex
import subprocess
import sys
import threading
import time
from tempfile import TemporaryDirectory
from typing import List
import socket
import docker
import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
from loguru import logger
from test_model import TEST_CONFIGS
from text_generation import AsyncClient
from text_generation.types import Response
# Use the latest image from the local docker build
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
HF_TOKEN = os.getenv("HF_TOKEN", None)
assert (
HF_TOKEN is not None
), "HF_TOKEN is not set; please set it, as some models are gated and the tests will fail without it"
if DOCKER_VOLUME is None:
logger.warning(
"DOCKER_VOLUME is not set; the tests will redownload the models on each run. Consider setting it to speed up testing."
)
LOG_LEVEL = os.getenv("LOG_LEVEL", "info")
BASE_ENV = {
"HF_HUB_ENABLE_HF_TRANSFER": "1",
"LOG_LEVEL": LOG_LEVEL,
"HF_TOKEN": os.getenv("HF_TOKEN", None),
}
HABANA_RUN_ARGS = {
"runtime": "habana",
"ipc_mode": "host",
"cap_add": ["sys_nice"],
}
logger.add(
sys.stderr,
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
level="INFO",
)
def stream_container_logs(container, test_name):
"""Stream container logs in a separate thread."""
try:
for log in container.logs(stream=True, follow=True):
print(
f"[TGI Server Logs - {test_name}] {log.decode('utf-8')}",
end="",
file=sys.stderr,
flush=True,
)
except Exception as e:
logger.error(f"Error streaming container logs: {str(e)}")
class LauncherHandle:
def __init__(self, port: int):
self.client = AsyncClient(f"http://localhost:{port}", timeout=3600)
def _inner_health(self):
raise NotImplementedError
async def health(self, timeout: int = 60):
assert timeout > 0
start_time = time.time()
logger.info(f"Starting health check with timeout of {timeout}s")
for attempt in range(timeout):
if not self._inner_health():
logger.error("Launcher crashed during health check")
raise RuntimeError("Launcher crashed")
try:
await self.client.generate("test")
elapsed = time.time() - start_time
logger.info(f"Health check passed after {elapsed:.1f}s")
return
except (ClientConnectorError, ClientOSError, ServerDisconnectedError) as e:
if attempt == timeout - 1:
logger.error(f"Health check failed after {timeout}s: {str(e)}")
raise RuntimeError(f"Health check failed: {str(e)}")
if attempt % 10 == 0 and attempt != 0: # Only log every 10th attempt
logger.debug(
f"Connection attempt {attempt}/{timeout} failed: {str(e)}"
)
time.sleep(1)
except Exception as e:
logger.error(f"Unexpected error during health check: {str(e)}")
# Get full traceback for debugging
import traceback
logger.error(f"Full traceback:\n{traceback.format_exc()}")
raise
class ContainerLauncherHandle(LauncherHandle):
def __init__(self, docker_client, container_name, port: int):
super(ContainerLauncherHandle, self).__init__(port)
self.docker_client = docker_client
self.container_name = container_name
def _inner_health(self) -> bool:
try:
container = self.docker_client.containers.get(self.container_name)
status = container.status
if status not in ["running", "created"]:
logger.warning(f"Container status is {status}")
# Get container logs for debugging
logs = container.logs().decode("utf-8")
logger.debug(f"Container logs:\n{logs}")
return status in ["running", "created"]
except Exception as e:
logger.error(f"Error checking container health: {str(e)}")
return False
class ProcessLauncherHandle(LauncherHandle):
def __init__(self, process, port: int):
super(ProcessLauncherHandle, self).__init__(port)
self.process = process
def _inner_health(self) -> bool:
return self.process.poll() is None
@pytest.fixture(scope="module")
def data_volume():
tmpdir = TemporaryDirectory()
yield tmpdir.name
try:
# Clean up the temporary directory with sudo, as it contains root-owned files created by the container
subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}"), check=True)
except subprocess.CalledProcessError as e:
logger.error(f"Error cleaning up temporary directory: {str(e)}")
@pytest.fixture(scope="module")
def launcher(data_volume):
@contextlib.contextmanager
def docker_launcher(
model_id: str,
test_name: str,
):
logger.info(
f"Starting docker launcher for model {model_id} and test {test_name}"
)
# Get a random available port
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", 0))
s.listen(1)
port = s.getsockname()[1]
return port
port = get_free_port()
logger.debug(f"Using port {port}")
client = docker.from_env()
container_name = f"tgi-gaudi-test-{test_name.replace('/', '-')}"
try:
container = client.containers.get(container_name)
logger.info(
f"Stopping existing container {container_name} for test {test_name}"
)
container.stop()
container.wait()
except NotFound:
pass
except Exception as e:
logger.error(f"Error handling existing container: {str(e)}")
model_name = next(
name for name, cfg in TEST_CONFIGS.items() if cfg["model_id"] == model_id
)
tgi_args = TEST_CONFIGS[model_name]["args"].copy()
env = BASE_ENV.copy()
# Add model_id to env
env["MODEL_ID"] = model_id
# Add env config that is defined in the fixture parameter
if "env_config" in TEST_CONFIGS[model_name]:
env.update(TEST_CONFIGS[model_name]["env_config"].copy())
volumes = [f"{DOCKER_VOLUME}:/data"]
logger.debug(f"Using volume {volumes}")
try:
logger.info(f"Creating container with name {container_name}")
# Log the equivalent docker run command for debugging; this is not actually executed
container = client.containers.run(
DOCKER_IMAGE,
command=tgi_args,
name=container_name,
environment=env,
detach=True,
volumes=volumes,
ports={"80/tcp": port},
**HABANA_RUN_ARGS,
)
logger.info(f"Container {container_name} started successfully")
# Start log streaming in a background thread
log_thread = threading.Thread(
target=stream_container_logs,
args=(container, test_name),
daemon=True, # This ensures the thread will be killed when the main program exits
)
log_thread.start()
# Add a small delay to allow container to initialize
time.sleep(2)
# Check container status after creation
status = container.status
logger.debug(f"Initial container status: {status}")
if status not in ["running", "created"]:
logs = container.logs().decode("utf-8")
logger.error(f"Container failed to start properly. Logs:\n{logs}")
yield ContainerLauncherHandle(client, container.name, port)
except Exception as e:
logger.error(f"Error starting container: {str(e)}")
# Get full traceback for debugging
import traceback
logger.error(f"Full traceback:\n{traceback.format_exc()}")
raise
finally:
try:
container = client.containers.get(container_name)
logger.info(f"Stopping container {container_name}")
container.stop()
container.wait()
container_output = container.logs().decode("utf-8")
print(container_output, file=sys.stderr)
container.remove()
logger.info(f"Container {container_name} removed successfully")
except NotFound:
pass
except Exception as e:
logger.warning(f"Error cleaning up container: {str(e)}")
return docker_launcher
@pytest.fixture(scope="module")
def generate_load():
async def generate_load_inner(
client: AsyncClient, prompt: str, max_new_tokens: int, n: int
) -> List[Response]:
try:
futures = [
client.generate(
prompt,
max_new_tokens=max_new_tokens,
decoder_input_details=True,
)
for _ in range(n)
]
return await asyncio.gather(*futures)
except Exception as e:
logger.error(f"Error generating load: {str(e)}")
raise
return generate_load_inner

backends/gaudi/server/integration-tests/pytest.ini

@@ -0,0 +1,2 @@
[pytest]
asyncio_mode = auto

backends/gaudi/server/integration-tests/requirements.txt

@@ -0,0 +1,7 @@
pytest >= 8.3.5
pytest-asyncio >= 0.26.0
docker >= 7.1.0
Levenshtein >= 0.27.1
loguru >= 0.7.3
aiohttp >= 3.11.14
text-generation

backends/gaudi/server/integration-tests/test_model.py

@@ -0,0 +1,276 @@
from typing import Any, Dict
from text_generation import AsyncClient
import pytest
from Levenshtein import distance as levenshtein_distance
# The "args" config is not optimized for speed; it only checks that inference works for the different model architectures
TEST_CONFIGS = {
"meta-llama/Llama-3.1-8B-Instruct-shared": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
"input": "What is Deep Learning?",
"expected_greedy_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use",
"expected_batch_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use",
"args": [
"--sharded",
"true",
"--num-shard",
"8",
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"8",
"--max-batch-prefill-tokens",
"2048",
],
},
"meta-llama/Llama-3.1-8B-Instruct": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
"input": "What is Deep Learning?",
"expected_greedy_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
"expected_batch_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
"env_config": {},
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
"--max-batch-prefill-tokens",
"2048",
],
},
"meta-llama/Llama-2-7b-chat-hf": {
"model_id": "meta-llama/Llama-2-7b-chat-hf",
"input": "What is Deep Learning?",
"expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
"expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
"--max-batch-prefill-tokens",
"2048",
],
},
"mistralai/Mistral-7B-Instruct-v0.3": {
"model_id": "mistralai/Mistral-7B-Instruct-v0.3",
"input": "What is Deep Learning?",
"expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
"expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
"--max-batch-prefill-tokens",
"2048",
],
},
"bigcode/starcoder2-3b": {
"model_id": "bigcode/starcoder2-3b",
"input": "What is Deep Learning?",
"expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
"expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
"--max-batch-prefill-tokens",
"2048",
],
},
"google/gemma-7b-it": {
"model_id": "google/gemma-7b-it",
"input": "What is Deep Learning?",
"expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
"expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
"--max-batch-prefill-tokens",
"2048",
],
},
"Qwen/Qwen2-0.5B-Instruct": {
"model_id": "Qwen/Qwen2-0.5B-Instruct",
"input": "What is Deep Learning?",
"expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
"expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
"--max-batch-prefill-tokens",
"2048",
],
},
"tiiuae/falcon-7b-instruct": {
"model_id": "tiiuae/falcon-7b-instruct",
"input": "What is Deep Learning?",
"expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
"expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
],
},
"microsoft/phi-1_5": {
"model_id": "microsoft/phi-1_5",
"input": "What is Deep Learning?",
"expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
"expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
],
},
"openai-community/gpt2": {
"model_id": "openai-community/gpt2",
"input": "What is Deep Learning?",
"expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
"expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
],
},
"facebook/opt-125m": {
"model_id": "facebook/opt-125m",
"input": "What is Deep Learning?",
"expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
"expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
],
},
"EleutherAI/gpt-j-6b": {
"model_id": "EleutherAI/gpt-j-6b",
"input": "What is Deep Learning?",
"expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
"expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
"args": [
"--max-input-tokens",
"512",
"--max-total-tokens",
"1024",
"--max-batch-size",
"4",
],
},
}
print(f"Testing {len(TEST_CONFIGS)} models")
@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
def test_config(request) -> Dict[str, Any]:
"""Fixture that provides model configurations for testing."""
test_config = TEST_CONFIGS[request.param]
test_config["test_name"] = request.param
return test_config
@pytest.fixture(scope="module")
def model_id(test_config):
yield test_config["model_id"]
@pytest.fixture(scope="module")
def test_name(test_config):
yield test_config["test_name"]
@pytest.fixture(scope="module")
def expected_outputs(test_config):
return {
"greedy": test_config["expected_greedy_output"],
# "sampling": model_config["expected_sampling_output"],
"batch": test_config["expected_batch_output"],
}
@pytest.fixture(scope="module")
def input(test_config):
return test_config["input"]
@pytest.fixture(scope="module")
def tgi_service(launcher, model_id, test_name):
with launcher(model_id, test_name) as tgi_service:
yield tgi_service
@pytest.fixture(scope="module")
async def tgi_client(tgi_service) -> AsyncClient:
await tgi_service.health(1000)
return tgi_service.client
@pytest.mark.asyncio
async def test_model_single_request(
tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
):
# Bounded greedy decoding without input
response = await tgi_client.generate(
input,
max_new_tokens=32,
)
assert response.details.generated_tokens == 32
assert response.generated_text == expected_outputs["greedy"]
@pytest.mark.asyncio
async def test_model_multiple_requests(
tgi_client, generate_load, expected_outputs, input
):
num_requests = 4
responses = await generate_load(
tgi_client,
input,
max_new_tokens=32,
n=num_requests,
)
assert len(responses) == 4
expected = expected_outputs["batch"]
for r in responses:
assert r.details.generated_tokens == 32
# Compute the similarity with the expectation using the Levenshtein distance.
# We should not have more than two substitutions or additions.
assert levenshtein_distance(r.generated_text, expected) < 3

conftest.py (deleted)

@@ -1,23 +0,0 @@
import pytest
import os
from text_generation_server.pb import generate_pb2
os.environ["USE_PREFIX_CACHING"] = "1"
os.environ["ATTENTION"] = "flashinfer"
@pytest.fixture
def default_pb_parameters():
return generate_pb2.NextTokenChooserParameters(
temperature=1.0,
repetition_penalty=1.0,
top_k=0,
top_p=1.0,
typical_p=1.0,
do_sample=False,
)
@pytest.fixture
def default_pb_stop_parameters():
return generate_pb2.StoppingCriteriaParameters(stop_sequences=[], max_new_tokens=10)

test_bloom.py (deleted)

@@ -1,363 +0,0 @@
import pytest
import torch
from copy import copy
from transformers import AutoTokenizer
from text_generation_server.pb import generate_pb2
from text_generation_server.models.causal_lm import CausalLMBatch
from text_generation_server.utils import weight_hub_files, download_weights
from text_generation_server.models.bloom import BloomCausalLMBatch, BLOOMSharded
from text_generation_server.models.custom_modeling.bloom_modeling import (
BloomForCausalLM,
)
@pytest.fixture(scope="session")
def default_bloom():
model_id = "bigscience/bloom-560m"
revision = "main"
filenames = weight_hub_files(model_id, revision, ".safetensors")
download_weights(filenames, model_id, revision)
return BLOOMSharded(
model_id,
model_class=BloomForCausalLM,
)
@pytest.fixture(scope="session")
def bloom_560m_tokenizer():
return AutoTokenizer.from_pretrained("bigscience/bloom-560m", padding_side="left")
@pytest.fixture
def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
return generate_pb2.Request(
id=0,
inputs="Test",
input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
prefill_logprobs=True,
truncate=100,
parameters=default_pb_parameters,
stopping_parameters=default_pb_stop_parameters,
)
@pytest.fixture
def default_pb_batch(default_pb_request):
return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)
@pytest.fixture
def default_bloom_batch(default_pb_batch, bloom_560m_tokenizer):
return BloomCausalLMBatch.from_pb(
default_pb_batch, bloom_560m_tokenizer, torch.float32, torch.device("cpu")
)
@pytest.fixture
def default_multi_requests_bloom_batch(default_pb_request, bloom_560m_tokenizer):
req_0 = copy(default_pb_request)
req_0.id = 1
req_1 = default_pb_request
req_1.id = 2
req_1.stopping_parameters.max_new_tokens = 5
batch_pb = generate_pb2.Batch(id=0, requests=[req_0, req_1], size=2)
return BloomCausalLMBatch.from_pb(
batch_pb, bloom_560m_tokenizer, torch.float32, torch.device("cpu")
)
def test_batch_from_pb(default_pb_batch, default_bloom_batch):
batch = default_bloom_batch
assert batch.batch_id == default_pb_batch.id
assert batch.requests == default_pb_batch.requests
assert len(batch.input_ids) == default_pb_batch.size
assert batch.input_ids[0][-1] == 10264
assert torch.all(batch.input_ids[0][:-1] == 3)
assert batch.attention_mask[0][0] == 1
assert torch.all(batch.attention_mask[0][1:] == 0)
assert batch.past_key_values is None
assert all(
[
torch.equal(input_ids, all_input_ids[:, 0])
for input_ids, all_input_ids in zip(batch.input_ids, batch.all_input_ids)
]
)
assert batch.input_lengths == [1]
assert len(batch) == default_pb_batch.size
assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch)
assert batch.max_input_length == batch.input_lengths[0]
def test_batch_concatenate_no_prefill(default_bloom_batch):
with pytest.raises(ValueError):
BloomCausalLMBatch.concatenate([default_bloom_batch, default_bloom_batch])
def test_causal_lm_batch_type(default_bloom):
assert default_bloom.batch_type == BloomCausalLMBatch
def test_causal_lm_generate_token(default_bloom, default_bloom_batch):
sequence_length = len(default_bloom_batch.all_input_ids[0])
generations, next_batch, _ = default_bloom.generate_token(default_bloom_batch)
assert len(generations) == len(default_bloom_batch)
assert isinstance(next_batch, CausalLMBatch)
assert not next_batch.keys_head_dim_last
assert len(next_batch.all_input_ids) == len(next_batch)
assert len(next_batch.all_input_ids[0]) == sequence_length + 1
assert len(next_batch.attention_mask[0]) == 11
assert torch.all(next_batch.all_input_ids[0][-2:] == 10264)
assert torch.all(next_batch.all_input_ids[0][:-2] == 3)
assert torch.all(next_batch.attention_mask[0][:2] == 1)
assert torch.all(next_batch.attention_mask[0][2:] == 0)
assert next_batch.input_ids.shape == (len(next_batch), 1)
assert next_batch.input_ids[0, 0] == 10264
assert next_batch.input_lengths == [2]
assert next_batch.max_input_length == next_batch.input_lengths[0]
assert next_batch.past_key_values is not None
assert all(
[p[0].shape == (16, 64, sequence_length) for p in next_batch.past_key_values]
)
assert all(
[p[1].shape == (16, sequence_length, 64) for p in next_batch.past_key_values]
)
assert all([generation.generated_text is None for generation in generations])
assert all([len(generation.prefill_tokens) == 1 for generation in generations])
assert all(
[
token_id.item() == 10264
for generation in generations
for token_id in generation.tokens.token_ids
]
)
assert all(
[
token_text == "Test"
for generation in generations
for token_text in generation.tokens.texts
]
)
assert generations[0].request_id == 0
def test_causal_lm_generate_token_completion(default_bloom, default_bloom_batch):
next_batch = default_bloom_batch
for _ in range(default_bloom_batch.stopping_criterias[0].max_new_tokens - 1):
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert len(generations) == len(default_bloom_batch)
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert (
generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
)
assert generations[0].request_id == default_bloom_batch.requests[0].id
assert (
generations[0].generated_text.generated_tokens
== default_bloom_batch.stopping_criterias[0].max_new_tokens
)
def test_causal_lm_generate_token_completion_multi(
default_bloom, default_multi_requests_bloom_batch
):
next_batch = default_multi_requests_bloom_batch
for i in range(
default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 1
):
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert len(generations) == len(default_multi_requests_bloom_batch)
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 2
assert generations[1].generated_text.text == "TestTestTestTestTest"
assert (
generations[1].request_id == default_multi_requests_bloom_batch.requests[1].id
)
assert (
generations[1].generated_text.generated_tokens
== default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
)
# Copy stopping_criterias before filtering
stopping_criterias = default_multi_requests_bloom_batch.stopping_criterias.copy()
next_batch = next_batch.filter([next_batch.requests[0].id])
for _ in range(
stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1
):
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert (
generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
)
assert (
generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id
)
assert (
generations[0].generated_text.generated_tokens
== default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens
)
def test_batch_concatenate(
default_bloom, default_bloom_batch, default_multi_requests_bloom_batch
):
next_batch_0 = default_bloom_batch
_, next_batch_0, _ = default_bloom.generate_token(next_batch_0)
_, next_batch_0, _ = default_bloom.generate_token(next_batch_0)
next_batch_1 = default_multi_requests_bloom_batch
_, next_batch_1, _ = default_bloom.generate_token(next_batch_1)
# Clone past_key_values before concatenating to compare after,
# because they are removed from the concatenated batches
next_batch_0_past_key_values = [
(k.clone(), v.clone()) for (k, v) in next_batch_0.past_key_values
]
next_batch_1_past_key_values = [
(k.clone(), v.clone()) for (k, v) in next_batch_1.past_key_values
]
next_batch = BloomCausalLMBatch.concatenate([next_batch_0, next_batch_1])
assert torch.equal(next_batch.all_input_ids[0], next_batch_0.all_input_ids[0])
assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0])
assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1])
assert torch.all(
next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1
)
assert torch.all(
next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1
)
assert torch.all(next_batch.attention_mask[1:, 3:] == 0)
assert next_batch.batch_id == 0
assert torch.all(next_batch.input_ids == 10264)
assert next_batch.input_lengths == [3, 2, 2]
assert next_batch.max_input_length == 3
assert next_batch.requests[0] == next_batch_0.requests[0]
assert next_batch.requests[1:] == list(next_batch_1.requests)
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0]
assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias
assert next_batch.past_key_values is not None
assert all([p[0].shape == (3, 16, 64, 2) for p in next_batch.past_key_values])
assert all([p[1].shape == (3, 16, 2, 64) for p in next_batch.past_key_values])
for i, past in enumerate(next_batch.past_key_values):
assert torch.equal(next_batch_0_past_key_values[i][0][:, :, -2:], past[0][0])
assert torch.equal(
next_batch_1_past_key_values[i][0][:, :, -1:],
past[0][1:, :, :, -1].reshape(-1, 64, 1),
)
assert torch.equal(next_batch_0_past_key_values[i][1][:, -2:, :], past[1][0])
assert torch.equal(
next_batch_1_past_key_values[i][1][:, -1:, :],
past[1][1:, :, -1, :].reshape(-1, 1, 64),
)
for _ in range(
default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 2
):
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 3
assert generations[2].generated_text.text == "TestTestTestTestTest"
assert (
generations[2].request_id == default_multi_requests_bloom_batch.requests[1].id
)
assert (
generations[2].generated_text.generated_tokens
== default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
)
next_batch = next_batch.filter(
[next_batch.requests[0].id, next_batch.requests[1].id]
)
for _ in range(
default_bloom_batch.stopping_criterias[0].max_new_tokens
- default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
- 2
):
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 2
assert (
generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
)
assert generations[0].request_id == default_bloom_batch.requests[0].id
assert (
generations[0].generated_text.generated_tokens
== default_bloom_batch.stopping_criterias[0].max_new_tokens
)
next_batch = next_batch.filter([next_batch.requests[1].id])
for _ in range(
default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens
- default_bloom_batch.stopping_criterias[0].max_new_tokens
- default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
- 4
):
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_bloom.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert (
generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
)
assert (
generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id
)
assert (
generations[0].generated_text.generated_tokens
== default_multi_requests_bloom_batch.stopping_criterias[0].max_new_tokens
)

View File

@ -1,354 +0,0 @@
import pytest
import torch
from copy import copy
from transformers import AutoTokenizer
from text_generation_server.pb import generate_pb2
from text_generation_server.models.causal_lm import CausalLM, CausalLMBatch
@pytest.fixture(scope="session")
def default_causal_lm():
return CausalLM.fallback("gpt2")
@pytest.fixture(scope="session")
def gpt2_tokenizer():
tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
tokenizer.pad_token_id = 50256
return tokenizer
@pytest.fixture
def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
return generate_pb2.Request(
id=0,
inputs="Test",
input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
prefill_logprobs=True,
truncate=100,
parameters=default_pb_parameters,
stopping_parameters=default_pb_stop_parameters,
)
@pytest.fixture
def default_pb_batch(default_pb_request):
return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)
@pytest.fixture
def default_causal_lm_batch(default_pb_batch, gpt2_tokenizer):
return CausalLMBatch.from_pb(
default_pb_batch, gpt2_tokenizer, torch.float32, torch.device("cpu")
)
@pytest.fixture
def default_multi_requests_causal_lm_batch(default_pb_request, gpt2_tokenizer):
req_0 = copy(default_pb_request)
req_0.id = 1
req_1 = default_pb_request
req_1.id = 2
req_1.stopping_parameters.max_new_tokens = 5
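    # req_1 stops after 5 new tokens while req_0 keeps the default budget, so the
    # multi-request tests can finish one request early and filter the batch.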
batch_pb = generate_pb2.Batch(id=1, requests=[req_0, req_1], size=2)
return CausalLMBatch.from_pb(
batch_pb, gpt2_tokenizer, torch.float32, torch.device("cpu")
)
def test_batch_from_pb(default_pb_batch, default_causal_lm_batch):
batch = default_causal_lm_batch
assert batch.batch_id == default_pb_batch.id
assert batch.requests == default_pb_batch.requests
assert len(batch.input_ids) == default_pb_batch.size
assert batch.input_ids[0][-1] == 14402
assert torch.all(batch.input_ids[0][:-1] == 50256)
assert batch.attention_mask[0, 0] == 1
assert torch.all(batch.attention_mask[0, 1:] == 0)
assert batch.past_key_values is None
assert all(
[
torch.equal(input_ids, all_input_ids[:, 0])
for input_ids, all_input_ids in zip(batch.input_ids, batch.all_input_ids)
]
)
assert batch.input_lengths == [1]
assert len(batch) == default_pb_batch.size
assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch)
assert batch.max_input_length == batch.input_lengths[0]
def test_batch_concatenate_no_prefill(default_causal_lm_batch):
with pytest.raises(ValueError):
CausalLMBatch.concatenate([default_causal_lm_batch, default_causal_lm_batch])
def test_causal_lm_batch_type(default_causal_lm):
assert default_causal_lm.batch_type == CausalLMBatch
def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_batch):
sequence_length = len(default_causal_lm_batch.all_input_ids[0])
generations, next_batch, _ = default_causal_lm.generate_token(
default_causal_lm_batch
)
assert len(generations) == len(next_batch)
assert isinstance(next_batch, CausalLMBatch)
assert len(next_batch.all_input_ids) == len(next_batch)
assert len(next_batch.all_input_ids[0]) == sequence_length + 1
assert len(next_batch.attention_mask[0]) == 11
assert next_batch.all_input_ids[0][-1] == 13
assert next_batch.all_input_ids[0][-2] == 14402
assert torch.all(next_batch.all_input_ids[0][:-2] == 50256)
assert torch.all(next_batch.attention_mask[0][0:2] == 1)
assert torch.all(next_batch.attention_mask[0][2:] == 0)
assert next_batch.input_ids.shape == (len(next_batch), 1)
assert next_batch.input_ids[0, 0] == 13
assert next_batch.input_lengths == [2]
assert next_batch.max_input_length == next_batch.input_lengths[0]
assert next_batch.past_key_values is not None
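    # GPT-2 small has 12 attention heads of size 64; keys and values share the
    # (batch, heads, seq_len, head_dim) layout.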
assert all(
[p[0].shape == (1, 12, sequence_length, 64) for p in next_batch.past_key_values]
)
assert all(
[p[1].shape == (1, 12, sequence_length, 64) for p in next_batch.past_key_values]
)
assert all([generation.generated_text is None for generation in generations])
assert all([len(generation.prefill_tokens) == 1 for generation in generations])
assert all(
[
token_id.item() == 13
for generation in generations
for token_id in generation.tokens.token_ids
]
)
assert all(
[
token_text == "."
for generation in generations
for token_text in generation.tokens.texts
]
)
assert generations[0].request_id == 0
def test_causal_lm_generate_token_completion(
default_causal_lm, default_causal_lm_batch
):
next_batch = default_causal_lm_batch
for _ in range(default_causal_lm_batch.stopping_criterias[0].max_new_tokens - 1):
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert generations[0].generated_text.text == ".java:784) at net.minecraft."
assert generations[0].request_id == default_causal_lm_batch.requests[0].id
assert (
generations[0].generated_text.generated_tokens
== default_causal_lm_batch.stopping_criterias[0].max_new_tokens
)
def test_causal_lm_generate_token_completion_multi(
default_causal_lm, default_multi_requests_causal_lm_batch
):
next_batch = default_multi_requests_causal_lm_batch
for i in range(
default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 1
):
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 2
assert generations[1].generated_text.text == ".java:784)"
assert (
generations[1].request_id
== default_multi_requests_causal_lm_batch.requests[1].id
)
assert (
generations[1].generated_text.generated_tokens
== default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
)
# Copy stopping_criterias before filtering
stopping_criterias = (
default_multi_requests_causal_lm_batch.stopping_criterias.copy()
)
next_batch = next_batch.filter([next_batch.requests[0].id])
for _ in range(
stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1
):
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert generations[0].generated_text.text == ".java:784) at net.minecraft."
assert (
generations[0].request_id
== default_multi_requests_causal_lm_batch.requests[0].id
)
assert (
generations[0].generated_text.generated_tokens
== default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens
)
def test_batch_concatenate(
default_causal_lm, default_causal_lm_batch, default_multi_requests_causal_lm_batch
):
next_batch_0 = default_causal_lm_batch
_, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0)
_, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0)
next_batch_1 = default_multi_requests_causal_lm_batch
_, next_batch_1, _ = default_causal_lm.generate_token(next_batch_1)
# Clone past_key_values before concatenating to compare after,
# because they are removed from the concatenated batches
next_batch_0_past_key_values = [
(k.clone(), v.clone()) for (k, v) in next_batch_0.past_key_values
]
next_batch_1_past_key_values = [
(k.clone(), v.clone()) for (k, v) in next_batch_1.past_key_values
]
next_batch = CausalLMBatch.concatenate([next_batch_0, next_batch_1])
assert torch.equal(next_batch.all_input_ids[0], next_batch_0.all_input_ids[0])
assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0])
assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1])
assert torch.all(
next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1
)
assert torch.all(
next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1
)
assert torch.all(next_batch.attention_mask[1:, 3:] == 0)
assert next_batch.batch_id == 0
assert next_batch.input_ids[0, 0] == 12355
assert torch.all(next_batch.input_ids[1:] == 13)
assert next_batch.input_lengths == [3, 2, 2]
assert next_batch.max_input_length == 3
assert next_batch.requests[0] == next_batch_0.requests[0]
assert next_batch.requests[1:] == list(next_batch_1.requests)
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0]
assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias
assert next_batch.past_key_values is not None
assert all([p[0].shape == (3, 12, 2, 64) for p in next_batch.past_key_values])
assert all([p[1].shape == (3, 12, 2, 64) for p in next_batch.past_key_values])
for i, past in enumerate(next_batch.past_key_values):
assert torch.equal(next_batch_0_past_key_values[i][0][0, :, -2:], past[0][0])
assert torch.equal(
next_batch_1_past_key_values[i][0][:, :, -1:], past[0][1:, :, -1:, :]
)
assert torch.equal(next_batch_0_past_key_values[i][1][0, :, -2:], past[1][0])
assert torch.equal(
next_batch_1_past_key_values[i][1][:, :, -1:], past[1][1:, :, -1:, :]
)
for _ in range(
default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 2
):
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 3
assert generations[2].generated_text.text == ".java:784)"
assert (
generations[2].request_id
== default_multi_requests_causal_lm_batch.requests[1].id
)
assert (
generations[2].generated_text.generated_tokens
== default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
)
next_batch = next_batch.filter(
[next_batch.requests[0].id, next_batch.requests[1].id]
)
for _ in range(
default_causal_lm_batch.stopping_criterias[0].max_new_tokens
- default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
- 2
):
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 2
assert generations[0].generated_text.text == ".java:784) at net.minecraft."
assert generations[0].request_id == default_causal_lm_batch.requests[0].id
assert (
generations[0].generated_text.generated_tokens
== default_causal_lm_batch.stopping_criterias[0].max_new_tokens
)
next_batch = next_batch.filter([next_batch.requests[1].id])
for _ in range(
default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens
- default_causal_lm_batch.stopping_criterias[0].max_new_tokens
- default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
- 4
):
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert generations[0].generated_text.text == ".java:784) at net.minecraft."
assert (
generations[0].request_id
== default_multi_requests_causal_lm_batch.requests[0].id
)
assert (
generations[0].generated_text.generated_tokens
== default_multi_requests_causal_lm_batch.stopping_criterias[0].max_new_tokens
)

View File

@ -1,83 +0,0 @@
import pytest
import torch
from transformers import AutoTokenizer
from text_generation_server.models import Model
def get_test_model():
class TestModel(Model):
def batch_type(self):
raise NotImplementedError
def generate_token(self, batch):
raise NotImplementedError
tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b")
model = TestModel(
"test_model_id",
torch.nn.Linear(1, 1),
tokenizer,
False,
torch.float32,
torch.device("cpu"),
)
return model
@pytest.mark.private
def test_decode_streaming_english_spaces():
model = get_test_model()
truth = "Hello here, this is a simple test"
all_input_ids = [15043, 1244, 29892, 445, 338, 263, 2560, 1243]
assert (
all_input_ids == model.tokenizer(truth, add_special_tokens=False)["input_ids"]
)
decoded_text = ""
offset = 0
token_offset = 0
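    # decode_token incrementally decodes the sequence one token at a time and returns
    # only the newly stabilized text; the offsets avoid re-emitting bytes of partially
    # decoded multi-byte characters.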
for i in range(len(all_input_ids)):
text, offset, token_offset = model.decode_token(
all_input_ids[: i + 1], offset, token_offset
)
decoded_text += text
assert decoded_text == truth
@pytest.mark.private
def test_decode_streaming_chinese_utf8():
model = get_test_model()
truth = "我很感谢你的热情"
all_input_ids = [
30672,
232,
193,
139,
233,
135,
162,
235,
179,
165,
30919,
30210,
234,
134,
176,
30993,
]
decoded_text = ""
offset = 0
token_offset = 0
for i in range(len(all_input_ids)):
text, offset, token_offset = model.decode_token(
all_input_ids[: i + 1], offset, token_offset
)
decoded_text += text
assert decoded_text == truth

View File

@ -1,108 +0,0 @@
import pytest
from text_generation_server.pb import generate_pb2
from text_generation_server.models.causal_lm import CausalLMBatch, CausalLM
@pytest.fixture(scope="session")
def default_santacoder():
return CausalLM.fallback(model_id="bigcode/santacoder")
@pytest.fixture
def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
return generate_pb2.Request(
id=0,
inputs="def",
input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="def")]),
prefill_logprobs=True,
truncate=100,
parameters=default_pb_parameters,
stopping_parameters=default_pb_stop_parameters,
)
@pytest.fixture
def default_pb_batch(default_pb_request):
return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)
@pytest.fixture
def default_fim_pb_request(default_pb_parameters, default_pb_stop_parameters):
return generate_pb2.Request(
id=0,
inputs="<fim-prefix>def<fim-suffix>world<fim-middle>",
input_chunks=generate_pb2.Input(
chunks=[
generate_pb2.InputChunk(
text="<fim-prefix>def<fim-suffix>world<fim-middle>"
)
]
),
prefill_logprobs=True,
truncate=100,
parameters=default_pb_parameters,
stopping_parameters=default_pb_stop_parameters,
)
@pytest.fixture
def default_fim_pb_batch(default_fim_pb_request):
return generate_pb2.Batch(id=0, requests=[default_fim_pb_request], size=1)
@pytest.mark.skip
def test_santacoder_generate_token_completion(default_santacoder, default_pb_batch):
batch = CausalLMBatch.from_pb(
default_pb_batch,
default_santacoder.tokenizer,
default_santacoder.dtype,
default_santacoder.device,
)
next_batch = batch
for _ in range(batch.stopping_criterias[0].max_new_tokens - 1):
generations, next_batch, _ = default_santacoder.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_santacoder.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert generations[0].generated_text.text == " test_get_all_users_with_"
assert generations[0].request_id == batch.requests[0].id
assert (
generations[0].generated_text.generated_tokens
== batch.stopping_criterias[0].max_new_tokens
)
@pytest.mark.skip
def test_fim_santacoder_generate_token_completion(
default_santacoder, default_fim_pb_batch
):
batch = CausalLMBatch.from_pb(
default_fim_pb_batch,
default_santacoder.tokenizer,
default_santacoder.dtype,
default_santacoder.device,
)
next_batch = batch
for _ in range(batch.stopping_criterias[0].max_new_tokens - 1):
generations, next_batch, _ = default_santacoder.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_santacoder.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert (
generations[0].generated_text.text
== """ineProperty(exports, "__esModule", { value"""
)
assert generations[0].request_id == batch.requests[0].id
assert (
generations[0].generated_text.generated_tokens
== batch.stopping_criterias[0].max_new_tokens
)

View File

@ -1,365 +0,0 @@
import pytest
import torch
from copy import copy
from transformers import AutoTokenizer
from text_generation_server.pb import generate_pb2
from text_generation_server.models.seq2seq_lm import Seq2SeqLM, Seq2SeqLMBatch
@pytest.fixture(scope="session")
def mt0_small_tokenizer():
tokenizer = AutoTokenizer.from_pretrained(
"bigscience/mt0-small", padding_side="left"
)
tokenizer.bos_token_id = 0
return tokenizer
@pytest.fixture(scope="session")
def default_seq2seq_lm():
return Seq2SeqLM.fallback("bigscience/mt0-small")
@pytest.fixture
def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
return generate_pb2.Request(
id=0,
inputs="Test",
input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
prefill_logprobs=True,
truncate=100,
parameters=default_pb_parameters,
stopping_parameters=default_pb_stop_parameters,
)
@pytest.fixture
def default_pb_batch(default_pb_request):
return generate_pb2.Batch(id=0, requests=[default_pb_request], size=1)
@pytest.fixture
def default_seq2seq_lm_batch(default_pb_batch, mt0_small_tokenizer):
return Seq2SeqLMBatch.from_pb(
default_pb_batch, mt0_small_tokenizer, torch.float32, torch.device("cpu")
)
@pytest.fixture
def default_multi_requests_seq2seq_lm_batch(default_pb_request, mt0_small_tokenizer):
req_0 = copy(default_pb_request)
req_0.id = 1
req_1 = default_pb_request
req_1.id = 2
req_1.stopping_parameters.max_new_tokens = 5
batch_pb = generate_pb2.Batch(id=0, requests=[req_0, req_1], size=2)
return Seq2SeqLMBatch.from_pb(
batch_pb, mt0_small_tokenizer, torch.float32, torch.device("cpu")
)
def test_batch_from_pb(default_pb_batch, default_seq2seq_lm_batch):
batch = default_seq2seq_lm_batch
sequence_length = len(default_seq2seq_lm_batch.input_ids[0])
assert batch.batch_id == default_pb_batch.id
assert batch.requests == default_pb_batch.requests
assert batch.input_ids.shape == (default_pb_batch.size, sequence_length)
assert batch.input_ids[0][-2] == 4268
assert batch.input_ids[0][-1] == 1
assert torch.all(batch.input_ids[0][:-2] == 0)
assert torch.all(batch.attention_mask[0][-2:] == 1)
assert torch.all(batch.attention_mask[0][:-2] == 0)
assert len(batch.decoder_input_ids) == default_pb_batch.size
assert batch.decoder_attention_mask is None
assert batch.encoder_last_hidden_state is None
assert batch.past_key_values is None
assert batch.input_lengths == [2]
assert batch.decoder_input_lengths == [1]
assert len(batch) == default_pb_batch.size
assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == len(batch)
assert batch.max_input_length == batch.input_lengths[0]
assert batch.max_decoder_input_length == batch.decoder_input_lengths[0]
def test_batch_concatenate_no_prefill(default_seq2seq_lm_batch):
with pytest.raises(ValueError):
Seq2SeqLMBatch.concatenate([default_seq2seq_lm_batch, default_seq2seq_lm_batch])
def test_seq2seq_lm_batch_type(default_seq2seq_lm):
assert default_seq2seq_lm.batch_type == Seq2SeqLMBatch
def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch):
sequence_length = len(default_seq2seq_lm_batch.input_ids[0])
generations, next_batch, _ = default_seq2seq_lm.generate_token(
default_seq2seq_lm_batch
)
assert len(generations) == len(next_batch)
assert isinstance(next_batch, Seq2SeqLMBatch)
assert next_batch.input_ids is None
assert torch.equal(
next_batch.attention_mask, default_seq2seq_lm_batch.attention_mask
)
assert next_batch.input_lengths == default_seq2seq_lm_batch.input_lengths
assert next_batch.max_input_length == default_seq2seq_lm_batch.max_input_length
assert (
next_batch.next_token_choosers == default_seq2seq_lm_batch.next_token_choosers
)
assert next_batch.stopping_criterias == default_seq2seq_lm_batch.stopping_criterias
assert len(next_batch.decoder_input_ids) == len(next_batch)
assert next_batch.all_decoder_input_ids[0][0] == 0
assert next_batch.all_decoder_input_ids[0][1] == 259
assert next_batch.decoder_attention_mask is None
assert next_batch.encoder_last_hidden_state.shape == (1, sequence_length, 512)
assert next_batch.decoder_input_lengths == [2]
assert next_batch.max_decoder_input_length == 2
assert next_batch.past_key_values is not None
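    # Each layer caches four tensors: decoder self-attention keys/values (p[0], p[1])
    # over the generated tokens, and encoder-decoder cross-attention keys/values
    # (p[2], p[3]) over the full encoder sequence.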
assert all(
[p[0].shape == (len(next_batch), 6, 1, 64) for p in next_batch.past_key_values]
)
assert all(
[p[1].shape == (len(next_batch), 6, 1, 64) for p in next_batch.past_key_values]
)
assert all(
[
p[2].shape == (len(next_batch), 6, sequence_length, 64)
for p in next_batch.past_key_values
]
)
assert all(
[
p[3].shape == (len(next_batch), 6, sequence_length, 64)
for p in next_batch.past_key_values
]
)
assert all([generation.generated_text is None for generation in generations])
assert all([len(generation.prefill_tokens) == 1 for generation in generations])
assert all(
[
token_id.item() == 259
for generation in generations
for token_id in generation.tokens.token_ids
]
)
assert all(
[
token_text == " "
for generation in generations
for token_text in generation.tokens.texts
]
)
assert generations[0].request_id == 0
def test_seq2seq_lm_generate_token_completion(
default_seq2seq_lm, default_seq2seq_lm_batch
):
next_batch = default_seq2seq_lm_batch
for _ in range(6):
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert generations[0].generated_text.text == "a few weeks"
assert generations[0].request_id == default_seq2seq_lm_batch.requests[0].id
assert generations[0].generated_text.generated_tokens == 7
def test_seq2seq_lm_generate_token_completion_multi(
default_seq2seq_lm, default_multi_requests_seq2seq_lm_batch
):
next_batch = default_multi_requests_seq2seq_lm_batch
for i in range(4):
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 2
assert generations[1].generated_text.text == "a few "
assert (
generations[1].request_id
== default_multi_requests_seq2seq_lm_batch.requests[1].id
)
assert generations[1].generated_text.generated_tokens == 5
next_batch = next_batch.filter([next_batch.requests[0].id])
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert generations[0].generated_text.text == "a few weeks"
assert (
generations[0].request_id
== default_multi_requests_seq2seq_lm_batch.requests[0].id
)
assert generations[0].generated_text.generated_tokens == 7
def test_batch_concatenate(
default_seq2seq_lm,
default_seq2seq_lm_batch,
default_multi_requests_seq2seq_lm_batch,
):
next_batch_0 = default_seq2seq_lm_batch
_, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0)
_, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0)
next_batch_1 = default_multi_requests_seq2seq_lm_batch
_, next_batch_1, _ = default_seq2seq_lm.generate_token(next_batch_1)
    # Copy hidden state because it is removed from the concatenated batches
next_batch_0_encoder_last_hidden_state = next_batch_0.encoder_last_hidden_state
next_batch_1_encoder_last_hidden_state = next_batch_1.encoder_last_hidden_state
# Clone past_key_values before concatenating to compare after,
# because they are removed from the concatenated batches
next_batch_0_past_key_values = [
[t.clone() for t in layer] for layer in next_batch_0.past_key_values
]
next_batch_1_past_key_values = [
[t.clone() for t in layer] for layer in next_batch_1.past_key_values
]
next_batch = Seq2SeqLMBatch.concatenate([next_batch_0, next_batch_1])
assert next_batch.batch_id == 0
assert torch.equal(
next_batch.decoder_input_ids[0], next_batch_0.decoder_input_ids[0]
)
assert next_batch.all_decoder_input_ids[1][0] == 0
assert next_batch.all_decoder_input_ids[2][0] == 0
assert torch.equal(
next_batch.decoder_input_ids[1:, -2:], next_batch_1.decoder_input_ids
)
assert torch.all(next_batch.decoder_attention_mask[0, :3] == 1)
assert torch.all(next_batch.decoder_attention_mask[0, 3:] == 0)
assert torch.all(next_batch.decoder_attention_mask[1:, 0] == 0)
assert torch.all(next_batch.decoder_attention_mask[1:, 1:3] == 1)
assert torch.equal(
next_batch.encoder_last_hidden_state[0],
next_batch_0_encoder_last_hidden_state[0, -2:],
)
assert torch.equal(
next_batch.encoder_last_hidden_state[1:],
next_batch_1_encoder_last_hidden_state[:, -2:],
)
assert next_batch.input_lengths == [2, 2, 2]
assert next_batch.decoder_input_lengths == [3, 2, 2]
assert next_batch.max_input_length == 2
assert next_batch.max_decoder_input_length == 3
assert next_batch.requests[0] == next_batch_0.requests[0]
assert next_batch.requests[1:] == list(next_batch_1.requests)
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
assert next_batch.stopping_criterias[0] == next_batch_0.stopping_criterias[0]
assert next_batch.stopping_criterias[1:] == next_batch_1.stopping_criterias
assert next_batch.past_key_values is not None
assert all(
[p[0].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values]
)
assert all(
[p[1].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values]
)
assert all(
[p[2].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values]
)
assert all(
[p[3].shape == (len(next_batch), 6, 2, 64) for p in next_batch.past_key_values]
)
for i, past in enumerate(next_batch.past_key_values):
assert torch.equal(next_batch_0_past_key_values[i][0][0, :, -2:, :], past[0][0])
assert torch.equal(
next_batch_1_past_key_values[i][0][:, :, -1:, :], past[0][1:, :, -1:, :]
)
assert torch.equal(next_batch_0_past_key_values[i][1][0, :, -2:, :], past[1][0])
assert torch.equal(
next_batch_1_past_key_values[i][1][:, :, -1:, :], past[1][1:, :, -1:, :]
)
assert torch.equal(next_batch_0_past_key_values[i][2][0, :, -2:, :], past[2][0])
assert torch.equal(
next_batch_1_past_key_values[i][2][:, :, -2:, :], past[2][1:]
)
assert torch.equal(next_batch_0_past_key_values[i][3][0, :, -2:, :], past[3][0])
assert torch.equal(
next_batch_1_past_key_values[i][3][:, :, -2:, :], past[3][1:]
)
for _ in range(3):
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert len(generations) == len(next_batch)
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 3
assert generations[2].generated_text.text == "a few "
assert (
generations[2].request_id
== default_multi_requests_seq2seq_lm_batch.requests[1].id
)
assert generations[2].generated_text.generated_tokens == 5
next_batch = next_batch.filter(
[next_batch.requests[0].id, next_batch.requests[1].id]
)
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert next_batch is not None
assert len(generations) == 2
assert generations[0].generated_text.text == "a few weeks"
assert generations[0].request_id == default_seq2seq_lm_batch.requests[0].id
assert generations[0].generated_text.generated_tokens == 7
next_batch = next_batch.filter([next_batch.requests[1].id])
generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
assert next_batch is None
assert len(generations) == 1
assert generations[0].generated_text.text == "a few weeks"
assert (
generations[0].request_id
== default_multi_requests_seq2seq_lm_batch.requests[0].id
)
assert generations[0].generated_text.generated_tokens == 7

View File

@ -1,247 +0,0 @@
import pytest
from unittest.mock import Mock
from text_generation_server.utils.adapter import (
get_attn_weights,
get_mlp_weights,
parse_lora_adapters,
AdapterInfo,
)
def test_parse_lora_adapters_empty():
assert parse_lora_adapters(None) == []
assert parse_lora_adapters("") == []
def test_parse_lora_adapters_single():
result = parse_lora_adapters("adapter1")
assert result == [AdapterInfo(id="adapter1", path=None, revision=None)]
def test_parse_lora_adapters_with_path():
result = parse_lora_adapters("adapter1=path/to/adapter1")
assert result == [
AdapterInfo(id="adapter1", path="path/to/adapter1", revision=None)
]
def test_parse_lora_adapters_with_path_and_revision():
result = parse_lora_adapters("adapter1=path/to/adapter1@main")
assert result == [
AdapterInfo(id="adapter1", path="path/to/adapter1", revision="main")
]
def test_parse_lora_adapters_multiple():
result = parse_lora_adapters(
"adapter1,adapter2=path/to/adapter2,adapter3=path/to/adapter3@dev"
)
assert result == [
AdapterInfo(id="adapter1", path=None, revision=None),
AdapterInfo(id="adapter2", path="path/to/adapter2", revision=None),
AdapterInfo(id="adapter3", path="path/to/adapter3", revision="dev"),
]
def test_parse_lora_adapters_invalid_format():
    with pytest.raises(ValueError, match="Invalid LoRA adapter format: invalid=format=test"):
        parse_lora_adapters("adapter1,invalid=format=test,adapter3")
def test_get_attn_weights():
# create a mock layer
mock_layer = Mock()
mock_layer.self_attn.query_key_value = Mock()
mock_layer.self_attn.o_proj = Mock()
# call the function
result = get_attn_weights(2, mock_layer)
# assert the result
expected = {
(2, "q_proj"): (
"model.layers.2.self_attn.q_proj",
mock_layer.self_attn.query_key_value,
),
(2, "k_proj"): (
"model.layers.2.self_attn.k_proj",
mock_layer.self_attn.query_key_value,
),
(2, "qkv_proj"): (
"model.layers.2.self_attn.qkv_proj",
mock_layer.self_attn.query_key_value,
),
(2, "v_proj"): (
"model.layers.2.self_attn.v_proj",
mock_layer.self_attn.query_key_value,
),
(2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj),
}
assert result == expected
def test_get_mlp_weights_with_gate_up_proj():
# create a mock layer with gate_up_proj
mock_layer = Mock()
mock_layer.mlp.gate_up_proj = Mock()
mock_layer.mlp.down_proj = Mock()
# call the function
result = get_mlp_weights(3, mock_layer)
# assert the result
expected = {
(3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj),
(3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj),
(3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj),
}
assert result == expected
def test_get_mlp_weights_without_gate_up_proj():
# create a mock layer without gate_up_proj
mock_layer = Mock()
mock_layer.mlp = Mock(spec=[])
# call the function
result = get_mlp_weights(1, mock_layer)
# assert the result
assert result == {}
@pytest.mark.parametrize("layer_index", [0, 1, 5])
def test_get_attn_weights_different_layers(layer_index):
mock_layer = Mock()
mock_layer.self_attn.query_key_value = Mock()
mock_layer.self_attn.o_proj = Mock()
result = get_attn_weights(layer_index, mock_layer)
for k in ["q", "k", "v"]:
assert (layer_index, f"{k}_proj") in result
assert (
result[(layer_index, f"{k}_proj")][0]
== f"model.layers.{layer_index}.self_attn.{k}_proj"
)
assert (layer_index, "o_proj") in result
assert (
result[(layer_index, "o_proj")][0]
== f"model.layers.{layer_index}.self_attn.o_proj"
)
@pytest.mark.parametrize("layer_index", [0, 1, 5])
def test_get_mlp_weights_different_layers(layer_index):
mock_layer = Mock()
mock_layer.mlp.gate_up_proj = Mock()
mock_layer.mlp.down_proj = Mock()
result = get_mlp_weights(layer_index, mock_layer)
for k in ["gate", "up", "down"]:
assert (layer_index, f"{k}_proj") in result
assert (
result[(layer_index, f"{k}_proj")][0]
== f"model.layers.{layer_index}.mlp.{k}_proj"
)
def test_get_attn_weights_llama_compatibility():
mock_layer = Mock()
mock_layer.self_attn.query_key_value = Mock()
mock_layer.self_attn.o_proj = Mock()
result = get_attn_weights(2, mock_layer)
expected = {
(2, "q_proj"): (
"model.layers.2.self_attn.q_proj",
mock_layer.self_attn.query_key_value,
),
(2, "k_proj"): (
"model.layers.2.self_attn.k_proj",
mock_layer.self_attn.query_key_value,
),
(2, "qkv_proj"): (
"model.layers.2.self_attn.qkv_proj",
mock_layer.self_attn.query_key_value,
),
(2, "v_proj"): (
"model.layers.2.self_attn.v_proj",
mock_layer.self_attn.query_key_value,
),
(2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj),
}
assert result == expected
def test_get_mlp_weights_llama_compatibility():
mock_layer = Mock()
mock_layer.mlp.gate_up_proj = Mock()
mock_layer.mlp.down_proj = Mock()
result = get_mlp_weights(3, mock_layer)
expected = {
(3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_up_proj),
(3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.gate_up_proj),
(3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj),
}
assert result == expected
def test_get_attn_weights_gemma_compatibility():
mock_layer = Mock()
mock_layer.self_attn.query_key_value = Mock()
mock_layer.self_attn.o_proj = Mock()
result = get_attn_weights(2, mock_layer)
expected = {
(2, "q_proj"): (
"model.layers.2.self_attn.q_proj",
mock_layer.self_attn.query_key_value,
),
(2, "k_proj"): (
"model.layers.2.self_attn.k_proj",
mock_layer.self_attn.query_key_value,
),
(2, "qkv_proj"): (
"model.layers.2.self_attn.qkv_proj",
mock_layer.self_attn.query_key_value,
),
(2, "v_proj"): (
"model.layers.2.self_attn.v_proj",
mock_layer.self_attn.query_key_value,
),
(2, "o_proj"): ("model.layers.2.self_attn.o_proj", mock_layer.self_attn.o_proj),
}
assert result == expected
def test_get_mlp_weights_gemma_compatibility():
mock_layer = Mock()
mock_layer.mlp.gate_proj = Mock()
mock_layer.mlp.up_proj = Mock()
mock_layer.mlp.down_proj = Mock()
# ensure that the mock_layer.mlp.gate_up_proj attribute does not exist.
# This is necessary because the use of `Mock` automatically creates any
# attributes that are accessed, even if they don't exist in the actual
# implementation. If `gate_up_proj` were created, `get_mlp_weights` might
# follow the wrong execution path and return an incorrect result.
del mock_layer.mlp.gate_up_proj
result = get_mlp_weights(3, mock_layer)
expected = {
(3, "gate_proj"): ("model.layers.3.mlp.gate_proj", mock_layer.mlp.gate_proj),
(3, "up_proj"): ("model.layers.3.mlp.up_proj", mock_layer.mlp.up_proj),
(3, "down_proj"): ("model.layers.3.mlp.down_proj", mock_layer.mlp.down_proj),
}
assert result == expected

View File

@ -1,21 +0,0 @@
from text_generation_server.utils.hub import (
download_weights,
weight_hub_files,
weight_files,
)
from text_generation_server.utils.convert import convert_files
def test_convert_files():
model_id = "bigscience/bloom-560m"
pt_filenames = weight_hub_files(model_id, extension=".bin")
local_pt_files = download_weights(pt_filenames, model_id)
local_st_files = [
        p.parent / f"{p.stem.removeprefix('pytorch_')}.safetensors" for p in local_pt_files
]
convert_files(local_pt_files, local_st_files, discard_names=[])
found_st_files = weight_files(model_id)
assert all([p in found_st_files for p in local_st_files])

View File

@ -1,103 +0,0 @@
import os
import tempfile
import pytest
import huggingface_hub.constants
import text_generation_server.utils.hub
from text_generation_server.utils.hub import (
weight_hub_files,
download_weights,
weight_files,
EntryNotFoundError,
LocalEntryNotFoundError,
RevisionNotFoundError,
)
@pytest.fixture()
def offline():
current_value = text_generation_server.utils.hub.HF_HUB_OFFLINE
text_generation_server.utils.hub.HF_HUB_OFFLINE = True
yield "offline"
text_generation_server.utils.hub.HF_HUB_OFFLINE = current_value
@pytest.fixture()
def fresh_cache():
with tempfile.TemporaryDirectory() as d:
current_value = huggingface_hub.constants.HUGGINGFACE_HUB_CACHE
huggingface_hub.constants.HUGGINGFACE_HUB_CACHE = d
text_generation_server.utils.hub.HUGGINGFACE_HUB_CACHE = d
os.environ["HUGGINGFACE_HUB_CACHE"] = d
yield
huggingface_hub.constants.HUGGINGFACE_HUB_CACHE = current_value
os.environ["HUGGINGFACE_HUB_CACHE"] = current_value
text_generation_server.utils.hub.HUGGINGFACE_HUB_CACHE = current_value
@pytest.fixture()
def prefetched():
model_id = "bert-base-uncased"
huggingface_hub.snapshot_download(
repo_id=model_id,
revision="main",
local_files_only=False,
repo_type="model",
allow_patterns=["*.safetensors"],
)
yield model_id
def test_weight_hub_files_offline_error(offline, fresh_cache):
# If the model is not prefetched then it will raise an error
with pytest.raises(EntryNotFoundError):
weight_hub_files("gpt2")
def test_weight_hub_files_offline_ok(prefetched, offline):
# If the model is prefetched then we should be able to get the weight files from local cache
filenames = weight_hub_files(prefetched)
root = None
assert len(filenames) == 1
for f in filenames:
curroot, filename = os.path.split(f)
if root is None:
root = curroot
else:
assert root == curroot
assert filename == "model.safetensors"
def test_weight_hub_files():
filenames = weight_hub_files("bigscience/bloom-560m")
assert filenames == ["model.safetensors"]
def test_weight_hub_files_llm():
filenames = weight_hub_files("bigscience/bloom")
assert filenames == [f"model_{i:05d}-of-00072.safetensors" for i in range(1, 73)]
def test_weight_hub_files_empty():
with pytest.raises(EntryNotFoundError):
weight_hub_files("bigscience/bloom", extension=".errors")
def test_download_weights():
model_id = "bigscience/bloom-560m"
filenames = weight_hub_files(model_id)
files = download_weights(filenames, model_id)
local_files = weight_files("bigscience/bloom-560m")
assert files == local_files
def test_weight_files_revision_error():
with pytest.raises(RevisionNotFoundError):
weight_files("bigscience/bloom-560m", revision="error")
def test_weight_files_not_cached_error(fresh_cache):
with pytest.raises(LocalEntryNotFoundError):
weight_files("bert-base-uncased")

View File

@ -1,82 +0,0 @@
import torch
from text_generation_server.layers import (
TensorParallelEmbedding,
)
class ProcessGroup:
def __init__(self, rank: int, world_size: int):
self._rank = rank
self.world_size = world_size
def size(self) -> int:
return self.world_size
def rank(self) -> int:
return self._rank
class Weights:
def __init__(self, rank: int, world_size: int, vocab_size: int, hidden_dim: int):
self.weight = (
torch.arange(vocab_size * hidden_dim).float().view(vocab_size, hidden_dim)
)
self.process_group = ProcessGroup(rank, world_size)
def get_partial_sharded(self, name: str, dim: int):
assert dim == 0
rank = self.process_group.rank()
world_size = self.process_group.size()
size = self.weight.shape[dim]
block_size = (size + world_size - 1) // world_size
start = rank * block_size
stop = (rank + 1) * block_size
return self.weight[start:stop]
def get_shape(self, name: str):
return self.weight.shape
def test_tensor_parallel_embedding():
vocab_size = 17
weights = Weights(
rank=0,
world_size=1,
vocab_size=vocab_size,
hidden_dim=256,
)
embeddings = TensorParallelEmbedding("", weights)
input_ids = torch.arange(vocab_size)
output = embeddings.forward(input_ids)
assert embeddings.min_id == 0
assert embeddings.max_id == 17
torch.testing.assert_close(output, torch.arange(256 * 17).float().view(17, 256))
weights_0_2 = Weights(rank=0, world_size=2, vocab_size=vocab_size, hidden_dim=256)
weights_1_2 = Weights(rank=1, world_size=2, vocab_size=vocab_size, hidden_dim=256)
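    # With vocab_size=17 and world_size=2, block_size = ceil(17 / 2) = 9: shard 0 owns
    # ids [0, 9) and shard 1 owns ids [9, 17). Ids outside a shard hit its trailing zero
    # row, so summing the partial outputs (reduce=False) reconstructs the full embedding.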
embeddings_0_2 = TensorParallelEmbedding("", weights_0_2, reduce=False)
assert embeddings_0_2.min_id == 0
assert embeddings_0_2.max_id == 9
torch.testing.assert_close(
embeddings_0_2.weight,
torch.cat([torch.arange(9 * 256), torch.zeros(256)], dim=0)
.view(10, 256)
.float(),
)
embeddings_1_2 = TensorParallelEmbedding("", weights_1_2, reduce=False)
assert embeddings_1_2.min_id == 9
assert embeddings_1_2.max_id == 17
torch.testing.assert_close(
embeddings_1_2.weight,
torch.cat([torch.arange(8 * 256) + 9 * 256, torch.zeros(256)], dim=0)
.view(9, 256)
.float(),
)
output_tp_0 = embeddings_0_2.forward(input_ids)
output_tp_1 = embeddings_1_2.forward(input_ids)
torch.testing.assert_close(output, output_tp_0 + output_tp_1)

View File

@ -1,88 +0,0 @@
import torch
from text_generation_server.utils.tokens import (
StopSequenceCriteria,
StoppingCriteria,
FinishReason,
batch_top_tokens,
)
def test_stop_sequence_criteria():
criteria = StopSequenceCriteria("/test;")
assert not criteria("/")
assert not criteria("/test")
assert criteria("/test;")
assert not criteria("/test; ")
def test_stop_sequence_criteria_escape():
criteria = StopSequenceCriteria("<|stop|>")
assert not criteria("<")
assert not criteria("<|stop")
assert criteria("<|stop|>")
assert not criteria("<|stop|> ")
def test_stopping_criteria():
criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5)
assert criteria(65827, "/test") == (False, None)
assert criteria(30, ";") == (True, FinishReason.FINISH_REASON_STOP_SEQUENCE)
def test_stopping_criteria_eos():
criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5)
assert criteria(1, "") == (False, None)
assert criteria(0, "") == (True, FinishReason.FINISH_REASON_EOS_TOKEN)
def test_stopping_criteria_max():
criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5)
assert criteria(1, "") == (False, None)
assert criteria(1, "") == (False, None)
assert criteria(1, "") == (False, None)
assert criteria(1, "") == (False, None)
assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH)
def test_batch_top_tokens():
top_n_tokens = [0, 2, 3, 4, 5]
top_n_tokens_tensor = torch.tensor(top_n_tokens)
inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5)
accepted_ids = torch.ones_like(top_n_tokens_tensor)
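    # accepted_ids records how many tokens each request accepted this step; with all
    # ones, every request contributes one row of logprobs and receives one top-tokens list.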
topn_tok_ids, topn_tok_logprobs = batch_top_tokens(
top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids
)
assert topn_tok_ids[0] == [[]]
assert topn_tok_ids[1] == [[0, 3]]
assert topn_tok_ids[2] == [[0, 3, 1, 4]]
assert topn_tok_ids[3] == [[0, 3, 1, 4]]
assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]]
assert topn_tok_logprobs[0] == [[]]
assert topn_tok_logprobs[1] == [[-1, -2]]
assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]]
assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]]
assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]]
# Now let's make second member of the batch be speculated
inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5 * 2)
accepted_ids[1] = 2
topn_tok_ids, topn_tok_logprobs = batch_top_tokens(
top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids
)
assert topn_tok_ids[0] == [[]]
assert topn_tok_ids[1] == [[0, 3], [0, 3]]
assert topn_tok_ids[2] == [[0, 3, 1, 4]]
assert topn_tok_ids[3] == [[0, 3, 1, 4]]
assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]]
assert topn_tok_logprobs[0] == [[]]
assert topn_tok_logprobs[1] == [[-1, -2], [-1, -2]]
assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]]
assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]]
assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]]

View File

@ -1,54 +0,0 @@
# test_watermark_logits_processor.py
import os
import numpy as np
import torch
from text_generation_server.utils.watermark import WatermarkLogitsProcessor
GAMMA = float(os.getenv("WATERMARK_GAMMA", 0.5))
DELTA = float(os.getenv("WATERMARK_DELTA", 2.0))
def test_seed_rng():
input_ids = [101, 2036, 3731, 102, 2003, 103]
processor = WatermarkLogitsProcessor()
processor._seed_rng(input_ids)
assert isinstance(processor.rng, torch.Generator)
def test_get_greenlist_ids():
input_ids = [101, 2036, 3731, 102, 2003, 103]
processor = WatermarkLogitsProcessor()
result = processor._get_greenlist_ids(input_ids, 10, torch.device("cpu"))
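    # With the default gamma of 0.5, half of the 10-token vocabulary lands on the greenlist.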
assert max(result) <= 10
assert len(result) == int(10 * 0.5)
def test_calc_greenlist_mask():
processor = WatermarkLogitsProcessor()
scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
greenlist_token_ids = torch.tensor([2, 3])
result = processor._calc_greenlist_mask(scores, greenlist_token_ids)
assert result.tolist() == [[False, False, False, False], [False, False, True, True]]
assert result.shape == scores.shape
def test_bias_greenlist_logits():
processor = WatermarkLogitsProcessor()
scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
green_tokens_mask = torch.tensor(
[[False, False, True, True], [False, False, False, True]]
)
greenlist_bias = 2.0
result = processor._bias_greenlist_logits(scores, green_tokens_mask, greenlist_bias)
assert np.allclose(result.tolist(), [[0.5, 0.3, 2.2, 2.8], [0.1, 0.2, 0.7, 2.9]])
assert result.shape == scores.shape
def test_call():
input_ids = [101, 2036, 3731, 102, 2003, 103]
processor = WatermarkLogitsProcessor()
scores = torch.tensor([[0.5, 0.3, 0.2, 0.8], [0.1, 0.2, 0.7, 0.9]])
result = processor(input_ids, scores)
assert result.shape == scores.shape

File diff suppressed because it is too large

View File

@ -57,8 +57,7 @@ MAX_TOTAL_TOKENS = int(os.environ.get("MAX_TOTAL_TOKENS", 2048))
PAD_SEQUENCE_TO_MULTIPLE_OF = int(os.environ.get("PAD_SEQUENCE_TO_MULTIPLE_OF", 256))
CHUNK_SIZES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
LAZY_MODE = int(os.environ.get("PT_HPU_LAZY_MODE", 1))
BATCH_BUCKET_SIZE = int(os.environ.get("BATCH_BUCKET_SIZE", 8))
PREFILL_BATCH_BUCKET_SIZE = int(os.environ.get("PREFILL_BATCH_BUCKET_SIZE", 2))
BATCH_SIZE_EXPONENT_BASE = int(os.environ.get("BATCH_SIZE_EXPONENT_BASE", 2))
MAX_BATCH_SIZE = (
int(os.environ.get("MAX_BATCH_SIZE"))
if os.environ.get("MAX_BATCH_SIZE") is not None
@ -74,10 +73,16 @@ def torch_compile_for_eager(func):
)
def round_up(number, k):
def round_up_seq(number, k):
return (number + k - 1) // k * k
def round_up_batch(number):
return BATCH_SIZE_EXPONENT_BASE ** (
math.ceil(math.log(number, BATCH_SIZE_EXPONENT_BASE))
)
def to_tensor_indices(indices, device):
return torch.tensor(indices, dtype=torch.long, device=device)
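To make the bucketing change above concrete, here is a standalone sketch (not part of the diff, assuming the default `BATCH_SIZE_EXPONENT_BASE=2` shown above) contrasting the multiple-of-k rounding kept for sequence lengths with the new power-of-base rounding used for batch sizes:

```python
import math

BATCH_SIZE_EXPONENT_BASE = 2  # default from the environment variable above


def round_up_seq(number, k):
    # Round up to the next multiple of k (previous behaviour, new name).
    return (number + k - 1) // k * k


def round_up_batch(number):
    # Round up to the next power of BATCH_SIZE_EXPONENT_BASE.
    return BATCH_SIZE_EXPONENT_BASE ** (
        math.ceil(math.log(number, BATCH_SIZE_EXPONENT_BASE))
    )


assert round_up_seq(129, 128) == 256
assert [round_up_batch(n) for n in (1, 3, 5, 9)] == [1, 4, 8, 16]
```

Exponential buckets keep the number of distinct batch shapes (and thus recompiled HPU graphs) logarithmic in the maximum batch size, which is why the warmup code below iterates over exponents rather than fixed-size steps.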
@ -399,7 +404,7 @@ class CausalLMBatch(Batch):
total_requests = sum(len(b) for b in batches)
new_bs = total_requests
new_bs = round_up(total_requests, BATCH_BUCKET_SIZE)
new_bs = round_up_batch(total_requests)
batch_id = batches[0].batch_id
device = batches[0].input_ids.device
@ -540,7 +545,7 @@ class CausalLMBatch(Batch):
        # TODO: by tokenizing all inputs at once we lose information on actual input lengths
# this means that we cannot shift inputs to the left after a long input sequence
# was filtered out
new_bs = round_up(len(requests), PREFILL_BATCH_BUCKET_SIZE)
new_bs = round_up_batch(len(requests))
missing_inputs = new_bs - len(inputs)
dummy_inputs = ["?"] * missing_inputs
parameters = [r.parameters for r in pb.requests]
@ -572,7 +577,7 @@ class CausalLMBatch(Batch):
assert (
PAD_SEQUENCE_TO_MULTIPLE_OF <= max_input_length
), "PAD_SEQUENCE_TO_MULTIPLE_OF cannot be higher than max_input_length"
rounded_seq_len = round_up(input_len + 1, PAD_SEQUENCE_TO_MULTIPLE_OF)
rounded_seq_len = round_up_seq(input_len + 1, PAD_SEQUENCE_TO_MULTIPLE_OF)
if rounded_seq_len <= max_input_length:
bucket_size = rounded_seq_len - 1
else:
@ -1068,10 +1073,10 @@ class CausalLM(Model):
if (
self.enable_hpu_graph
and self.limit_hpu_graph
and round_up(batch.batch_size, BATCH_BUCKET_SIZE) != self.prev_bs
and round_up_batch(batch.batch_size) != self.prev_bs
):
self.model.clear_cache()
self.prev_bs = round_up(batch.batch_size, BATCH_BUCKET_SIZE)
self.prev_bs = round_up_batch(batch.batch_size)
dbg_trace(
scenario,
f"bs:{batch.batch_size} num_reqs:{len(batch.requests)} seq_len:{batch.seq_length} padding:{batch.right_padding}",
@ -1325,15 +1330,14 @@ class CausalLM(Model):
# Warmup prefill batch_size
max_input_tokens = request.max_input_tokens
max_exp = math.ceil(math.log(max_prefill_batch_size, BATCH_SIZE_EXPONENT_BASE))
prefill_batch_size_list = [
batch
for batch in range(
PREFILL_BATCH_BUCKET_SIZE,
max_prefill_batch_size,
PREFILL_BATCH_BUCKET_SIZE,
BATCH_SIZE_EXPONENT_BASE**exp
for exp in range(
0,
max_exp + 1,
)
]
prefill_batch_size_list.append(max_prefill_batch_size)
prefill_seqlen_list = [
seq
for seq in range(
@ -1370,12 +1374,10 @@ class CausalLM(Model):
)
max_decode_batch_size = math.floor(MAX_BATCH_TOTAL_TOKENS / MAX_TOTAL_TOKENS)
max_decode_batch_size = round_up(max_decode_batch_size, BATCH_BUCKET_SIZE)
max_exp = math.ceil(math.log(max_decode_batch_size, BATCH_SIZE_EXPONENT_BASE))
decode_batch_size_list = [
i
for i in range(BATCH_BUCKET_SIZE, max_decode_batch_size, BATCH_BUCKET_SIZE)
BATCH_SIZE_EXPONENT_BASE**exp for exp in range(0, max_exp + 1)
]
decode_batch_size_list.append(max_decode_batch_size)
decode_batch_size_list.sort(reverse=True)
try:

View File

@ -10,7 +10,7 @@
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "3.2.1-dev0"
"version": "3.2.3-dev0"
},
"paths": {
"/": {

View File

@ -20,7 +20,7 @@ hf_token=YOUR_HF_ACCESS_TOKEN
docker run --runtime=habana --cap-add=sys_nice --ipc=host \
-p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \
ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \
--model-id $model
```
@ -41,40 +41,18 @@ curl 127.0.0.1:8080/generate \
## How-to Guides
### How to Run Specific Models
You can view the full list of supported models in the [Supported Models](https://huggingface.co/docs/text-generation-inference/backends/gaudi#supported-models) section.
The following models have been validated on Gaudi2:
| Model | Model ID | BF16 | | FP8 | |
|-----------------------|----------------------------------------|-------------|------------|-------------|------------|
| | | Single Card | Multi-Card | Single Card | Multi-Card |
| Llama2-7B | meta-llama/Llama-2-7b-chat-hf | ✔ | ✔ | ✔ | ✔ |
| Llama2-70B | meta-llama/Llama-2-70b-chat-hf | | ✔ | | ✔ |
| Llama3-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ |
| Llama3-70B | meta-llama/Meta-Llama-3-70B-Instruct | | ✔ | | ✔ |
| Llama3.1-8B | meta-llama/Meta-Llama-3.1-8B-Instruct | ✔ | ✔ | ✔ | ✔ |
| Llama3.1-70B | meta-llama/Meta-Llama-3.1-70B-Instruct | | ✔ | | ✔ |
| CodeLlama-13B | codellama/CodeLlama-13b-hf | ✔ | ✔ | ✔ | ✔ |
| Mixtral-8x7B | mistralai/Mixtral-8x7B-Instruct-v0.1 | ✔ | ✔ | ✔ | ✔ |
| Mistral-7B | mistralai/Mistral-7B-Instruct-v0.3 | ✔ | ✔ | ✔ | ✔ |
| Falcon-180B | tiiuae/falcon-180B-chat | | ✔ | | ✔ |
| Qwen2-72B | Qwen/Qwen2-72B-Instruct | | ✔ | | ✔ |
| Starcoder2-3b | bigcode/starcoder2-3b | ✔ | ✔ | ✔ | |
| Starcoder2-15b | bigcode/starcoder2-15b | ✔ | ✔ | ✔ | |
| Starcoder | bigcode/starcoder | ✔ | ✔ | ✔ | ✔ |
| Gemma-7b | google/gemma-7b-it | ✔ | ✔ | ✔ | ✔ |
| Llava-v1.6-Mistral-7B | llava-hf/llava-v1.6-mistral-7b-hf | ✔ | ✔ | ✔ | ✔ |
To run any of these models:
For example, to run Llama3.1-8B, you can use the following command:
```bash
model=MODEL_ID_THAT_YOU_WANT_TO_RUN
model=meta-llama/Meta-Llama-3.1-8B-Instruct
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
hf_token=YOUR_ACCESS_TOKEN
docker run --runtime=habana --cap-add=sys_nice --ipc=host \
-p 8080:80 -v $volume:/data -e HF_TOKEN=$hf_token \
ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \
ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \
--model-id $model
<text-generation-inference-launcher-arguments>
```
@ -87,7 +65,7 @@ The validated docker commands can be found in the [examples/docker_commands fold
### How to Enable Multi-Card Inference (Sharding)
TGI-Gaudi supports sharding for multi-card inference, allowing you to distribute the load across multiple Gaudi cards.
TGI-Gaudi supports sharding for multi-card inference, allowing you to distribute the load across multiple Gaudi cards. This is recommended for running large models and for speeding up inference.
For example, on a machine with 8 Gaudi cards, you can run:
@ -137,7 +115,7 @@ docker run -p 8080:80 \
-e BATCH_BUCKET_SIZE=256 \
-e PREFILL_BATCH_BUCKET_SIZE=4 \
-e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \
ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \
--model-id $model \
--sharded true --num-shard 8 \
--max-input-tokens 1024 --max-total-tokens 2048 \
@ -163,7 +141,7 @@ docker run -p 8080:80 \
-v $volume:/data \
-e PREFILL_BATCH_BUCKET_SIZE=1 \
-e BATCH_BUCKET_SIZE=1 \
ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \
ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \
--model-id $model \
--max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
--max-total-tokens 8192 --max-batch-size 4
@ -230,7 +208,7 @@ docker run --runtime=habana --ipc=host --cap-add=sys_nice \
-e PROF_PATH=/tmp/hpu_profile \
-e PROF_RANKS=0 \
-e PROF_RECORD_SHAPES=True \
ghcr.io/huggingface/text-generation-inference:3.2.1-gaudi \
ghcr.io/huggingface/text-generation-inference:3.2.3-gaudi \
--model-id $model
```
@ -270,6 +248,45 @@ Note: Model warmup can take several minutes, especially for FP8 inference. For f
This section contains reference information about the Gaudi backend.
### Supported Models
Text Generation Inference enables serving optimized models on Gaudi hardware. The following sections list which models (VLMs & LLMs) are supported on Gaudi.
**Large Language Models (LLMs)**
- [Llama2-7B](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
- [Llama2-70B](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf)
- [Llama3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- [Llama3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct)
- [Llama3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
- [Llama3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct)
- [CodeLlama-13B](https://huggingface.co/codellama/CodeLlama-13b-hf)
- [Opt-125m](https://huggingface.co/facebook/opt-125m)
- [OpenAI-gpt2](https://huggingface.co/openai-community/gpt2)
- [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
- [Qwen2-72B](https://huggingface.co/Qwen/Qwen2-72B-Instruct)
- [Qwen2-7B](https://huggingface.co/Qwen/Qwen2-7B-Instruct)
- [Phi-1.5](https://huggingface.co/microsoft/phi-1_5)
- [Gemma-7b](https://huggingface.co/google/gemma-7b-it)
- [Starcoder2-3b](https://huggingface.co/bigcode/starcoder2-3b)
- [Starcoder2-15b](https://huggingface.co/bigcode/starcoder2-15b)
- [Starcoder](https://huggingface.co/bigcode/starcoder)
- [falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)
- [Falcon-180B](https://huggingface.co/tiiuae/falcon-180B-chat)
- [GPT-2](https://huggingface.co/openai-community/gpt2)
- [gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)
**Vision-Language Models (VLMs)**
- [LLaVA-v1.6-Mistral-7B](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
- [Mllama (Multimodal Llama from Meta)](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b)
- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b)
- [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)
- [Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)
- [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)
We also support, on a best-effort basis, models with a different parameter count that use the same architecture as a supported model; these variants have not been tested. For example, the Gaudi backend supports `meta-llama/Llama-3.2-1B`, since it uses the standard Llama 3 architecture. If you have an issue with a model, please open an issue on the [Gaudi backend repository](https://github.com/huggingface/text-generation-inference/issues).
### Environment Variables
The following table contains the environment variables that can be used to configure the Gaudi backend:

View File

@ -31,7 +31,7 @@ deployment instructions in the model card:
The service is launched simply by running the text-generation-inference container with two sets of parameters:
```
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.2.1-neuron <service_parameters>
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:3.2.3-neuron <service_parameters>
```
- system parameters are used to map ports, volumes and devices between the host and the service,

View File

@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HF_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3 \
--model-id $model
```

View File

@ -19,7 +19,7 @@ bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize bitsandbytes
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model --quantize bitsandbytes
```
4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
@ -27,7 +27,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize bitsandbytes-nf4
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model --quantize bitsandbytes-nf4
```
You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
@ -48,7 +48,7 @@ $$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
TGI allows you to either run an already GPTQ-quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using the quantization script. You can run a quantized model by simply passing `--quantize` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $model --quantize gptq
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.2.3 --model-id $model --quantize gptq
```
Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
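To produce a GPTQ checkpoint yourself, the server ships a quantization script; a minimal sketch, where the model id and output directory are illustrative:

```bash
text-generation-server quantize tiiuae/falcon-40b /data/falcon-40b-gptq
```

The resulting directory can then be passed as `--model-id` together with `--quantize gptq`.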

View File

@ -11,7 +11,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
--device=/dev/kfd --device=/dev/dri --group-add video \
--ipc=host --shm-size 256g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.2.1-rocm \
ghcr.io/huggingface/text-generation-inference:3.2.3-rocm \
--model-id $model
```

View File

@ -12,7 +12,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.2.1-intel-xpu \
ghcr.io/huggingface/text-generation-inference:3.2.3-intel-xpu \
--model-id $model --cuda-graphs 0
```
@ -29,7 +29,7 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading
docker run --rm --privileged --cap-add=sys_nice \
--device=/dev/dri \
--ipc=host --shm-size 1g --net host -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.2.1-intel-cpu \
ghcr.io/huggingface/text-generation-inference:3.2.3-intel-cpu \
--model-id $model --cuda-graphs 0
```

View File

@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.2.1 \
ghcr.io/huggingface/text-generation-inference:3.2.3 \
--model-id $model
```

View File

@ -11,7 +11,7 @@ model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
ghcr.io/huggingface/text-generation-inference:3.2.1 \
ghcr.io/huggingface/text-generation-inference:3.2.3 \
--model-id $model
```
@ -96,7 +96,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
```bash
docker run ghcr.io/huggingface/text-generation-inference:3.2.1 --help
docker run ghcr.io/huggingface/text-generation-inference:3.2.3 --help
```
</Tip>

View File

@ -163,7 +163,7 @@ hub = {
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.1"),
image_uri=get_huggingface_llm_image_uri("huggingface",version="3.2.3"),
env=hub,
role=role,
)

View File

@ -9,6 +9,7 @@ Text Generation Inference enables serving optimized models. The following sectio
- [Idefics 3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (Multimodal)
- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
- [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
- [Llama4](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
- [Granite](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct)
- [Gemma](https://huggingface.co/google/gemma-7b)

View File

@ -853,11 +853,11 @@
]
},
"locked": {
"lastModified": 1742783666,
"narHash": "sha256-IwdSl51NL6V0f+mYXZR0UTKaGleOsk9zV3l6kt5SUWw=",
"lastModified": 1743993291,
"narHash": "sha256-u8GHvduU1gCtoFXvTS/wGjH1ouv5S/GRGq6MAT+sG/k=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "60766d63c227d576510ecfb5edd3a687d56f6bc7",
"rev": "0cb3c8979c65dc6a5812dfe67499a8c7b8b4325b",
"type": "github"
},
"original": {
@ -978,11 +978,11 @@
"nixpkgs": "nixpkgs_6"
},
"locked": {
"lastModified": 1742807335,
"narHash": "sha256-580CXhhCcK1cXRWahk/mDKo6zUsOD2JvNOg5SBBMAKc=",
"lastModified": 1743931123,
"narHash": "sha256-MDQrbJkweLYsMYh44Gx+c1gAZOCR1fmZF1lkavAHDto=",
"owner": "huggingface",
"repo": "text-generation-inference-nix",
"rev": "8d9ea4691f49369aa5d3230a51366ff6c744d658",
"rev": "1ad3feaadfdedca90278ee7676bca15019519189",
"type": "github"
},
"original": {

View File

@ -0,0 +1,613 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 100,
"prefill": [],
"seed": null,
"tokens": [
{
"id": 2721,
"logprob": -0.21582031,
"special": false,
"text": " people"
},
{
"id": 21807,
"logprob": -0.26953125,
"special": false,
"text": " died"
},
{
"id": 310,
"logprob": -0.95703125,
"special": false,
"text": " in"
},
{
"id": 290,
"logprob": -1.3359375,
"special": false,
"text": " the"
},
{
"id": 220,
"logprob": -1.3828125,
"special": false,
"text": " "
},
{
"id": 7284,
"logprob": -0.011291504,
"special": false,
"text": "191"
},
{
"id": 36,
"logprob": -0.011413574,
"special": false,
"text": "8"
},
{
"id": 18938,
"logprob": -0.23242188,
"special": false,
"text": " flu"
},
{
"id": 27650,
"logprob": -0.0010070801,
"special": false,
"text": " pandemic"
},
{
"id": 26,
"logprob": -0.69140625,
"special": false,
"text": "."
},
{
"id": 114059,
"logprob": -1.4375,
"special": false,
"text": " Estimating"
},
{
"id": 290,
"logprob": -0.24316406,
"special": false,
"text": " the"
},
{
"id": 10593,
"logprob": -0.37304688,
"special": false,
"text": " death"
},
{
"id": 49973,
"logprob": -0.025390625,
"special": false,
"text": " toll"
},
{
"id": 323,
"logprob": -0.27539062,
"special": false,
"text": " of"
},
{
"id": 290,
"logprob": -0.057617188,
"special": false,
"text": " the"
},
{
"id": 220,
"logprob": -0.040527344,
"special": false,
"text": " "
},
{
"id": 7284,
"logprob": -0.00050735474,
"special": false,
"text": "191"
},
{
"id": 36,
"logprob": -9.298325e-06,
"special": false,
"text": "8"
},
{
"id": 18938,
"logprob": -0.09863281,
"special": false,
"text": " flu"
},
{
"id": 27650,
"logprob": -0.0011749268,
"special": false,
"text": " pandemic"
},
{
"id": 373,
"logprob": -0.32421875,
"special": false,
"text": " is"
},
{
"id": 8210,
"logprob": -0.58203125,
"special": false,
"text": " difficult"
},
{
"id": 2895,
"logprob": -0.40429688,
"special": false,
"text": " because"
},
{
"id": 323,
"logprob": -1.2734375,
"special": false,
"text": " of"
},
{
"id": 49119,
"logprob": -0.51171875,
"special": false,
"text": " incomplete"
},
{
"id": 13308,
"logprob": -0.38085938,
"special": false,
"text": " records"
},
{
"id": 341,
"logprob": -0.55859375,
"special": false,
"text": " and"
},
{
"id": 2895,
"logprob": -0.765625,
"special": false,
"text": " because"
},
{
"id": 323,
"logprob": -1.0,
"special": false,
"text": " of"
},
{
"id": 290,
"logprob": -0.828125,
"special": false,
"text": " the"
},
{
"id": 2304,
"logprob": -1.015625,
"special": false,
"text": " fact"
},
{
"id": 511,
"logprob": -0.004638672,
"special": false,
"text": " that"
},
{
"id": 2233,
"logprob": -0.953125,
"special": false,
"text": " many"
},
{
"id": 323,
"logprob": -0.87890625,
"special": false,
"text": " of"
},
{
"id": 290,
"logprob": -0.60546875,
"special": false,
"text": " the"
},
{
"id": 6759,
"logprob": -1.6484375,
"special": false,
"text": " extra"
},
{
"id": 40657,
"logprob": -0.00022125244,
"special": false,
"text": " deaths"
},
{
"id": 1610,
"logprob": -0.67578125,
"special": false,
"text": " were"
},
{
"id": 702,
"logprob": -0.30664062,
"special": false,
"text": " not"
},
{
"id": 48692,
"logprob": -0.1953125,
"special": false,
"text": " attributed"
},
{
"id": 328,
"logprob": -0.0079956055,
"special": false,
"text": " to"
},
{
"id": 290,
"logprob": -0.515625,
"special": false,
"text": " the"
},
{
"id": 18938,
"logprob": -0.0040893555,
"special": false,
"text": " flu"
},
{
"id": 26,
"logprob": -0.083496094,
"special": false,
"text": "."
},
{
"id": 13618,
"logprob": -0.515625,
"special": false,
"text": " Many"
},
{
"id": 22215,
"logprob": -1.5703125,
"special": false,
"text": " experts"
},
{
"id": 11081,
"logprob": -0.96875,
"special": false,
"text": " believe"
},
{
"id": 511,
"logprob": -0.1171875,
"special": false,
"text": " that"
},
{
"id": 290,
"logprob": -0.25195312,
"special": false,
"text": " the"
},
{
"id": 220,
"logprob": -0.828125,
"special": false,
"text": " "
},
{
"id": 7284,
"logprob": -0.00010967255,
"special": false,
"text": "191"
},
{
"id": 36,
"logprob": -8.535385e-05,
"special": false,
"text": "8"
},
{
"id": 18938,
"logprob": -0.056152344,
"special": false,
"text": " flu"
},
{
"id": 27650,
"logprob": -0.0007095337,
"special": false,
"text": " pandemic"
},
{
"id": 26132,
"logprob": -0.18847656,
"special": false,
"text": " killed"
},
{
"id": 1867,
"logprob": -0.71484375,
"special": false,
"text": " between"
},
{
"id": 220,
"logprob": -0.0062561035,
"special": false,
"text": " "
},
{
"id": 1175,
"logprob": -0.009277344,
"special": false,
"text": "50"
},
{
"id": 341,
"logprob": -0.15332031,
"special": false,
"text": " and"
},
{
"id": 220,
"logprob": -8.34465e-07,
"special": false,
"text": " "
},
{
"id": 1135,
"logprob": -0.00065612793,
"special": false,
"text": "100"
},
{
"id": 5534,
"logprob": -1.4066696e-05,
"special": false,
"text": " million"
},
{
"id": 2721,
"logprob": -0.0008392334,
"special": false,
"text": " people"
},
{
"id": 26,
"logprob": -0.54296875,
"special": false,
"text": "."
},
{
"id": 372,
"logprob": -1.8046875,
"special": false,
"text": " I"
},
{
"id": 140680,
"logprob": -0.578125,
"special": false,
"text": "assistant"
},
{
"id": 200006,
"logprob": 0.0,
"special": true,
"text": "<|header_end|>"
},
{
"id": 368,
"logprob": 0.0,
"special": false,
"text": "\n\n"
},
{
"id": 954,
"logprob": -0.032226562,
"special": false,
"text": "The"
},
{
"id": 220,
"logprob": -4.4345856e-05,
"special": false,
"text": " "
},
{
"id": 7284,
"logprob": 0.0,
"special": false,
"text": "191"
},
{
"id": 36,
"logprob": 0.0,
"special": false,
"text": "8"
},
{
"id": 18938,
"logprob": -0.015625,
"special": false,
"text": " flu"
},
{
"id": 27650,
"logprob": 0.0,
"special": false,
"text": " pandemic"
},
{
"id": 24,
"logprob": -0.0072021484,
"special": false,
"text": ","
},
{
"id": 1437,
"logprob": -0.0001707077,
"special": false,
"text": " also"
},
{
"id": 5711,
"logprob": 0.0,
"special": false,
"text": " known"
},
{
"id": 486,
"logprob": 0.0,
"special": false,
"text": " as"
},
{
"id": 290,
"logprob": -5.9604645e-07,
"special": false,
"text": " the"
},
{
"id": 25836,
"logprob": -1.4305115e-06,
"special": false,
"text": " Spanish"
},
{
"id": 18938,
"logprob": -0.0015029907,
"special": false,
"text": " flu"
},
{
"id": 24,
"logprob": -0.0052490234,
"special": false,
"text": ","
},
{
"id": 373,
"logprob": -0.3125,
"special": false,
"text": " is"
},
{
"id": 26078,
"logprob": -0.21289062,
"special": false,
"text": " indeed"
},
{
"id": 1085,
"logprob": -0.080078125,
"special": false,
"text": " one"
},
{
"id": 323,
"logprob": 0.0,
"special": false,
"text": " of"
},
{
"id": 290,
"logprob": 0.0,
"special": false,
"text": " the"
},
{
"id": 2167,
"logprob": -0.20117188,
"special": false,
"text": " most"
},
{
"id": 92679,
"logprob": -0.12695312,
"special": false,
"text": " devastating"
},
{
"id": 854,
"logprob": -0.25976562,
"special": false,
"text": " public"
},
{
"id": 4500,
"logprob": 0.0,
"special": false,
"text": " health"
},
{
"id": 93079,
"logprob": -0.50390625,
"special": false,
"text": " crises"
},
{
"id": 310,
"logprob": 0.0,
"special": false,
"text": " in"
},
{
"id": 6023,
"logprob": -0.0015182495,
"special": false,
"text": " human"
},
{
"id": 7068,
"logprob": 0.0,
"special": false,
"text": " history"
},
{
"id": 26,
"logprob": -0.0012664795,
"special": false,
"text": "."
},
{
"id": 114059,
"logprob": -0.004119873,
"special": false,
"text": " Estimating"
},
{
"id": 290,
"logprob": -0.00033569336,
"special": false,
"text": " the"
},
{
"id": 6318,
"logprob": -0.20117188,
"special": false,
"text": " exact"
}
],
"top_tokens": null
},
"generated_text": " people died in the 1918 flu pandemic. Estimating the death toll of the 1918 flu pandemic is difficult because of incomplete records and because of the fact that many of the extra deaths were not attributed to the flu. Many experts believe that the 1918 flu pandemic killed between 50 and 100 million people. Iassistant\n\nThe 1918 flu pandemic, also known as the Spanish flu, is indeed one of the most devastating public health crises in human history. Estimating the exact"
}

View File

@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The image is a blank white space with no visible objects or features. It appears to be an empty or placeholder image, devoid of any content or visual elements.",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1743861910,
"id": "",
"model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.2.1-dev0-native",
"usage": {
"completion_tokens": 34,
"prompt_tokens": 166,
"total_tokens": 200
}
}

View File

@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The image is a blank white space with no visible objects or features.",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1743861909,
"id": "",
"model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.2.1-dev0-native",
"usage": {
"completion_tokens": 15,
"prompt_tokens": 166,
"total_tokens": 181
}
}

View File

@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "The image is a black background with no discernible objects or features. The image appears to be a blank or empty space, devoid of any visual elements.\n\n**Key Features:**\n\n* **Color:** The dominant color of the image is black.\n* **Objects:** There are no visible objects or shapes in the image.\n* **Background:** The background of the image is a solid black color.\n\n**Conclusion:**\nIn summary, the image is a simple and empty visual representation with a black background and no",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1743861909,
"id": "",
"model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.2.1-dev0-native",
"usage": {
"completion_tokens": 100,
"prompt_tokens": 166,
"total_tokens": 266
}
}

View File

@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The image shows a brown cow standing on the beach with a white face and black and white marking on its ears. The cow has a white patch around its nose and mouth. The ocean and blue sky are in the background.",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1743863057,
"id": "",
"model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.2.1-dev0-native",
"usage": {
"completion_tokens": 46,
"prompt_tokens": 164,
"total_tokens": 210
}
}

View File

@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The image does not depict a dog; it shows a cow standing on a beach. Therefore, there is no breed of a dog to identify.",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1743863056,
"id": "",
"model": "ll-re/Llama-4-Scout-17B-16E-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.2.1-dev0-native",
"usage": {
"completion_tokens": 30,
"prompt_tokens": 168,
"total_tokens": 198
}
}

View File

@ -0,0 +1,155 @@
# import base64
# from io import BytesIO
# from PIL import Image
#
# import pytest
#
#
# @pytest.fixture(scope="module")
# def flash_llama4_handle(launcher):
# with launcher("ll-re/Llama-4-Scout-17B-16E-Instruct", num_shard=8) as handle:
# yield handle
#
#
# @pytest.fixture(scope="module")
# async def flash_llama4(flash_llama4_handle):
# await flash_llama4_handle.health(300)
# return flash_llama4_handle.client
#
#
# async def test_flash_llama4(flash_llama4, response_snapshot):
# response = await flash_llama4.generate(
# "Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many",
# seed=42,
# max_new_tokens=100,
# )
#
# assert (
# response.generated_text
# == " people died in the 1918 flu pandemic. Estimating the death toll of the 1918 flu pandemic is difficult because of incomplete records and because of the fact that many of the extra deaths were not attributed to the flu. Many experts believe that the 1918 flu pandemic killed between 50 and 100 million people. Iassistant\n\nThe 1918 flu pandemic, also known as the Spanish flu, is indeed one of the most devastating public health crises in human history. Estimating the exact"
# )
# assert response.details.generated_tokens == 100
# assert response == response_snapshot
#
#
# async def test_flash_llama4_image_cow_dog(flash_llama4, response_snapshot):
# image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
# response = await flash_llama4.chat(
# seed=42,
# messages=[
# {
# "role": "user",
# "content": [
# {"type": "image_url", "image_url": {"url": image_url}},
# {
# "type": "text",
# "text": "What is the breed of the dog in the image?",
# },
# ],
# },
# ],
# max_tokens=100,
# )
#
# assert (
# response.choices[0].message.content
# == "The image does not depict a dog; it shows a cow standing on a beach. Therefore, there is no breed of a dog to identify."
# )
# assert response.usage["completion_tokens"] == 30
# assert response == response_snapshot
#
#
# async def test_flash_llama4_image_cow(flash_llama4, response_snapshot):
# image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
# response = await flash_llama4.chat(
# seed=42,
# messages=[
# {
# "role": "user",
# "content": [
# {"type": "image_url", "image_url": {"url": image_url}},
# {"type": "text", "text": "What is shown in this image?"},
# ],
# },
# ],
# max_tokens=100,
# )
# assert (
# response.choices[0].message.content
# == "The image shows a brown cow standing on the beach with a white face and black and white marking on its ears. The cow has a white patch around its nose and mouth. The ocean and blue sky are in the background."
# )
# assert response.usage["completion_tokens"] == 46
# assert response == response_snapshot
#
#
# # Helper function to convert a Pillow image to a base64 data URL
# def image_to_data_url(img: Image.Image, fmt: str) -> str:
# buffer = BytesIO()
# img.save(buffer, format=fmt)
# img_data = buffer.getvalue()
# b64_str = base64.b64encode(img_data).decode("utf-8")
# mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg"
# return f"data:{mime_type};base64,{b64_str}"
#
#
# async def test_flash_llama4_image_base64_rgba(flash_llama4, response_snapshot):
# # Create an empty 100x100 PNG image with alpha (transparent background)
# img = Image.new("RGBA", (100, 100), (0, 0, 0, 0))
# data_url = image_to_data_url(img, "PNG")
# response = await flash_llama4.chat(
# seed=42,
# messages=[
# {
# "role": "user",
# "content": [
# {"type": "image_url", "image_url": {"url": data_url}},
# {
# "type": "text",
# "text": "What do you see in this transparent image?",
# },
# ],
# },
# ],
# max_tokens=100,
# )
# assert response == response_snapshot
#
#
# async def test_flash_llama4_image_base64_rgb_png(flash_llama4, response_snapshot):
# # Create an empty 100x100 PNG image without alpha (white background)
# img = Image.new("RGB", (100, 100), (255, 255, 255))
# data_url = image_to_data_url(img, "PNG")
# response = await flash_llama4.chat(
# seed=42,
# messages=[
# {
# "role": "user",
# "content": [
# {"type": "image_url", "image_url": {"url": data_url}},
# {"type": "text", "text": "What do you see in this plain image?"},
# ],
# },
# ],
# max_tokens=100,
# )
# assert response == response_snapshot
#
#
# async def test_flash_llama4_image_base64_rgb_jpg(flash_llama4, response_snapshot):
# # Create an empty 100x100 JPEG image (white background)
# img = Image.new("RGB", (100, 100), (255, 255, 255))
# data_url = image_to_data_url(img, "JPEG")
# response = await flash_llama4.chat(
# seed=42,
# messages=[
# {
# "role": "user",
# "content": [
# {"type": "image_url", "image_url": {"url": data_url}},
# {"type": "text", "text": "What do you see in this JPEG image?"},
# ],
# },
# ],
# max_tokens=100,
# )
# assert response == response_snapshot

View File

@ -1709,16 +1709,16 @@ impl From<&str> for Gpu {
impl std::fmt::Display for Gpu {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Gpu::RTX4090 => write!(f, "nvida-4090"),
Gpu::T4 => write!(f, "nvida-t4"),
Gpu::L4 => write!(f, "nvida-l4"),
Gpu::L40 => write!(f, "nvida-l40"),
Gpu::L40S => write!(f, "nvida-l40s"),
Gpu::RTX4090 => write!(f, "nvidia-4090"),
Gpu::T4 => write!(f, "nvidia-t4"),
Gpu::L4 => write!(f, "nvidia-l4"),
Gpu::L40 => write!(f, "nvidia-l40"),
Gpu::L40S => write!(f, "nvidia-l40s"),
Gpu::A10G => write!(f, "nvidia-a10g"),
Gpu::A40 => write!(f, "nvidia-a40"),
Gpu::H100 => write!(f, "nvidia-h100-80fb-hbm3"),
Gpu::A100 => write!(f, "nvida-a100-sxm4-80gb"),
Gpu::H200 => write!(f, "nvida-h200"),
Gpu::A100 => write!(f, "nvidia-a100-sxm4-80gb"),
Gpu::H200 => write!(f, "nvidia-h200"),
Gpu::Unknown(card) => write!(f, "{}", card),
}
}

View File

@ -18,8 +18,18 @@ final: prev: {
src = final.fetchFromGitHub {
owner = "huggingface";
repo = "transformers";
rev = "v4.50.0";
hash = "sha256-/scrMPUY43n+XAMbwWCtmiJKXscXGLrklyDg9XZTaqw=";
rev = "v4.51.0";
hash = "sha256-dnVpc6fm1SYGcx7FegpwVVxUY6XRlsxLs5WOxYv11y8=";
};
}
);
huggingface-hub = python-super.huggingface-hub.overrideAttrs (
_: _: {
src = final.fetchFromGitHub {
owner = "huggingface";
repo = "huggingface_hub";
rev = "v0.30.0";
hash = "sha256-sz+n1uoWrSQPqJFiG/qCT6b4r08kD9MsoPZXbfWNB2o=";
};
}
);

View File

@ -17,6 +17,7 @@
grpcio-status,
grpcio-tools,
hf-transfer,
hf-xet,
kernels,
loguru,
mamba-ssm,
@ -92,6 +93,7 @@ buildPythonPackage {
grpcio-status
grpcio-tools
hf-transfer
hf-xet
kernels
loguru
mamba-ssm

View File

@ -1,4 +1,5 @@
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "model_type")]
@ -103,6 +104,141 @@ impl LlavaNext {
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Llama4VisionConfig {
image_size: usize,
patch_size: usize,
pixel_shuffle_ratio: f64,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Llama4 {
text_config: TextConfig,
vision_config: Llama4VisionConfig,
}
fn gcd(a: usize, b: usize) -> usize {
if b == 0 {
a
} else {
gcd(b, a % b)
}
}
fn get_factors(dividend: usize) -> HashSet<usize> {
let mut factors_set = HashSet::new();
for i in 1..=((dividend as f64).sqrt() as usize) {
if dividend % i == 0 {
factors_set.insert(i);
factors_set.insert(dividend / i);
}
}
factors_set
}
fn find_supported_resolutions(max_num_chunks: usize, height: usize) -> Vec<(usize, usize)> {
let patch_size = height;
let mut asp_dict: HashMap<(usize, usize), Vec<(usize, usize)>> = HashMap::new();
for chunk_size in (1..=max_num_chunks).rev() {
let mut _factors: Vec<_> = get_factors(chunk_size).into_iter().collect();
_factors.sort();
let _asp_ratios: Vec<(usize, usize)> =
_factors.iter().map(|&f| (f, chunk_size / f)).collect();
for (h, w) in _asp_ratios {
let divisor = gcd(h, w);
let key = (h / divisor, w / divisor); // reduced aspect ratio as key
asp_dict.entry(key).or_default().push((h, w));
}
}
let mut possible_resolutions = vec![];
for (_key, value) in asp_dict {
for (h, w) in value {
possible_resolutions.push((h * patch_size, w * patch_size));
}
}
possible_resolutions
}
fn get_best_fit(
original_height: usize,
original_width: usize,
possible_resolutions: &[(usize, usize)],
resize_to_max_canvas: bool,
) -> (usize, usize) {
let orig_h = original_height as f32;
let orig_w = original_width as f32;
let mut scales = Vec::with_capacity(possible_resolutions.len());
for &(h, w) in possible_resolutions.iter() {
let scale_h = h as f32 / orig_h;
let scale_w = w as f32 / orig_w;
let scale = scale_h.min(scale_w);
scales.push(scale);
}
let upscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s >= 1.0).collect();
let selected_scale = if !upscaling_options.is_empty() {
if resize_to_max_canvas {
upscaling_options.into_iter().fold(f32::MIN, f32::max)
} else {
upscaling_options.into_iter().fold(f32::MAX, f32::min)
}
} else {
let downscaling_options: Vec<f32> = scales.iter().copied().filter(|&s| s < 1.0).collect();
downscaling_options.into_iter().fold(f32::MIN, f32::max)
};
let chosen_canvas: Vec<(usize, usize)> = possible_resolutions
.iter()
.zip(scales.iter())
.filter(|&(_, &s)| (s - selected_scale).abs() < f32::EPSILON)
.map(|(&(h, w), _)| (h, w))
.collect();
if chosen_canvas.len() > 1 {
chosen_canvas
.into_iter()
.min_by_key(|(h, w)| h * w)
.unwrap()
} else {
chosen_canvas[0]
}
}
impl Llama4 {
pub fn image_size(&self) -> usize {
self.vision_config.image_size
}
pub fn patch_size(&self) -> usize {
self.vision_config.patch_size
}
pub fn pixel_shuffle_ratio(&self) -> f64 {
self.vision_config.pixel_shuffle_ratio
}
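// Illustrative example (numbers assumed, not taken from a real config): with
// image_size = 336 and max_chunks = 15, a 300x600 input is best covered by a
// 336x672 canvas, so get_aspect_ratios(300, 600) returns (1, 2).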
pub fn get_aspect_ratios(&self, height: usize, width: usize) -> (usize, usize) {
let patch_size = self.vision_config.image_size;
// How to avoid hardcoding this?
let max_chunks = 15;
let supported = find_supported_resolutions(max_chunks, patch_size);
let (target_h, target_w) = get_best_fit(height, width, &supported, false);
(target_h / patch_size, target_w / patch_size)
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct ClipVisionModel {
@ -187,20 +323,20 @@ impl Qwen2Vl {
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct Qwen2_5VlVisionConfig {
pub(crate) depth: usize,
pub(crate) hidden_act: String,
pub(crate) hidden_size: usize,
pub(crate) intermediate_size: usize,
pub(crate) num_heads: usize,
pub(crate) in_chans: usize,
pub(crate) out_hidden_size: usize,
pub(crate) patch_size: usize,
pub(crate) spatial_merge_size: usize,
// pub(crate) depth: usize,
// pub(crate) hidden_act: String,
// pub(crate) hidden_size: usize,
// pub(crate) intermediate_size: usize,
// pub(crate) num_heads: usize,
// pub(crate) in_chans: usize,
// pub(crate) out_hidden_size: usize,
// pub(crate) patch_size: usize,
// pub(crate) spatial_merge_size: usize,
pub(crate) spatial_patch_size: usize,
pub(crate) window_size: usize,
pub(crate) fullatt_block_indexes: Vec<usize>,
pub(crate) tokens_per_second: usize,
pub(crate) temporal_patch_size: usize,
// pub(crate) window_size: usize,
// pub(crate) fullatt_block_indexes: Vec<usize>,
// pub(crate) tokens_per_second: usize,
// pub(crate) temporal_patch_size: usize,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
@ -212,7 +348,7 @@ pub struct Qwen2_5Vl {
impl Qwen2_5Vl {
pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
let num_pixels = height * width;
num_pixels / self.vision_config.patch_size.pow(2)
num_pixels / self.vision_config.spatial_patch_size.pow(2)
}
}
@ -258,6 +394,7 @@ pub enum Config {
Phi3,
Phimoe,
Llama,
Llama4(Llama4),
Baichuan,
Paligemma(Paligemma),
Gemma,

View File

@ -179,6 +179,7 @@ pub enum HubPreprocessorConfig {
Idefics2Processor(Idefics2Preprocessor),
Idefics3Processor(Idefics2Preprocessor),
Gemma3Processor(Gemma3Processor),
Llama4Processor(Llama4Processor),
}
impl HubPreprocessorConfig {
@ -200,6 +201,12 @@ pub struct Gemma3Processor {
do_image_splitting: bool,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Llama4Processor {
#[serde(default)]
do_image_splitting: bool,
}
#[derive(Debug, Clone, Deserialize, Default)]
pub struct HubProcessorConfig {
pub chat_template: Option<ChatTemplateVersions>,

View File

@ -74,11 +74,8 @@ fn encoding_to_tokens(encoding: &tokenizers::Encoding, input: &str) -> Vec<Simpl
.iter()
.zip(offsets)
.map(|(&id, &(start, stop))| {
let text = input
.chars()
.skip(start)
.take(stop - start)
.collect::<String>();
let text: Vec<u8> = input.bytes().skip(start).take(stop - start).collect();
let text: String = String::from_utf8_lossy(&text).to_string();
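// The (start, stop) offsets are byte indices, so the previous char-based
// slicing could mis-address multi-byte UTF-8 input; slicing the bytes and
// decoding lossily keeps token/text alignment correct.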
SimpleToken {
id,
text,

View File

@ -566,7 +566,7 @@ fn fetch_image(input: &str) -> Result<(Vec<u8>, String, usize, usize), Validatio
return Err(ValidationError::InvalidImageContent(content.to_string()));
}
let data = STANDARD.decode(content["base64,".len()..].as_bytes())?;
let data = STANDARD.decode(&content["base64,".len()..])?;
let img = if let Some(format) = format_from_mimetype(mimetype) {
ImageReader::with_format(Cursor::new(&data), format).decode()?
} else {
@ -603,7 +603,7 @@ fn image_tokens(
let mut image_string = String::with_capacity(2 * FAKE.len() + slots * IMAGE.len());
image_string.push_str(FAKE);
image_string.extend(iter::repeat(IMAGE).take(slots));
image_string.extend(iter::repeat_n(IMAGE, slots));
image_string.push_str(FAKE);
if matches!(
@ -687,6 +687,47 @@ fn image_tokens(
}
Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
Llama4(config) => {
const IMAGE_START: &str = "<|image_start|>";
const IMAGE: &str = "<|image|>";
const IMAGE_END: &str = "<|image_end|>";
const PATCH: &str = "<|patch|>";
const TILE_X_SEP: &str = "<|tile_x_separator|>";
const TILE_Y_SEP: &str = "<|tile_y_separator|>";
let image_height = config.image_size();
let patch_size = config.patch_size();
let pixel_shuffle_ratio = config.pixel_shuffle_ratio();
let downsample_ratio =
(1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize;
let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width);
let image_width = image_height; // Assuming pixel shape: [H][W][C]
let num_patches_per_chunk =
(image_height / patch_size) * (image_width / patch_size) / downsample_ratio;
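// Illustrative numbers (assumed): with image_size = 336, patch_size = 14 and
// pixel_shuffle_ratio = 0.5, downsample_ratio = 4 and each chunk contributes
// 24 * 24 / 4 = 144 <|patch|> tokens.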
let mut img_string = String::new();
img_string.push_str(IMAGE_START);
if ratio_h * ratio_w > 1 {
for _yy in 0..ratio_h {
for xx in 0..ratio_w {
img_string.push_str(&PATCH.repeat(num_patches_per_chunk));
if xx < ratio_w - 1 {
img_string.push_str(TILE_X_SEP);
}
}
img_string.push_str(TILE_Y_SEP);
}
}
img_string.push_str(IMAGE);
img_string.push_str(&PATCH.repeat(num_patches_per_chunk));
img_string.push_str(IMAGE_END);
img_string
}
Qwen2Vl(config) => format!(
"<|vision_start|>{:?}<|vision_end|>",
"<|image_pad|>".repeat(config.get_number_of_features(height, width))
@ -730,8 +771,8 @@ fn prepare_input<T: TokenizerTrait>(
static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
let (tokenizer_query, input_chunks) = match config {
Some(
config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Paligemma(_)
| LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)),
config @ (Idefics | Mllama | Idefics2(_) | Idefics3(_) | Gemma3(_) | Llama4(_)
| Paligemma(_) | LlavaNext(_) | Qwen2Vl(_) | Qwen2_5Vl(_)),
) => {
let mut input_chunks = Vec::new();
let mut tokenizer_query = String::with_capacity(inputs.len());

View File

@ -6,8 +6,7 @@ build-vllm-rocm:
git clone https://github.com/mht-sharma/vllm.git vllm; \
fi
cd vllm && git fetch && git checkout $(commit_rocm) && \
PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
PYTORCH_ROCM_ARCH="gfx90a;gfx942" python3 setup.py bdist_wheel --dist-dir=dist
install-vllm-rocm: build-vllm-rocm
cd vllm && git fetch && git checkout $(commit_rocm) && \
PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .
cd vllm && git fetch && git checkout $(commit_rocm)

View File

@ -31,8 +31,22 @@ dependencies = [
"sentencepiece>=0.2.0",
"tokenizers>=0.20.3",
"typer>=0.15.1",
"transformers>=4.49.0",
"huggingface-hub>=0.29.0",
"transformers>=4.51.0",
"huggingface-hub>=0.30.1",
"hf-xet>=1.0.0",
]
[[tool.uv.index]]
name = "pytorch-cu124"
url = "https://download.pytorch.org/whl/cu124"
explicit = true
[tool.uv.sources]
torch = [
{ index = "pytorch-cu124", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
torchvision = [
{ index = "pytorch-cu124", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
[build-system]
@ -77,6 +91,10 @@ gen = [
"grpcio-tools>=1.69.0",
"mypy-protobuf>=3.6.0",
]
torch = [
"torch==2.6.0",
"torchvision==0.21.0",
]
[tool.pytest.ini_options]
markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]

View File

@ -206,7 +206,13 @@ try:
from text_generation_server.models.transformers_flash_causal_lm import (
TransformersFlashCausalLM,
)
except ImportError:
from text_generation_server.models.transformers_flash_vlm import (
TransformersFlashVlmCausalLM,
TransformersGemma3VlmCausalLM,
TransformersLlama4VlmCausalLM,
)
except ImportError as e:
log_master(logger.warning, f"Could not import Flash Transformers Backend: {e}")
FLASH_TRANSFORMERS_BACKEND = False
@ -244,6 +250,11 @@ class ModelType(enum.Enum):
"name": "Llama",
"url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f",
}
LLAMA4 = {
"type": "llama4",
"name": "Llama4",
"url": "https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f",
}
PHI3 = {
"type": "phi3",
"name": "Phi 3",
@ -648,7 +659,6 @@ def get_model(
raise ValueError(
f"The backend {SYSTEM} does not support sliding window attention that is used by the model type {model_type}. To use this model nonetheless with the {SYSTEM} backend, please launch TGI with the argument `--max-input-tokens` smaller than sliding_window={sliding_window} (got here max_input_tokens={max_input_tokens})."
)
if model_type == DEEPSEEK_V2:
if FLASH_ATTENTION:
head_size = max(
@ -1017,7 +1027,23 @@ def get_model(
dtype=dtype,
trust_remote_code=trust_remote_code,
)
elif model_type == LLAMA4:
if FLASH_TRANSFORMERS_BACKEND:
from transformers import Llama4ForConditionalGeneration as Llama4Model
return TransformersLlama4VlmCausalLM.fallback(
model_id,
Llama4Model,
revision,
quantize=quantize,
speculator=speculator,
dtype=torch.bfloat16,
trust_remote_code=trust_remote_code,
processor_kwargs={
"use_fast": True,
"size": {"height": 336, "width": 336},
},
)
elif model_type == BAICHUAN:
if FLASH_ATTENTION:
return FlashCausalLM(
@ -1155,7 +1181,6 @@ def get_model(
)
elif model_type == GEMMA3:
if FLASH_ATTENTION:
# TODO: Use VlmCausalLM when image support is added.
return VlmCausalLM(
model_id=model_id,
model_class=Gemma3ForConditionalGeneration,
@ -1173,12 +1198,15 @@ def get_model(
lora_adapter_ids=lora_adapter_ids,
)
elif FLASH_TRANSFORMERS_BACKEND:
return TransformersFlashCausalLM.fallback(
from transformers import Gemma3ForConditionalGeneration as Gemma3Model
return TransformersGemma3VlmCausalLM.fallback(
model_id,
Gemma3Model,
revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
dtype=torch.bfloat16,
trust_remote_code=trust_remote_code,
)
elif sharded:
@ -1483,33 +1511,65 @@ def get_model(
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
if model_type == QWEN2_VL:
return VlmCausalLM(
model_id=model_id,
model_class=Qwen2VLForConditionalGeneration,
revision=revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
default_dtype=torch.bfloat16,
kv_cache_dtype=kv_cache_dtype,
trust_remote_code=trust_remote_code,
lora_adapter_ids=lora_adapter_ids,
)
if FLASH_ATTENTION:
return VlmCausalLM(
model_id=model_id,
model_class=Qwen2VLForConditionalGeneration,
revision=revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
default_dtype=torch.bfloat16,
kv_cache_dtype=kv_cache_dtype,
trust_remote_code=trust_remote_code,
lora_adapter_ids=lora_adapter_ids,
)
# TODO: Uncomment when transformers is refactored
# elif FLASH_TRANSFORMERS_BACKEND:
# from transformers import Qwen2VLForConditionalGeneration as Qwen2VLModel
# return TransformersQwen2VlmCausalLM.fallback(
# model_id,
# Qwen2VLModel,
# revision,
# quantize=quantize,
# speculator=speculator,
# dtype=torch.bfloat16,
# trust_remote_code=trust_remote_code,
# )
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Qwen2_VL"))
if model_type == QWEN2_5_VL:
return VlmCausalLM(
model_id=model_id,
model_class=Qwen2_5VLForConditionalGeneration,
revision=revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
default_dtype=torch.bfloat16,
kv_cache_dtype=kv_cache_dtype,
trust_remote_code=trust_remote_code,
lora_adapter_ids=lora_adapter_ids,
config_class=Qwen2_5_VLConfig,
processor_class=Qwen2_5_VLProcessor,
)
if FLASH_ATTENTION:
return VlmCausalLM(
model_id=model_id,
model_class=Qwen2_5VLForConditionalGeneration,
revision=revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
default_dtype=torch.bfloat16,
kv_cache_dtype=kv_cache_dtype,
trust_remote_code=trust_remote_code,
lora_adapter_ids=lora_adapter_ids,
config_class=Qwen2_5_VLConfig,
processor_class=Qwen2_5_VLProcessor,
)
# TODO: Uncomment when transformers is refactored
# elif FLASH_TRANSFORMERS_BACKEND:
# return TransformersQwen2VlmCausalLM.fallback(
# model_id,
# Qwen2VLModel,
# revision,
# quantize=quantize,
# speculator=speculator,
# dtype=torch.bfloat16,
# trust_remote_code=trust_remote_code,
# config_class=Qwen2_5_VLConfig,
# processor_class=Qwen2_5_VLProcessor,
# )
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Qwen2_5_VL"))
if model_type == MLLAMA:
if FLASH_ATTENTION:
return MllamaCausalLM(
@ -1524,6 +1584,20 @@ def get_model(
trust_remote_code=trust_remote_code,
lora_adapter_ids=lora_adapter_ids,
)
# TODO: Uncomment when transformers is refactored and cross attn is added
# elif FLASH_TRANSFORMERS_BACKEND:
# from transformers import MllamaForConditionalGeneration as MllamaModel
# return TransformersFlashVlmCausalLM.fallback(
# model_id,
# MllamaModel,
# revision,
# quantize=quantize,
# speculator=speculator,
# dtype=torch.bfloat16,
# trust_remote_code=trust_remote_code,
# batch_class=MllamaCausalLMBatch,
# )
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Mllama"))
if model_type == IDEFICS2:
@ -1542,6 +1616,19 @@ def get_model(
# VRAM usage.
processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}},
)
elif FLASH_TRANSFORMERS_BACKEND:
from transformers import Idefics2ForConditionalGeneration as Idefics2Model
return TransformersFlashVlmCausalLM.fallback(
model_id,
Idefics2Model,
revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
trust_remote_code=trust_remote_code,
processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}},
)
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
if model_type == IDEFICS3:
@ -1560,6 +1647,19 @@ def get_model(
# VRAM usage.
processor_kwargs={"size": {"longest_edge": 1456}},
)
elif FLASH_TRANSFORMERS_BACKEND:
from transformers import Idefics3ForConditionalGeneration as Idefics3Model
return TransformersFlashVlmCausalLM.fallback(
model_id,
Idefics3Model,
revision,
quantize=quantize,
speculator=speculator,
dtype=torch.bfloat16,
trust_remote_code=trust_remote_code,
processor_kwargs={"size": {"longest_edge": 1456}},
)
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
if model_type == PALIGEMMA:
@ -1578,9 +1678,21 @@ def get_model(
lora_adapter_ids=lora_adapter_ids,
batch_class=PaliGemmaBatch,
)
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
elif FLASH_TRANSFORMERS_BACKEND:
from transformers import PaliGemmaForConditionalGeneration as PaliGemmaModel
return TransformersFlashVlmCausalLM.fallback(
model_id,
PaliGemmaModel,
revision,
quantize=quantize,
speculator=speculator,
dtype=torch.bfloat16,
trust_remote_code=trust_remote_code,
batch_class=PaliGemmaBatch,
)
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("PaliGemma"))
if model_type == LLAVA_NEXT:
if FLASH_ATTENTION:
return VlmCausalLM(
@ -1593,6 +1705,18 @@ def get_model(
kv_cache_dtype=kv_cache_dtype,
trust_remote_code=trust_remote_code,
)
elif FLASH_TRANSFORMERS_BACKEND:
from transformers import LlavaNextForConditionalGeneration as LlavaNextModel
return TransformersFlashVlmCausalLM.fallback(
model_id,
LlavaNextModel,
revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
trust_remote_code=trust_remote_code,
)
else:
raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("LlavaNext"))

View File

@ -1344,9 +1344,6 @@ class FlashCausalLM(Model):
def batch_type(self) -> Type[FlashCausalLMBatch]:
return FlashCausalLMBatch
def max_past(self) -> int:
return getattr(self.model, "max_past", None)
def init_kv_cache(
self,
num_blocks: int,
@ -1792,12 +1789,6 @@ class FlashCausalLM(Model):
max_s = batch.max_current_length
lm_head_indices = batch.prefill_head_indices
if cu_seqlen_prefill is None and self.max_past() is not None:
# In decode, not prefill, we're actually overwriting the KV-cache
# in a circular buffer mode.
# This makes sure the max_s for the decode pass is correct.
max_s = min(self.max_past(), max_s)
bs = input_ids.shape[0]
sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
if sorted_padded_bs:

View File

@ -256,12 +256,6 @@ class MllamaCausalLM(VlmCausalLM):
max_s = batch.max_current_length
lm_head_indices = batch.prefill_head_indices
if cu_seqlen_prefill is None and self.max_past() is not None:
# In decode, not prefill, we're actually overwriting the KV-cache
# in a circular buffer mode.
# This makes sure the max_s for the decode pass is correct.
max_s = min(self.max_past(), max_s)
# Try to find an associated cuda graph
bs = input_ids.shape[0]
sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])

View File

@ -36,6 +36,7 @@ def tgi_flash_attention_forward(
softcap: Optional[float] = None,
**kwargs, # This is needed to "absorb" other args passed by Transformers modeling
):
kv_cache = kv_cache[module.layer_idx]
query_states = query_states.transpose(1, 2).squeeze(dim=0)
key_states = key_states.transpose(1, 2).squeeze(dim=0)
@ -72,6 +73,7 @@ def tgi_flash_attention_forward(
max_s,
kv_scales=kv_scales,
softcap=softcap,
window_size_left=sliding_window,
)
attn_output = attn_output.view(-1, num_heads * head_dim)
@ -157,7 +159,14 @@ class TransformersFlashCausalLM(FlashCausalLM):
self.num_layers = model.config.num_hidden_layers
self.num_heads = model.config.num_attention_heads
self.num_kv_heads = model.config.num_key_value_heads
self.head_size = model.config.hidden_size // model.config.num_attention_heads
# Some models (e.g. GQA models where o_proj and q_proj sizes differ)
# define an explicit head_dim; prefer it over hidden_size // num_heads.
if hasattr(model.config, "head_dim"):
self.head_size = model.config.head_dim
else:
self.head_size = (
model.config.hidden_size // model.config.num_attention_heads
)
# Skip it for models in the exception list
if model.config.model_type not in REPLICATED_ATTENTION_MODELS:

View File

@ -0,0 +1,566 @@
import math
from typing import List, Optional
import torch
from opentelemetry import trace
from transformers import AutoTokenizer, AutoProcessor
import transformers.modeling_utils
from text_generation_server.models.flash_causal_lm import FlashCausalLM
from text_generation_server.models.vlm_causal_lm import VlmCausalLM, VlmCausalLMBatch
from text_generation_server.utils import initialize_torch_distributed
from text_generation_server.layers.attention import paged_attention, attention, Seqlen
from text_generation_server.layers.attention.kv_cache import KVScales, KVCache
from text_generation_server.models.globals import ATTENTION
import torch.nn.functional as F
tracer = trace.get_tracer(__name__)
# The base TP plan of these models has replicated q/k/v. This means that each process will see the full states,
# hence we should not divide the number of heads by the world size. This is a known waste of VRAM (the cache
# will be fully replicated on each process) and GPU communication (additional all-gather operations), however due
# to internal constraints it was not (yet?) possible to circumvent this.
REPLICATED_ATTENTION_MODELS = [
"olmo2",
"phi3",
]
# # Qwen2VL
# transformers.models.qwen2_vl.modeling_qwen2_vl.QWEN2_VL_VISION_ATTENTION_CLASSES[
# "tgi"
# ] = transformers.models.qwen2_vl.modeling_qwen2_vl.QWEN2_VL_VISION_ATTENTION_CLASSES[
# "eager"
# ]
def tgi_flash_attention_forward(
module,
query_states: torch.Tensor,
key_states: torch.Tensor,
value_states: torch.Tensor,
attention_mask: Optional[torch.Tensor], # This is a positional arg in Transformers
kv_cache: List[KVCache],
kv_head_mapping: torch.Tensor,
slots: torch.Tensor,
cu_seqlen_prefill: Optional[torch.Tensor],
seqlen: Seqlen,
block_tables: torch.Tensor,
max_s: int,
kv_scales: KVScales,
softmax_scale: Optional[float] = None,
sliding_window: Optional[int] = None,
softcap: Optional[float] = None,
use_sdpa: Optional[bool] = False,
**kwargs, # This is needed to "absorb" other args passed by Transformers modeling
):
kv_cache = kv_cache[module.layer_idx]
query_states = query_states.transpose(1, 2).squeeze(dim=0)
key_states = key_states.transpose(1, 2).squeeze(dim=0)
value_states = value_states.transpose(1, 2).squeeze(dim=0)
# Take care of updating the cache in-place
kv_cache.store(key=key_states, value=value_states, slots=slots, kv_scales=kv_scales)
_, num_heads, head_dim = query_states.shape
softmax_scale = 1 / math.sqrt(head_dim) if softmax_scale is None else softmax_scale
sliding_window = -1 if sliding_window is None else sliding_window
if cu_seqlen_prefill is not None:
if not use_sdpa:
attn_output = attention(
query=query_states,
key=key_states,
value=value_states,
kv_cache=kv_cache,
kv_scales=kv_scales,
seqlen=seqlen,
block_tables=block_tables,
softmax_scale=softmax_scale,
window_size_left=sliding_window,
softcap=softcap,
)
else:
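# SDPA fallback: pack the ragged prefill batch into padded tensors so the
# provided attention_mask can be applied, then strip the padding again below.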
lengths = cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]
max_length = max(lengths)
attention_mask = attention_mask[:, :, :, :max_length]
enable_gqa = query_states.shape[1] != key_states.shape[1]
# Split tensors using vectorized split
query_list = torch.split(query_states, lengths.tolist(), dim=0)
key_list = torch.split(key_states, lengths.tolist(), dim=0)
value_list = torch.split(value_states, lengths.tolist(), dim=0)
padded_query = torch.nn.utils.rnn.pad_sequence(query_list, batch_first=True)
padded_key = torch.nn.utils.rnn.pad_sequence(key_list, batch_first=True)
padded_value = torch.nn.utils.rnn.pad_sequence(value_list, batch_first=True)
padded_query = padded_query.transpose(1, 2).contiguous()
padded_key = padded_key.transpose(1, 2).contiguous()
padded_value = padded_value.transpose(1, 2).contiguous()
# Compute attention
attn_output = F.scaled_dot_product_attention(
padded_query,
padded_key,
padded_value,
attn_mask=attention_mask,
scale=softmax_scale,
enable_gqa=enable_gqa,
)
attn_output = attn_output.transpose(
1, 2
) # [batch_size, seq_len, num_heads, head_dim]
max_seq_len = padded_query.size(2)
seq_range = torch.arange(max_seq_len, device=padded_query.device).unsqueeze(
0
)
lengths_tensor = torch.tensor(
lengths, device=padded_query.device
).unsqueeze(1)
mask = seq_range < lengths_tensor # [batch, max_seq_len]
attn_output = attn_output[mask] # [total_seq_len, num_heads, head_dim]
else:
attn_output = paged_attention(
query_states,
kv_cache,
kv_head_mapping,
softmax_scale,
block_tables,
seqlen,
max_s,
kv_scales=kv_scales,
softcap=softcap,
window_size_left=sliding_window,
)
attn_output = attn_output.view(-1, num_heads * head_dim)
return attn_output, None
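# Register the custom kernel under the "tgi" key so models instantiated with
# attn_implementation="tgi" route their attention calls through it.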
transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS["tgi"] = tgi_flash_attention_forward
# TODO: implement
# tgi_cross_attention_forward
class TransformersFlashVlmCausalLM(VlmCausalLM):
def __init__(
self,
model_id: str,
model_class,
revision: Optional[str] = None,
quantize: Optional[str] = None,
speculator: Optional[str] = None,
dtype: Optional[torch.dtype] = None,
default_dtype=torch.float16,
trust_remote_code: bool = False,
tokenizer_class=AutoTokenizer,
processor_class=AutoProcessor,
processor_kwargs=None,
kv_cache_dtype: Optional[torch.dtype] = None,
batch_class=VlmCausalLMBatch,
):
self.batch_class = batch_class
self.quantize = quantize
self.process_group, rank, world_size = initialize_torch_distributed()
self.dtype = dtype
if speculator:
raise RuntimeError("Speculator decoding is not enabled for AutoModel")
if torch.cuda.is_available():
device = torch.device(f"cuda:{rank}")
dtype = default_dtype if dtype is None else dtype
elif hasattr(torch, "xpu") and torch.xpu.is_available():
device = torch.device("xpu")
dtype = default_dtype if dtype is None else dtype
else:
raise ValueError(
"Flash `Transformers` modeling backend is not available on cpu."
)
tokenizer = tokenizer_class.from_pretrained(
model_id,
revision=revision,
padding_side="left",
truncation_side="left",
trust_remote_code=trust_remote_code,
)
if processor_kwargs is None:
processor_kwargs = {}
self.processor = processor_class.from_pretrained(
model_id,
revision=revision,
trust_remote_code=trust_remote_code,
**processor_kwargs,
)
attn_implementation = {
"text_config": "tgi",
"vision_config": "sdpa",
}
model = model_class.from_pretrained(
model_id,
revision=revision,
torch_dtype=dtype,
load_in_8bit=quantize == "bitsandbytes",
trust_remote_code=trust_remote_code,
attn_implementation=attn_implementation,
device_map=device if world_size == 1 else None,
tp_plan="auto" if world_size > 1 else None,
)
torch.distributed.barrier(group=self.process_group)
self.config = model.config
config = model.config
# VLM models define the config we care about in their text_config
text_config = getattr(model.config, "text_config", None)
if text_config is not None:
config = text_config
if tokenizer.pad_token_id is None:
if model.config.pad_token_id is not None:
tokenizer.pad_token_id = model.config.pad_token_id
elif model.config.eos_token_id is not None and isinstance(
model.config.eos_token_id, int
):
tokenizer.pad_token_id = model.config.eos_token_id
elif tokenizer.eos_token_id is not None:
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
self.num_layers = config.num_hidden_layers
self.num_heads = config.num_attention_heads
self.num_kv_heads = config.num_key_value_heads
# Some models (e.g. GQA models where o_proj and q_proj sizes differ)
# define an explicit head_dim; prefer it over hidden_size // num_heads.
if hasattr(config, "head_dim"):
self.head_size = config.head_dim
else:
self.head_size = config.hidden_size // config.num_attention_heads
# Skip it for models in the exception list
if config.model_type not in REPLICATED_ATTENTION_MODELS:
self.num_heads = self.num_heads // self.process_group.size()
self.num_kv_heads = (
self.num_kv_heads // self.process_group.size()
if self.num_kv_heads > 1
else self.num_kv_heads
)
self.cuda_graphs = {}
self.kv_cache = []
self.kv_cache_dtype = dtype if kv_cache_dtype is None else kv_cache_dtype
if ATTENTION == "flashinfer":
from text_generation_server.layers.attention.flashinfer import (
create_prefill_state,
create_decode_state,
create_prefill_with_paged_kv_state,
)
self.prefill_state = create_prefill_state(device=device)
self.prefill_with_paged_kv_state = create_prefill_with_paged_kv_state(
device=device
)
self.decode_state = create_decode_state(
device=device,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
)
self.num_groups = self.num_heads // self.num_kv_heads
# Those will never change and will be used in the forwards
self.kv_head_mapping = torch.arange(
0, self.num_kv_heads, dtype=torch.int32, device=device
).repeat_interleave(self.num_groups)
# This means no scale
self.kv_scales = KVScales(
torch.tensor(1.0, device=device),
torch.tensor(1.0, device=device),
)
# Skip FlashCausalLM init.
super(FlashCausalLM, self).__init__(
model_id=model_id,
model=model,
tokenizer=tokenizer,
requires_padding=False,
dtype=dtype,
device=device,
rank=rank,
world_size=world_size,
)
# Monkey patch of `self.model.forward` to match `FlashCausalLM`. It avoids duplicating a lot of code
# We first copy the original model.forward because we still need it in the monkey patch
self.model.original_forward = self.model.forward
self.model.forward = self._model_forward
self.model.get_position_ids = self.get_position_ids
torch.distributed.barrier(group=self.process_group)
def get_position_ids(self, input_ids, image_grid_thw, position_ids):
return position_ids
def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
return {
"input_ids": input_ids.unsqueeze(0),
"position_ids": position_ids.unsqueeze(0),
}
def post_process_outputs(self, logits, lm_head_indices):
return logits.squeeze(dim=0)
@classmethod
def fallback(
cls,
model_id: str,
model_class,
revision: Optional[str] = None,
quantize: Optional[str] = None,
speculator: Optional[str] = None,
dtype: Optional[torch.dtype] = None,
trust_remote_code: bool = False,
batch_class: Optional[type] = VlmCausalLMBatch,
processor_kwargs: Optional[dict] = None,
):
return cls(
model_id=model_id,
model_class=model_class,
revision=revision,
quantize=quantize,
speculator=speculator,
dtype=dtype,
trust_remote_code=trust_remote_code,
batch_class=batch_class,
processor_kwargs=processor_kwargs,
)
def _model_forward(
self,
input_ids: torch.Tensor,
position_ids: torch.Tensor,
cu_seqlen_prefill: Optional[torch.Tensor],
kv_cache: List[KVCache],
block_tables: torch.Tensor,
slots: torch.Tensor,
seqlen: Seqlen,
max_s: int,
lm_head_indices: Optional[torch.Tensor],
prefill_cache_indices=None, # not used, but passed to match original signature
adapter_data=None, # not supported, but passed to match original signature
pixel_values: torch.FloatTensor = None,
image_grid_thw: Optional[torch.LongTensor] = None,
pixel_attention_mask=None,
image_sizes: Optional[torch.LongTensor] = None,
):
# A value of `None` (i.e. no logit slicing) translates to `0` in Transformers
logits_to_keep = lm_head_indices if lm_head_indices is not None else 0
inputs = self.pre_process_inputs(
input_ids=input_ids,
position_ids=position_ids,
cu_seqlen_prefill=cu_seqlen_prefill,
)
# This is equivalent to `self.model.forward`, see the monkey patch in __init__
logits = self.model.original_forward(
input_ids=inputs["input_ids"],
position_ids=inputs["position_ids"],
past_key_values=None, # we use self.kv_cache instead of transformers cache object
use_cache=False, # we use self.kv_cache instead of transformers cache object
logits_to_keep=logits_to_keep,
return_dict=True,
cu_seqlen_prefill=cu_seqlen_prefill,
kv_cache=kv_cache,
block_tables=block_tables,
slots=slots,
seqlen=seqlen,
max_s=max_s,
kv_head_mapping=self.kv_head_mapping,
kv_scales=self.kv_scales,
pixel_values=pixel_values,
pixel_attention_mask=pixel_attention_mask,
image_sizes=image_sizes,
image_grid_thw=image_grid_thw,
attention_mask=inputs.get("attention_mask", None),
use_sdpa=inputs.get("use_sdpa", False),
cache_position=inputs.get("cache_position", None),
).logits
logits = self.post_process_outputs(logits, lm_head_indices)
return logits, None
class TransformersQwen2VlmCausalLM(TransformersFlashVlmCausalLM):
def get_position_ids(self, input_ids: torch.Tensor, image_grid_thw: torch.Tensor):
if image_grid_thw is None:
return (
torch.arange(input_ids.shape[0], device=input_ids.device)
.unsqueeze(1)
.repeat(1, 3)
)
spatial_merge_size = self.config.vision_config.spatial_merge_size
vision_start_token_id = self.config.vision_start_token_id
vision_end_token_id = self.config.vision_end_token_id
device = input_ids.device
dtype = input_ids.dtype
input_ids_len = input_ids.shape[0]
vision_starts = torch.where(input_ids == vision_start_token_id)[0]
vision_ends = torch.where(input_ids == vision_end_token_id)[0]
vision_segments = torch.stack((vision_starts, vision_ends), dim=1)
prev_vision_end = torch.cat(
[torch.zeros(1, device=vision_ends.device, dtype=dtype), vision_ends[:-1]]
)
text_lengths_between_vision = vision_segments[:, 0] - prev_vision_end + 1
vision_widths_max = torch.cat(
[
torch.zeros(1, device=image_grid_thw.device, dtype=dtype),
image_grid_thw[:-1, 2] // spatial_merge_size,
]
)
vision_segment_lengths = vision_widths_max + text_lengths_between_vision
vision_segment_lengths = vision_segment_lengths.cumsum(dim=0)
text_segment_lengths = vision_segment_lengths - text_lengths_between_vision
# create position ids for each vision segment based on the image grid
llm_pos_ids_list = []
for i, _ in enumerate(vision_segments):
t, h, w = (
image_grid_thw[i][0],
image_grid_thw[i][1] // spatial_merge_size,
image_grid_thw[i][2] // spatial_merge_size,
)
t_indices = torch.arange(t, device=device).repeat_interleave(h * w)
h_indices = torch.arange(h, device=device).repeat_interleave(w).repeat(t)
w_indices = torch.arange(w, device=device).repeat(t * h)
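# e.g. a merged grid of t=1, h=2, w=2 gives t_indices=[0,0,0,0],
# h_indices=[0,0,1,1], w_indices=[0,1,0,1]: one (t, h, w) coordinate
# per image token.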
image_position_ids = torch.stack([t_indices, h_indices, w_indices], dim=0)
# offset by the position of the last vision segment
im = image_position_ids + vision_segment_lengths[i]
llm_pos_ids_list.append(im)
# create position ids for each text segment
text_ranges = [
torch.arange(seq_len, device=device).view(1, -1).expand(3, -1)
+ text_segment_lengths[i]
for i, seq_len in enumerate(text_lengths_between_vision)
]
full_llm_pos_ids_list = [
item for sublist in zip(text_ranges, llm_pos_ids_list) for item in sublist
]
max_s = full_llm_pos_ids_list[-1].max() + 1
final_text_len = input_ids_len - vision_ends[-1]
if final_text_len > 0:
m = torch.arange(final_text_len, device=device).view(1, -1).expand(3, -1)
full_llm_pos_ids_list.append(m + max_s)
position_ids = (
torch.cat(full_llm_pos_ids_list, dim=1).reshape(3, -1).transpose(0, 1)
)
return position_ids
def post_process_outputs(self, logits, lm_head_indices):
return logits.squeeze(dim=0)[lm_head_indices].unsqueeze(0)
def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
input_ids = input_ids.unsqueeze(0)
position_ids = position_ids.transpose(0, 1).unsqueeze(1)
return {"input_ids": input_ids, "position_ids": position_ids}
class TransformersGemma3VlmCausalLM(TransformersFlashVlmCausalLM):
def get_attention_mask(self, input_ids, cu_seqlen_prefill):
device = input_ids.device
dtype = self.dtype
min_dtype = torch.finfo(dtype).min
lengths = (cu_seqlen_prefill[1:] - cu_seqlen_prefill[:-1]).tolist()
batch_size = len(lengths)
sequence_length = max(lengths)
target_length = sequence_length
# Create the padding mask from the computed lengths.
# pad_mask: [batch, sequence_length] where True indicates valid tokens.
seq_range = torch.arange(sequence_length, device=device).unsqueeze(0)
lengths_tensor = torch.tensor(lengths, device=device).unsqueeze(1)
pad_mask = seq_range < lengths_tensor # shape: [batch, sequence_length]
# Build the base causal mask (for non-image tokens):
causal_mask = torch.tril(
torch.ones(
(sequence_length, sequence_length), dtype=torch.bool, device=device
)
)
base_mask = pad_mask.unsqueeze(2) & pad_mask.unsqueeze(
1
) # [batch, sequence_length, sequence_length]
base_mask = base_mask & causal_mask.unsqueeze(0) # apply causal constraint
image_token_mask = (input_ids == self.config.image_token_index).to(
input_ids.device
)
image_token_mask = torch.nn.utils.rnn.pad_sequence(
torch.split(image_token_mask, lengths), batch_first=True, padding_value=0
)
bidirectional_mask = image_token_mask.unsqueeze(2) & image_token_mask.unsqueeze(
1
)
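# Image tokens may attend to every other image token in the same sequence, even
# "future" ones: e.g. with image tokens at positions 1 and 2, position 1 can
# attend to position 2 despite the causal constraint.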
# Combine the causal base mask and the bidirectional mask.
combined_mask = torch.logical_or(
base_mask.unsqueeze(1), bidirectional_mask.unsqueeze(1)
).to(device)
# combined_mask now has shape [batch, 1, sequence_length, sequence_length]
full_attention_mask = torch.zeros(
(batch_size, 1, sequence_length, target_length),
device=device,
dtype=torch.bool,
)
full_attention_mask[:, :, :, :sequence_length] = combined_mask
final_attention_mask = torch.where(full_attention_mask, 0, min_dtype).to(device)
return final_attention_mask
def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
inputs = {
"input_ids": input_ids.unsqueeze(0),
"position_ids": position_ids.unsqueeze(0),
}
if cu_seqlen_prefill is not None:
attention_mask = self.get_attention_mask(
input_ids.squeeze(0), cu_seqlen_prefill
)
inputs["attention_mask"] = attention_mask
inputs["use_sdpa"] = True
return inputs
class TransformersLlama4VlmCausalLM(TransformersFlashVlmCausalLM):
def pre_process_inputs(self, input_ids, position_ids, cu_seqlen_prefill):
inputs = super().pre_process_inputs(input_ids, position_ids, cu_seqlen_prefill)
inputs["cache_position"] = position_ids
inputs["attention_mask"] = torch.zeros((1, 1, 1, 1), device=input_ids.device)
return inputs
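# Note: the (1, 1, 1, 1) zero attention_mask above is additive (0 = keep), and
# presumably only satisfies the transformers Llama4 forward signature; the
# actual masking happens inside the paged attention kernels.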

View File

@ -29,6 +29,33 @@ IDEFICS3_FAKE_IMAGE_TOKEN = "<fake_token_around_image>"
IDEFICS3_GLOBAL_IMG_TOKEN = "<global-img>"
def prompt_split_image_llama4(aspect_ratio, num_patches_per_chunk):
"""
Create a structured string representation of image tokens
Args:
aspect_ratio: (ratio_h, ratio_w) tile grid of the image
num_patches_per_chunk: Number of patches in each image chunk
Returns:
String with appropriate image tokens
"""
img_string = "<|image_start|>"
ratio_h, ratio_w = aspect_ratio
if ratio_h * ratio_w > 1:
for yy in range(ratio_h):
for xx in range(ratio_w):
img_string += "<|patch|>" * num_patches_per_chunk
if xx < ratio_w - 1:
img_string += "<|tile_x_separator|>"
img_string += "<|tile_y_separator|>"
img_string += "<|image|>"
img_string += "<|patch|>" * num_patches_per_chunk
img_string += "<|image_end|>"
return img_string
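# For example, prompt_split_image_llama4((1, 2), 2) yields
# "<|image_start|><|patch|><|patch|><|tile_x_separator|><|patch|><|patch|>
#  <|tile_y_separator|><|image|><|patch|><|patch|><|image_end|>"
# (shown wrapped here; the actual string has no line break).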
# copied from: https://github.com/huggingface/transformers/blob/02ed609285c2448b3b54c31e362f2c389fa952ab/src/transformers/models/idefics3/processing_idefics3.py#L44-L60
def _prompt_split_image(
*,
@ -134,6 +161,23 @@ def image_text_replacement(processor, image_input, config, image_id: int) -> str
num_pads = 256
padding = "<image_soft_token>" * num_pads
return f"\n\n<start_of_image>{padding}<end_of_image>\n\n"
elif config.model_type == "llama4":
patch_size = config.vision_config.patch_size
pixel_shuffle_ratio = config.vision_config.pixel_shuffle_ratio
downsample_ratio = int(round(1.0 / (pixel_shuffle_ratio**2)))
aspect_ratios = image_input["aspect_ratios"][image_id]
image_height, image_width = image_input["pixel_values"][image_id].shape[-2:]
num_patches_per_chunk = int(
(image_height // patch_size)
* (image_width // patch_size)
// downsample_ratio
)
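# Illustrative numbers: with patch_size=14, pixel_shuffle_ratio=0.5
# (so downsample_ratio=4) and a 336x336 chunk,
# num_patches_per_chunk = 24 * 24 // 4 = 144.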
tokens_for_this_image = prompt_split_image_llama4(
aspect_ratios, num_patches_per_chunk
)
return tokens_for_this_image
else:
raise RuntimeError(f"Unknown config {config.model_type} for multimodal")
@ -252,6 +296,8 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
images.append(image)
elif config.model_type == "gemma3":
images.append(image)
elif config.model_type == "llama4":
images.append(image)
else:
images.append([image])
else:
@ -285,7 +331,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
processor, image_inputs, config, image_id
)
image_id += 1
full_text = image_text_replacement_fixup(config, full_text)
input_ids = tokenizer(
full_text,
@ -372,9 +418,6 @@ class VlmCausalLM(FlashCausalLM):
def batch_type(self) -> Type[VlmCausalLMBatch]:
return self.batch_class
def max_past(self) -> Optional[int]:
return getattr(self.model.text_model, "max_past", None)
def forward(
self,
batch: VlmCausalLMBatch,
@ -442,12 +485,6 @@ class VlmCausalLM(FlashCausalLM):
)
batch.position_ids = position_ids
if cu_seqlen_prefill is None and self.max_past() is not None:
# In decode, not prefill, we're actually overwriting the KV-cache
# in a circular buffer mode.
# This makes sure the max_s for the decode pass is correct.
max_s = min(self.max_past(), max_s)
# Try to find an associated cuda graph
bs = input_ids.shape[0]
sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])

View File

@ -2,10 +2,14 @@ version = 1
revision = 1
requires-python = ">=3.9"
resolution-markers = [
"python_full_version >= '3.12'",
"python_full_version == '3.11.*'",
"python_full_version == '3.10.*'",
"python_full_version < '3.10'",
"(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')",
"python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')",
"python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')",
"python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')",
"python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'",
]
[[package]]
@ -20,7 +24,8 @@ dependencies = [
{ name = "psutil" },
{ name = "pyyaml" },
{ name = "safetensors" },
{ name = "torch" },
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/85/15/0fab0260ab4069e5224e637d2e400538bb27b0dfc36f17daf68db9770d78/accelerate-1.3.0.tar.gz", hash = "sha256:518631c0adb80bd3d42fb29e7e2dc2256bcd7c786b0ba9119bbaa08611b36d9c", size = 342758 }
wheels = [
@ -189,7 +194,8 @@ source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
{ name = "torch" },
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/db/9d/9382259196d7ad7f3550702390081224e673a705e75b5660ee377b592fc0/bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:ba3a720187f518b172ebce4081049c682ae3fd8284947e22499b256ff99a2bc3", size = 69680042 },
@ -315,7 +321,8 @@ version = "0.9.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pydantic" },
{ name = "torch" },
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "transformers" },
]
sdist = { url = "https://files.pythonhosted.org/packages/40/e0/d9529aae2d2425d214e5a50497df4532d3f9e21c8d2023037c701f8a37d3/compressed-tensors-0.9.1.tar.gz", hash = "sha256:3cf5cd637f0186c184dd5bbbbf941356b1225199b49c6a45bf0909d65907f686", size = 63060 }
@ -707,9 +714,24 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a1/14/f1e15b851d1c2af5b0b1a82bf8eb10bda2da62d98180220ba6fd8879bb5b/hf_transfer-0.1.9-cp38-abi3-win_amd64.whl", hash = "sha256:16f208fc678911c37e11aa7b586bc66a37d02e636208f18b6bc53d29b5df40ad", size = 1160240 },
]
[[package]]
name = "hf-xet"
version = "1.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/64/46/db229dddc55121478105940b610fef1b466c414da02be9d4daa5602a2527/hf_xet-1.0.0.tar.gz", hash = "sha256:5e0ca891ce599fd753e7ffbdc182207d952a93e6252eeb92118475d6866bb093", size = 257192 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e7/0a/c16f8766fa3cd520292b1a765e9b50b8390bce4c2ed7657db9534551f5ed/hf_xet-1.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6106304f92bbce7c9b8509f6f735f2e8ce95e4dc32af8876e874c48b15ca1903", size = 5001841 },
{ url = "https://files.pythonhosted.org/packages/e3/9f/cca55edd85d03fc98c743bcc093965740a7440e909779c558039d6838f03/hf_xet-1.0.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:4d0bc7a3e6c1d21fcbb48e8726e3b19a2460e95971375e55e9a5f73ec7079a86", size = 4805318 },
{ url = "https://files.pythonhosted.org/packages/d1/0b/28bda7ac9d699dcfb96f628aa135ddca3f0f77e9716351aab2b83966f957/hf_xet-1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23dee64f114ea9a272ff71a6a755e025d7a075a6a0dbf6da0990fe9211831ccf", size = 53504907 },
{ url = "https://files.pythonhosted.org/packages/cb/04/ef1f7249a813841d193cbab2ef4d1d7d67c66c61d21d45223a72fdc5c88e/hf_xet-1.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d5f160550508ed87783d1eca4288602a713d1c45ec517d937acb9d93120f0cab", size = 52410434 },
{ url = "https://files.pythonhosted.org/packages/81/b3/e7abec2619ecd9d1c743adfe79fa69cf84530f530969daf3dc804efef65b/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:5ebd79db87df0b9d3607e7c9a6bb0662c10e36992f522f66b1d2a7fe93f53f27", size = 53465113 },
{ url = "https://files.pythonhosted.org/packages/df/82/b51f3b6e5c6f33e91220c37b17760229704c58e79ab0fcfd0fd3b55803d3/hf_xet-1.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8e6d2625971b4affad634835db82d5392f38de874205a9573e0dd3f0f9cb136f", size = 53461632 },
{ url = "https://files.pythonhosted.org/packages/95/d2/32defba26d995f7acdc4fe3e5911473b25aff5b75c5a2532786435a709e8/hf_xet-1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:b446964bd75eb7f6b4d983c47241b2023eadfad1f56137ed00e1ca6fc278faec", size = 4121808 },
]
[[package]]
name = "huggingface-hub"
version = "0.29.1"
version = "0.30.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "filelock" },
@ -720,9 +742,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/22/37/797d6476f13e5ef6af5fc48a5d641d32b39c37e166ccf40c3714c5854a85/huggingface_hub-0.29.1.tar.gz", hash = "sha256:9524eae42077b8ff4fc459ceb7a514eca1c1232b775276b009709fe2a084f250", size = 389776 }
sdist = { url = "https://files.pythonhosted.org/packages/78/be/049689a7197630e75c4bb53021cb209a56617c9bf39b3a0950650d1f96e1/huggingface_hub-0.30.1.tar.gz", hash = "sha256:f379e8b8d0791295602538856638460ae3cf679c7f304201eb80fb98c771950e", size = 400784 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ae/05/75b90de9093de0aadafc868bb2fa7c57651fd8f45384adf39bd77f63980d/huggingface_hub-0.29.1-py3-none-any.whl", hash = "sha256:352f69caf16566c7b6de84b54a822f6238e17ddd8ae3da4f8f2272aea5b198d5", size = 468049 },
{ url = "https://files.pythonhosted.org/packages/99/e3/2232d0e726d4d6ea69643b9593d97d0e7e6ea69c2fe9ed5de34d476c1c47/huggingface_hub-0.30.1-py3-none-any.whl", hash = "sha256:0f6aa5ec5a4e68e5b9e45d556b4e5ea180c58f5a5ffa734e7f38c9d573028959", size = 481170 },
]
[[package]]
@ -811,7 +833,8 @@ dependencies = [
{ name = "huggingface-hub" },
{ name = "packaging" },
{ name = "tomli", marker = "python_full_version < '3.11'" },
{ name = "torch" },
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/26/99/41af9dce502bb1682977fee1bc487a73fa8418cebbce16b8d27733947375/kernels-0.2.1.tar.gz", hash = "sha256:918942332819b28377b9d07070daddecfd8a5e7bab574dd3dc64a209ca6008b2", size = 9395 }
@ -1069,7 +1092,8 @@ name = "networkx"
version = "3.2.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.10'",
"(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')",
"python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'",
]
sdist = { url = "https://files.pythonhosted.org/packages/c4/80/a84676339aaae2f1cfdf9f418701dd634aef9cc76f708ef55c36ff39c3ca/networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", size = 2073928 }
wheels = [
@ -1081,9 +1105,12 @@ name = "networkx"
version = "3.4.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.12'",
"python_full_version == '3.11.*'",
"python_full_version == '3.10.*'",
"(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')",
"python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')",
"python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')",
"python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'",
]
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368 }
wheels = [
@ -1095,7 +1122,8 @@ name = "numpy"
version = "2.0.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.10'",
"(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')",
"python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'",
]
sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015 }
wheels = [
@ -1150,9 +1178,12 @@ name = "numpy"
version = "2.2.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.12'",
"python_full_version == '3.11.*'",
"python_full_version == '3.10.*'",
"(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')",
"python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')",
"python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')",
"python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'",
]
sdist = { url = "https://files.pythonhosted.org/packages/ec/d0/c12ddfd3a02274be06ffc71f3efc6d0e457b0409c4481596881e748cb264/numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f", size = 20233295 }
wheels = [
@ -1249,7 +1280,7 @@ name = "nvidia-cudnn-cu12"
version = "9.1.0.70"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nvidia-cublas-cu12" },
{ name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 },
@ -1260,7 +1291,7 @@ name = "nvidia-cufft-cu12"
version = "11.2.1.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nvidia-nvjitlink-cu12" },
{ name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 },
@ -1279,9 +1310,9 @@ name = "nvidia-cusolver-cu12"
version = "11.6.1.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nvidia-cublas-cu12" },
{ name = "nvidia-cusparse-cu12" },
{ name = "nvidia-nvjitlink-cu12" },
{ name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 },
@ -1292,7 +1323,7 @@ name = "nvidia-cusparse-cu12"
version = "12.3.1.170"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nvidia-nvjitlink-cu12" },
{ name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 },
@ -1494,7 +1525,8 @@ dependencies = [
{ name = "pydantic" },
{ name = "referencing" },
{ name = "requests" },
{ name = "torch" },
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "tqdm" },
{ name = "typing-extensions" },
]
@ -1617,7 +1649,8 @@ dependencies = [
{ name = "psutil" },
{ name = "pyyaml" },
{ name = "safetensors" },
{ name = "torch" },
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "tqdm" },
{ name = "transformers" },
]
@ -2385,7 +2418,8 @@ name = "scipy"
version = "1.13.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version < '3.10'",
"(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')",
"python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'",
]
dependencies = [
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
@ -2423,9 +2457,12 @@ name = "scipy"
version = "1.15.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.12'",
"python_full_version == '3.11.*'",
"python_full_version == '3.10.*'",
"(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')",
"python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')",
"python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')",
"python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'",
]
dependencies = [
{ name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
@ -2563,6 +2600,7 @@ dependencies = [
{ name = "grpcio-reflection" },
{ name = "grpcio-status" },
{ name = "hf-transfer" },
{ name = "hf-xet" },
{ name = "huggingface-hub" },
{ name = "kernels" },
{ name = "loguru" },
@ -2613,6 +2651,12 @@ quantize = [
{ name = "datasets" },
{ name = "texttable" },
]
torch = [
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "torchvision", version = "0.21.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torchvision", version = "0.21.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
[package.metadata]
requires-dist = [
@ -2628,7 +2672,8 @@ requires-dist = [
{ name = "grpcio-tools", marker = "extra == 'dev'", specifier = ">=1.51.1,<2.0" },
{ name = "grpcio-tools", marker = "extra == 'gen'", specifier = ">=1.69.0" },
{ name = "hf-transfer", specifier = ">=0.1.8" },
{ name = "huggingface-hub", specifier = ">=0.29.0" },
{ name = "hf-xet", specifier = ">=1.0.0" },
{ name = "huggingface-hub", specifier = ">=0.30.1" },
{ name = "kernels", specifier = ">=0.2.1" },
{ name = "loguru", specifier = ">=0.7.3" },
{ name = "mypy-protobuf", marker = "extra == 'gen'", specifier = ">=3.6.0" },
@ -2649,10 +2694,14 @@ requires-dist = [
{ name = "sentencepiece", specifier = ">=0.2.0" },
{ name = "texttable", marker = "extra == 'quantize'", specifier = ">=1.6.7,<2" },
{ name = "tokenizers", specifier = ">=0.20.3" },
{ name = "transformers", specifier = ">=4.49.0" },
{ name = "torch", marker = "(sys_platform == 'linux' and extra == 'torch') or (sys_platform == 'win32' and extra == 'torch')", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu124" },
{ name = "torch", marker = "sys_platform != 'linux' and sys_platform != 'win32' and extra == 'torch'", specifier = "==2.6.0" },
{ name = "torchvision", marker = "(sys_platform == 'linux' and extra == 'torch') or (sys_platform == 'win32' and extra == 'torch')", specifier = "==0.21.0", index = "https://download.pytorch.org/whl/cu124" },
{ name = "torchvision", marker = "sys_platform != 'linux' and sys_platform != 'win32' and extra == 'torch'", specifier = "==0.21.0" },
{ name = "transformers", specifier = ">=4.51.0" },
{ name = "typer", specifier = ">=0.15.1" },
]
provides-extras = ["accelerate", "bnb", "compressed-tensors", "peft", "outlines", "dev", "quantize", "gen"]
provides-extras = ["accelerate", "bnb", "compressed-tensors", "peft", "outlines", "dev", "quantize", "gen", "torch"]
[[package]]
name = "texttable"
@ -2731,12 +2780,46 @@ wheels = [
name = "torch"
version = "2.6.0"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'",
"python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'",
]
dependencies = [
{ name = "filelock" },
{ name = "fsspec" },
{ name = "jinja2" },
{ name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
{ name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
{ name = "filelock", marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "fsspec", marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "jinja2", marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "setuptools", marker = "python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "sympy", marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "typing-extensions", marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/16/ea1b7842413a7b8a5aaa5e99e8eaf3da3183cc3ab345ad025a07ff636301/torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628", size = 66520221 },
{ url = "https://files.pythonhosted.org/packages/0b/fa/f33a4148c6fb46ca2a3f8de39c24d473822d5774d652b66ed9b1214da5f7/torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21", size = 66530713 },
{ url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538 },
{ url = "https://files.pythonhosted.org/packages/88/8b/d60c0491ab63634763be1537ad488694d316ddc4a20eaadd639cedc53971/torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2", size = 66536783 },
{ url = "https://files.pythonhosted.org/packages/b3/17/41f681b87290a1d2f1394f943e470f8b0b3c2987b7df8dc078d8831fce5b/torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c", size = 66520446 },
]
[[package]]
name = "torch"
version = "2.6.0+cu124"
source = { registry = "https://download.pytorch.org/whl/cu124" }
resolution-markers = [
"(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')",
"(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')",
"(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')",
"(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')",
]
dependencies = [
{ name = "filelock", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "fsspec", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "jinja2", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')" },
{ name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and sys_platform == 'linux') or (python_full_version >= '3.10' and sys_platform == 'win32')" },
{ name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
@ -2750,32 +2833,76 @@ dependencies = [
{ name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ name = "setuptools", marker = "python_full_version >= '3.12'" },
{ name = "sympy" },
{ name = "setuptools", marker = "(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')" },
{ name = "sympy", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ name = "typing-extensions" },
{ name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/37/81/aa9ab58ec10264c1abe62c8b73f5086c3c558885d6beecebf699f0dbeaeb/torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961", size = 766685561 },
{ url = "https://files.pythonhosted.org/packages/86/86/e661e229df2f5bfc6eab4c97deb1286d598bbeff31ab0cdb99b3c0d53c6f/torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab", size = 95751887 },
{ url = "https://files.pythonhosted.org/packages/20/e0/5cb2f8493571f0a5a7273cd7078f191ac252a402b5fb9cb6091f14879109/torch-2.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:56eeaf2ecac90da5d9e35f7f35eb286da82673ec3c582e310a8d1631a1c02341", size = 204165139 },
{ url = "https://files.pythonhosted.org/packages/e5/16/ea1b7842413a7b8a5aaa5e99e8eaf3da3183cc3ab345ad025a07ff636301/torch-2.6.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:09e06f9949e1a0518c5b09fe95295bc9661f219d9ecb6f9893e5123e10696628", size = 66520221 },
{ url = "https://files.pythonhosted.org/packages/78/a9/97cbbc97002fff0de394a2da2cdfa859481fdca36996d7bd845d50aa9d8d/torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1", size = 766715424 },
{ url = "https://files.pythonhosted.org/packages/6d/fa/134ce8f8a7ea07f09588c9cc2cea0d69249efab977707cf67669431dcf5c/torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d", size = 95759416 },
{ url = "https://files.pythonhosted.org/packages/11/c5/2370d96b31eb1841c3a0883a492c15278a6718ccad61bb6a649c80d1d9eb/torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7", size = 204164970 },
{ url = "https://files.pythonhosted.org/packages/0b/fa/f33a4148c6fb46ca2a3f8de39c24d473822d5774d652b66ed9b1214da5f7/torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21", size = 66530713 },
{ url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563 },
{ url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867 },
{ url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469 },
{ url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538 },
{ url = "https://files.pythonhosted.org/packages/24/85/ead1349fc30fe5a32cadd947c91bda4a62fbfd7f8c34ee61f6398d38fb48/torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf", size = 766626191 },
{ url = "https://files.pythonhosted.org/packages/dd/b0/26f06f9428b250d856f6d512413e9e800b78625f63801cbba13957432036/torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b", size = 95611439 },
{ url = "https://files.pythonhosted.org/packages/c2/9c/fc5224e9770c83faed3a087112d73147cd7c7bfb7557dcf9ad87e1dda163/torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc", size = 204126475 },
{ url = "https://files.pythonhosted.org/packages/88/8b/d60c0491ab63634763be1537ad488694d316ddc4a20eaadd639cedc53971/torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2", size = 66536783 },
{ url = "https://files.pythonhosted.org/packages/40/bb/feb5644baa621fd8e1e88bf51f6fa38ab3f985d472a764144ff4867ac1d6/torch-2.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:9ea955317cfcd3852b1402b62af258ce735c2edeee42ca9419b6bc889e5ae053", size = 766680961 },
{ url = "https://files.pythonhosted.org/packages/ee/11/08774a8198a33263947c59e04b8a0bf85a61a44e82100c46cf833bbce35e/torch-2.6.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb2c6c3e65049f081940f5ab15c9136c7de40d3f01192541c920a07c7c585b7e", size = 95782656 },
{ url = "https://files.pythonhosted.org/packages/c1/0d/56fb07032accbfebb4555638b6002ec5678d0942da85497e40f9405ab756/torch-2.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:683410f97984103148e31b38a8631acf31c3034c020c0f4d26171e7626d8317a", size = 204061417 },
{ url = "https://files.pythonhosted.org/packages/b3/17/41f681b87290a1d2f1394f943e470f8b0b3c2987b7df8dc078d8831fce5b/torch-2.6.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:265f70de5fd45b864d924b64be1797f86e76c8e48a02c2a3a6fc7ec247d2226c", size = 66520446 },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl", hash = "sha256:7f2ba7f7c0459320a521696f6b5bccc187f59890b23c9dfb6c49b0b87c6bfc97" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-win_amd64.whl", hash = "sha256:7cc45c5b39d74875cfafe908b7f55c544147cc16b01e795feb2fe766583efe78" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl", hash = "sha256:d4c3e9a8d31a7c0fcbb9da17c31a1917e1fac26c566a4cfbd8c9568ad7cade79" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-win_amd64.whl", hash = "sha256:6a1fb2714e9323f11edb6e8abf7aad5f79e45ad25c081cde87681a18d99c29eb" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl", hash = "sha256:a393b506844035c0dac2f30ea8478c343b8e95a429f06f3b3cadfc7f53adb597" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-win_amd64.whl", hash = "sha256:3313061c1fec4c7310cf47944e84513dcd27b6173b72a349bb7ca68d0ee6e9c0" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp313-cp313-linux_x86_64.whl", hash = "sha256:0f3bc53c988ce9568cd876a2a5316761e84a8704135ec8068f5f81b4417979cb" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp313-cp313-win_amd64.whl", hash = "sha256:519330eef09534acad8110b6f423d2fe58c1d8e9ada999ed077a637a0021f908" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp313-cp313t-linux_x86_64.whl", hash = "sha256:35cba404c0d742406cdcba1609085874bc60facdfbc50e910c47a92405fef44c" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp39-cp39-linux_x86_64.whl", hash = "sha256:e661267cd0242462ab100bdd67f651988aa9f67eb31609d6909afcac891df612" },
{ url = "https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp39-cp39-win_amd64.whl", hash = "sha256:c2eb62b99161d87be486c88fd82441274cc892bce8c48dbc28c055cb147732ce" },
]
[[package]]
name = "torchvision"
version = "0.21.0"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.12' and sys_platform != 'linux' and sys_platform != 'win32'",
"python_full_version == '3.11.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"python_full_version == '3.10.*' and sys_platform != 'linux' and sys_platform != 'win32'",
"python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'",
]
dependencies = [
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' and sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "pillow", marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
{ name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'win32'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/8e/0d/143bd264876fad17c82096b6c2d433f1ac9b29cdc69ee45023096976ee3d/torchvision-0.21.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:044ea420b8c6c3162a234cada8e2025b9076fa82504758cd11ec5d0f8cd9fa37", size = 1784140 },
{ url = "https://files.pythonhosted.org/packages/29/88/00c69db213ee2443ada8886ec60789b227e06bb869d85ee324578221a7f7/torchvision-0.21.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:110d115333524d60e9e474d53c7d20f096dbd8a080232f88dddb90566f90064c", size = 1784141 },
{ url = "https://files.pythonhosted.org/packages/6e/1b/28f527b22d5e8800184d0bc847f801ae92c7573a8c15979d92b7091c0751/torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:97a5814a93c793aaf0179cfc7f916024f4b63218929aee977b645633d074a49f", size = 1784140 },
{ url = "https://files.pythonhosted.org/packages/f9/56/47d456b61c3bbce7bed4af3925c83d405bb87468e659fd3cf3d9840c3b51/torchvision-0.21.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:659b76c86757cb2ee4ca2db245e0740cfc3081fef46f0f1064d11adb4a8cee31", size = 1784141 },
{ url = "https://files.pythonhosted.org/packages/49/d5/d18c5d89cbe32015b033f1fa06918c7cdd5c0af0c03e55d72a3cc2d768f8/torchvision-0.21.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5c22caeaae8b3c36d93459f1a5294e6f43306cff856ed243189a229331a404b4", size = 1784154 },
]
[[package]]
name = "torchvision"
version = "0.21.0+cu124"
source = { registry = "https://download.pytorch.org/whl/cu124" }
resolution-markers = [
"(python_full_version >= '3.12' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform == 'win32')",
"(python_full_version == '3.11.*' and sys_platform == 'linux') or (python_full_version == '3.11.*' and sys_platform == 'win32')",
"(python_full_version == '3.10.*' and sys_platform == 'linux') or (python_full_version == '3.10.*' and sys_platform == 'win32')",
"(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')",
]
dependencies = [
{ name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and sys_platform == 'linux') or (python_full_version < '3.10' and sys_platform == 'win32')" },
{ name = "numpy", version = "2.2.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and sys_platform == 'linux') or (python_full_version >= '3.10' and sys_platform == 'win32')" },
{ name = "pillow", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
{ name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
]
wheels = [
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp310-cp310-linux_x86_64.whl", hash = "sha256:3d3e74018eaa7837c73e3764dad3b7792b7544401c25a42977e9744303731bd3" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp310-cp310-win_amd64.whl", hash = "sha256:0c6aefb70ab2b312065240c804e459ac7b0e449867afd469b38d2fd47f9391a7" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-linux_x86_64.whl", hash = "sha256:137376805aca5ba57bd2c7a3ecb8569df961dbe82b128aac9b3b0a7125ef9385" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-win_amd64.whl", hash = "sha256:000a013584ad2304ab30496318145f284ac364622addb5ee3a5abd2769ba146f" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp312-cp312-linux_x86_64.whl", hash = "sha256:efb53ea0af7bf09b7b53e2a18b9be6d245f7d46a90b51d5cf97f37e9b929a991" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp312-cp312-win_amd64.whl", hash = "sha256:ec63c2ee792757492da40590e34b14f2fceda29050558c215f0c1f3b08149c0f" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp313-cp313-linux_x86_64.whl", hash = "sha256:4b70acf3b4b96a0ceb1374116626c9bef9e8be016b57b1284e482260ca1896d6" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp313-cp313-win_amd64.whl", hash = "sha256:8fcf55321b206de70ff8e01c884fa42e57a60b1cb749341b96e0f22c8a7c9ec7" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp39-cp39-linux_x86_64.whl", hash = "sha256:6afb21a22f5497e08ea4dbd4544472330d8249bf09dafd239302552cad6906b2" },
{ url = "https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp39-cp39-win_amd64.whl", hash = "sha256:579b6a7fffc34a860c57a7131221ef125831f5961431f8da15760ab1ef752d44" },
]
[[package]]
@ -2792,7 +2919,7 @@ wheels = [
[[package]]
name = "transformers"
version = "4.49.0"
version = "4.51.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "filelock" },
@ -2807,9 +2934,9 @@ dependencies = [
{ name = "tokenizers" },
{ name = "tqdm" },
]
sdist = { url = "https://files.pythonhosted.org/packages/79/50/46573150944f46df8ec968eda854023165a84470b42f69f67c7d475dabc5/transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e", size = 8610952 }
sdist = { url = "https://files.pythonhosted.org/packages/38/75/6ebdae4d6f4574f47139a070445245537e43482d006f615af8e23d5bf05e/transformers-4.51.0.tar.gz", hash = "sha256:2d302563ff6c2cc2d0e88ef352cf059f9a21ce18102fd43662bb1246f70b8a84", size = 8925571 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/20/37/1f29af63e9c30156a3ed6ebc2754077016577c094f31de7b2631e5d379eb/transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03", size = 9970275 },
{ url = "https://files.pythonhosted.org/packages/6f/db/7ee15028d5130929aa0b1b85bab6d8bafe806254d3b5c56c42a0066cceb8/transformers-4.51.0-py3-none-any.whl", hash = "sha256:2e6baa476735ab8adccbaee6961525a0d1ce8c21d49293af30ef5ee4b082f64d", size = 10362017 },
]
[[package]]