Merge pull request #187 from yuanwu2017/v2.0.4

2025-09-12 12:54:52 +00:00 · 2024-08-12 23:59:03 +02:00 · 2024-08-12 23:59:03 +02:00 · c09f5bc930
commit c09f5bc930
parent d403575c43 a41e974c3b
151 changed files with 12972 additions and 3326 deletions
--- a/.gitignore
+++ b/.gitignore
@ -13,3 +13,4 @@ server/exllama_kernels/exllama_kernels/hip_buffers.cuh
 server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
 data/
 load_tests/*.json
--- a/Cargo.lock
+++ b/Cargo.lock
@ -78,9 +78,9 @@ dependencies = [
 [[package]]
 name = "anstyle-query"
-version = "1.1.0"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
+checksum = "a64c907d4e79225ac72e2a354c9ce84d50ebb4586dee56c82b3ee73004f537f5"
 dependencies = [
 "windows-sys 0.52.0",
 ]
@ -3552,7 +3552,7 @@ dependencies = [
 [[package]]
 name = "text-generation-benchmark"
-version = "2.0.1"
+version = "2.0.4"
 dependencies = [
 "average",
 "clap",
@ -3573,7 +3573,7 @@ dependencies = [
 [[package]]
 name = "text-generation-client"
-version = "2.0.1"
+version = "2.0.4"
 dependencies = [
 "futures",
 "grpc-metadata",
@ -3590,7 +3590,7 @@ dependencies = [
 [[package]]
 name = "text-generation-launcher"
-version = "2.0.1"
+version = "2.0.4"
 dependencies = [
 "clap",
 "ctrlc",
@ -3601,6 +3601,7 @@ dependencies = [
 "reqwest",
 "serde",
 "serde_json",
 "thiserror",
 "tracing",
 "tracing-subscriber",
 "vergen",
@ -3608,7 +3609,7 @@ dependencies = [
 [[package]]
 name = "text-generation-router"
-version = "2.0.1"
+version = "2.0.4"
 dependencies = [
 "async-stream",
 "axum",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -9,7 +9,7 @@ members = [
 resolver = "2"
 [workspace.package]
-version = "2.0.2"
+version = "2.0.4"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
--- a/3
+++ b/3
@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
 WORKDIR /usr/src
 FROM chef as planner
@ -50,6 +50,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
        ca-certificates \
        make \
        curl \
        git \
        && rm -rf /var/lib/apt/lists/*
 # Install server
--- a/76
+++ b/76
@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
 WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@ -36,7 +36,7 @@ COPY launcher launcher
 RUN cargo build --release
 # Text Generation Inference base image for RoCm
-FROM rocm/dev-ubuntu-22.04:5.7 as base
+FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update as base
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
@ -50,13 +50,24 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
    # Needed to build VLLM & flash.
    rocthrust-dev \
    hipsparse-dev \
-    hipblas-dev && \
+    hipblas-dev \
    hipblaslt-dev \
    rocblas-dev \
    hiprand-dev \
    rocrand-dev \
    miopen-hip-dev \
    hipfft-dev \
    hipcub-dev \
    hipsolver-dev \
    rccl-dev \
    cmake \
    python3-dev && \
    rm -rf /var/lib/apt/lists/*
 # Keep in sync with `server/pyproject.toml
 ARG MAMBA_VERSION=23.1.0-1
-ARG PYTORCH_VERSION='2.2.0.dev0'
+ARG PYTORCH_VERSION='2.3.0'
-ARG ROCM_VERSION='5.7'
+ARG ROCM_VERSION='6.0.2'
 ARG PYTHON_VERSION='3.10.10'
 # Automatically set by buildx
 ARG TARGETPLATFORM
@ -75,12 +86,44 @@ RUN chmod +x ~/mambaforge.sh && \
    mamba init && \
    rm ~/mambaforge.sh
-# Install PyTorch 2.2 RC compiled against RoCm 5.7, as VLLM can not be compiled with RoCm 5.6.
+# Install flash-attention, torch dependencies
-RUN pip install torch --index-url https://download.pytorch.org/whl/test/rocm5.7/
+RUN pip install numpy einops ninja --no-cache-dir
 RUN conda install intel::mkl-static intel::mkl-include
 RUN pip uninstall -y triton && \
    git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
    cd triton/python && \
    pip install .
 RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir
 ARG _GLIBCXX_USE_CXX11_ABI="1"
 ARG CMAKE_PREFIX_PATH="/opt/conda"
 ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
 ARG BUILD_CAFFE2="0" \
    BUILD_CAFFE2_OPS="0" \
    USE_CUDA="0" \
    USE_ROCM="1" \
    BUILD_TEST="0" \
    USE_FBGEMM="0" \
    USE_NNPACK="0" \
    USE_QNNPACK="0" \
    USE_XNNPACK="0" \
    USE_FLASH_ATTENTION="1" \
    USE_MEM_EFF_ATTENTION="0"
 RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
 # Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
 ENV HIP_FORCE_DEV_KERNARG=1
 # On MI250 and MI300, flash attention performance with Triton FA is slightly better than with CK.
 # However, Triton requires tuning for each prompt length, which is prohibitive.
 ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
 FROM base AS kernel-builder
-# Build vllm kernels
+# # Build vllm kernels
 FROM kernel-builder AS vllm-builder
 WORKDIR /usr/src
@ -102,21 +145,21 @@ RUN make build-flash-attention-v2-rocm
 FROM kernel-builder as custom-kernels-builder
 WORKDIR /usr/src
 COPY server/custom_kernels/ .
-RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build
+RUN python setup.py build
 # Build exllama kernels
 FROM kernel-builder as exllama-kernels-builder
 WORKDIR /usr/src
 COPY server/exllama_kernels/ .
-RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build
+RUN python setup.py build
 # Build exllama v2 kernels
 FROM kernel-builder as exllamav2-kernels-builder
 WORKDIR /usr/src
 COPY server/exllamav2_kernels/ .
-RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build
+RUN python setup.py build
 FROM base as base-copy
@ -140,9 +183,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310
 # Copy build artifacts from exllamav2 kernels builder
 COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
 # Install server
 COPY proto proto
 COPY server server
@ -160,7 +200,8 @@ COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bi
 COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
 # AWS Sagemaker compatible image
-FROM base-copy as sagemaker
+FROM base as sagemaker
 COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
@ -169,5 +210,8 @@ ENTRYPOINT ["./entrypoint.sh"]
 # Final image
 FROM base-copy
-ENTRYPOINT ["text-generation-launcher"]
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
 RUN chmod +x /tgi-entrypoint.sh
 ENTRYPOINT ["/tgi-entrypoint.sh"]
 CMD ["--json-output"]
--- a/26
+++ b/26
@ -1,4 +1,4 @@
-FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
 WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@ -36,18 +36,19 @@ RUN cargo build --release
 # Text Generation Inference base image for Intel
-FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base
+FROM intel/intel-extension-for-pytorch:2.1.30-xpu as base
 USER root
 # libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
 RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
 RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
 | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
@ -56,9 +57,8 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
 WORKDIR /usr/src
-# Build pytorch and ipex
+RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
-RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main
+RUN pip install intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
 RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch
 # Install server
 COPY proto proto
@ -72,25 +72,11 @@ RUN cd server && \
 ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
 ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
 ENV DIAGUTIL_PATH=/opt/intel/oneapi/compiler/latest/etc/compiler/sys_check/sys_check.sh
 ENV CCL_CONFIGURATION=cpu_gpu_dpcpp
 ENV MANPATH=/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/compiler/latest/share/man
 ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
 ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/latest
 ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
 ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/latest/lib/libintelocl.so
 ENV CLASSPATH=/opt/intel/oneapi/mpi/latest/share/java/mpi.jar:/opt/intel/oneapi/mpi/latest/share/java/mpi.jar
 ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
 ENV MKLROOT=/opt/intel/oneapi/mkl/latest
 ENV NLSPATH=/opt/intel/oneapi/mkl/latest/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/latest/lib/locale/%l_%t/%N
 ENV PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
 ENV CCL_ZE_IPC_EXCHANGE=sockets
 RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install
 RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 # Install router
--- a/assets/tgi_grafana.json
+++ b/assets/tgi_grafana.json
--- a/benchmark/src/event.rs
+++ b/benchmark/src/event.rs
@ -11,7 +11,7 @@ pub(crate) enum Event {
    /// Key press.
    Key(event::KeyEvent),
    /// Terminal resize.
-    Resize(u16, u16),
+    Resize,
 }
 pub(crate) async fn terminal_event_task(
@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {
        if event::poll(Duration::from_secs(0)).expect("no events available") {
            match event::read().expect("unable to read event") {
                event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()),
-                event::Event::Resize(w, h) => {
+                event::Event::Resize(_w, _h) => {
-                    event_sender.send(Event::Resize(w, h)).await.unwrap_or(())
+                    event_sender.send(Event::Resize).await.unwrap_or(())
                }
                _ => (),
            }
--- a/clients/python/text_generation/init.py
+++ b/clients/python/text_generation/init.py
@ -14,5 +14,10 @@
 __version__ = "0.6.0"
 DEPRECATION_WARNING = (
    "`text_generation` clients are deprecated and will be removed in the near future. "
    "Please use the `InferenceClient` from the `huggingface_hub` package instead."
 )
 from text_generation.client import Client, AsyncClient
 from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient
--- a/clients/python/text_generation/client.py
+++ b/clients/python/text_generation/client.py
@ -1,16 +1,21 @@
 import json
 import requests
 import warnings
 from aiohttp import ClientSession, ClientTimeout
 from pydantic import ValidationError
 from typing import Dict, Optional, List, AsyncIterator, Iterator, Union
 from text_generation import DEPRECATION_WARNING
 from text_generation.types import (
    StreamResponse,
    Response,
    Request,
    Parameters,
    Grammar,
    CompletionRequest,
    Completion,
    CompletionComplete,
    ChatRequest,
    ChatCompletionChunk,
    ChatComplete,
@ -19,6 +24,9 @@ from text_generation.types import (
 )
 from text_generation.errors import parse_error
 # emit deprecation warnings
 warnings.simplefilter("always", DeprecationWarning)
 class Client:
    """Client to make calls to a text-generation-inference instance
@ -59,11 +67,100 @@ class Client:
            timeout (`int`):
                Timeout in seconds
        """
        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
        self.base_url = base_url
        self.headers = headers
        self.cookies = cookies
        self.timeout = timeout
    def completion(
        self,
        prompt: str,
        frequency_penalty: Optional[float] = None,
        max_tokens: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stream: bool = False,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        stop: Optional[List[str]] = None,
    ) -> Union[Completion, Iterator[CompletionComplete]]:
        """
        Given a prompt, generate a response synchronously

        Args:
            prompt (`str`):
                Prompt
            frequency_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            max_tokens (`int`):
                Maximum number of generated tokens
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            seed (`int`):
                Random sampling seed
            stream (`bool`):
                Stream the response
            temperature (`float`):
                The value used to modulate the logits distribution.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation
            stop (`List[str]`):
                Stop generating tokens if a member of `stop` is generated

        Returns:
            A `Completion` when `stream` is false; otherwise the generator returned by
            `_completion_stream_response`, which yields `CompletionComplete` items.

        Raises:
            The exception built by `parse_error` when the non-streaming request
            does not return HTTP 200.
        """
        request = CompletionRequest(
            model="tgi",
            prompt=prompt,
            frequency_penalty=frequency_penalty,
            max_tokens=max_tokens,
            repetition_penalty=repetition_penalty,
            seed=seed,
            stream=stream,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )

        # Streaming is delegated to a generator; no request is sent until it is iterated.
        if stream:
            return self._completion_stream_response(request)

        resp = requests.post(
            f"{self.base_url}/v1/completions",
            json=request.dict(),
            headers=self.headers,
            cookies=self.cookies,
            timeout=self.timeout,
        )
        # The body is decoded before the status check because error responses
        # are passed to `parse_error` as a parsed payload.
        payload = resp.json()
        if resp.status_code != 200:
            raise parse_error(resp.status_code, payload)
        return Completion(**payload)
    def _completion_stream_response(self, request):
        """Stream a completion from `/v1/completions`.

        Posts `request` with `stream=True` and yields one `CompletionComplete`
        per line of the response that starts with `data:`. Other lines are
        skipped.

        Args:
            request: The request object; its `.dict()` is sent as the JSON body.

        Yields:
            `CompletionComplete` built from each decoded event payload.

        Raises:
            The exception built by `parse_error` when an event payload does not
            validate as a `CompletionComplete`.
        """
        resp = requests.post(
            f"{self.base_url}/v1/completions",
            json=request.dict(),
            headers=self.headers,
            cookies=self.cookies,
            timeout=self.timeout,
            stream=True,
        )
        prefix = "data:"
        for byte_payload in resp.iter_lines():
            if byte_payload == b"\n":
                continue
            payload = byte_payload.decode("utf-8")
            if not payload.startswith(prefix):
                continue
            # Slice off the exact prefix: `str.lstrip("data:")` strips any run of
            # the characters d/a/t/: and could eat the start of the payload.
            json_payload = json.loads(payload[len(prefix):].rstrip("\n"))
            try:
                response = CompletionComplete(**json_payload)
            except ValidationError:
                # `requests.Response` exposes `status_code`; it has no `status`.
                raise parse_error(resp.status_code, json_payload)
            yield response
    def chat(
        self,
        messages: List[Message],
@ -82,6 +179,7 @@ class Client:
        tools: Optional[List[Tool]] = None,
        tool_prompt: Optional[str] = None,
        tool_choice: Optional[str] = None,
        stop: Optional[List[str]] = None,
    ):
        """
        Given a list of messages, generate a response synchronously
@ -124,6 +222,8 @@ class Client:
                A prompt to be appended before the tools
            tool_choice (`str`):
                The tool to use
            stop (`List[str]`):
                Stop generating tokens if a member of `stop` is generated
        """
        request = ChatRequest(
@ -144,6 +244,7 @@ class Client:
            tools=tools,
            tool_prompt=tool_prompt,
            tool_choice=tool_choice,
            stop=stop,
        )
        if not stream:
            resp = requests.post(
@ -449,11 +550,99 @@ class AsyncClient:
            timeout (`int`):
                Timeout in seconds
        """
        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
        self.base_url = base_url
        self.headers = headers
        self.cookies = cookies
        self.timeout = ClientTimeout(timeout)
    async def completion(
        self,
        prompt: str,
        frequency_penalty: Optional[float] = None,
        max_tokens: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
        seed: Optional[int] = None,
        stream: bool = False,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        stop: Optional[List[str]] = None,
    ) -> Union[Completion, AsyncIterator[CompletionComplete]]:
        """
        Given a prompt, generate a response asynchronously

        Args:
            prompt (`str`):
                Prompt
            frequency_penalty (`float`):
                The parameter for frequency penalty. 0.0 means no penalty
                Penalize new tokens based on their existing frequency in the text so far,
                decreasing the model's likelihood to repeat the same line verbatim.
            max_tokens (`int`):
                Maximum number of generated tokens
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
            seed (`int`):
                Random sampling seed
            stream (`bool`):
                Stream the response
            temperature (`float`):
                The value used to modulate the logits distribution.
            top_p (`float`):
                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
                higher are kept for generation
            stop (`List[str]`):
                Stop generating tokens if a member of `stop` is generated

        Returns:
            The awaited result of `_completion_single_response` when `stream` is
            false; otherwise the async generator from `_completion_stream_response`.
        """
        completion_request = CompletionRequest(
            model="tgi",
            prompt=prompt,
            frequency_penalty=frequency_penalty,
            max_tokens=max_tokens,
            repetition_penalty=repetition_penalty,
            seed=seed,
            stream=stream,
            temperature=temperature,
            top_p=top_p,
            stop=stop,
        )

        # The streaming branch hands back the async generator un-awaited so the
        # caller can drive it with `async for`.
        if stream:
            return self._completion_stream_response(completion_request)
        return await self._completion_single_response(completion_request)
    async def _completion_single_response(self, request):
        """POST `request` to `/v1/completions` and return the parsed `Completion`.

        A fresh `ClientSession` is opened for the call using this client's
        headers, cookies and timeout. A non-200 status raises the exception
        built by `parse_error` from the status and the decoded JSON body.
        """
        endpoint = f"{self.base_url}/v1/completions"
        session = ClientSession(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        )
        async with session:
            async with session.post(endpoint, json=request.dict()) as resp:
                body = await resp.json()
                if resp.status == 200:
                    return Completion(**body)
                raise parse_error(resp.status, body)
    async def _completion_stream_response(self, request):
        """Stream a completion from `/v1/completions` asynchronously.

        Opens a `ClientSession`, posts `request`, and yields one
        `CompletionComplete` per chunk of the response content that starts with
        `data:`. Other chunks are skipped.

        Args:
            request: The request object; its `.dict()` is sent as the JSON body.

        Yields:
            `CompletionComplete` built from each decoded event payload.

        Raises:
            The exception built by `parse_error` when an event payload does not
            validate as a `CompletionComplete`.
        """
        prefix = "data:"
        async with ClientSession(
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(
                f"{self.base_url}/v1/completions", json=request.dict()
            ) as resp:
                async for byte_payload in resp.content:
                    if byte_payload == b"\n":
                        continue
                    payload = byte_payload.decode("utf-8")
                    if not payload.startswith(prefix):
                        continue
                    # Slice off the exact prefix: `str.lstrip("data:")` strips any
                    # run of the characters d/a/t/: rather than the literal prefix.
                    json_payload = json.loads(payload[len(prefix):].rstrip("\n"))
                    try:
                        response = CompletionComplete(**json_payload)
                    except ValidationError:
                        raise parse_error(resp.status, json_payload)
                    yield response
    async def chat(
        self,
        messages: List[Message],
@ -472,6 +661,7 @@ class AsyncClient:
        tools: Optional[List[Tool]] = None,
        tool_prompt: Optional[str] = None,
        tool_choice: Optional[str] = None,
        stop: Optional[List[str]] = None,
    ) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
        """
        Given a list of messages, generate a response asynchronously
@ -514,6 +704,8 @@ class AsyncClient:
                A prompt to be appended before the tools
            tool_choice (`str`):
                The tool to use
            stop (`List[str]`):
                Stop generating tokens if a member of `stop` is generated
        """
        request = ChatRequest(
@ -534,6 +726,7 @@ class AsyncClient:
            tools=tools,
            tool_prompt=tool_prompt,
            tool_choice=tool_choice,
            stop=stop,
        )
        if not stream:
            return await self._chat_single_response(request)
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@ -46,30 +46,6 @@ class Tool(BaseModel):
    function: dict
 class ChatCompletionComplete(BaseModel):
    # Index of the chat completion
    index: int
    # Message associated with the chat completion
    message: Message
    # Log probabilities for the chat completion
    logprobs: Optional[Any]
    # Reason for completion
    finish_reason: str
    # Usage details of the chat completion
    usage: Optional[Any] = None
 class CompletionComplete(BaseModel):
    # Index of the chat completion
    index: int
    # Message associated with the chat completion
    text: str
    # Log probabilities for the chat completion
    logprobs: Optional[Any]
    # Reason for completion
    finish_reason: str
 class Function(BaseModel):
    name: Optional[str]
    arguments: str
@ -95,24 +71,41 @@ class Choice(BaseModel):
    finish_reason: Optional[str] = None
-class ChatCompletionChunk(BaseModel):
+class CompletionRequest(BaseModel):
-    id: str
+    # Model identifier
    object: str
    created: int
    model: str
-    system_fingerprint: str
+    # Prompt
-    choices: List[Choice]
+    prompt: str
    # The parameter for repetition penalty. 1.0 means no penalty.
    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    repetition_penalty: Optional[float] = None
    # The parameter for frequency penalty. 1.0 means no penalty
    # Penalize new tokens based on their existing frequency in the text so far,
    # decreasing the model's likelihood to repeat the same line verbatim.
    frequency_penalty: Optional[float] = None
    # Maximum number of tokens to generate
    max_tokens: Optional[int] = None
    # Flag to indicate streaming response
    stream: bool = False
    # Random sampling seed
    seed: Optional[int] = None
    # Sampling temperature
    temperature: Optional[float] = None
    # Top-p value for nucleus sampling
    top_p: Optional[float] = None
    # Stop generating tokens if a member of `stop` is generated
    stop: Optional[List[str]] = None
-class ChatComplete(BaseModel):
+class CompletionComplete(BaseModel):
-    # Chat completion details
+    # Index of the chat completion
-    id: str
+    index: int
-    object: str
+    # Message associated with the chat completion
-    created: int
+    text: str
-    model: str
+    # Log probabilities for the chat completion
-    system_fingerprint: str
+    logprobs: Optional[Any]
-    choices: List[ChatCompletionComplete]
+    # Reason for completion
-    usage: Any
+    finish_reason: str
 class Completion(BaseModel):
@ -163,6 +156,41 @@ class ChatRequest(BaseModel):
    tool_prompt: Optional[str] = None
    # Choice of tool to be used
    tool_choice: Optional[str] = None
    # Stop generating tokens if a member of `stop` is generated
    stop: Optional[List[str]] = None
 class ChatCompletionComplete(BaseModel):
    # Element type of `ChatComplete.choices`.
    # Index of the chat completion
    index: int
    # Message associated with the chat completion
    message: Message
    # Log probabilities for the chat completion (no default value is declared)
    logprobs: Optional[Any]
    # Reason for completion
    finish_reason: str
    # Usage details of the chat completion; defaults to None
    usage: Optional[Any] = None
 class ChatComplete(BaseModel):
    # Chat completion details
    # NOTE(review): presumably mirrors the server's non-streaming chat response — confirm against the API schema.
    id: str
    object: str
    created: int
    model: str
    system_fingerprint: str
    # One `ChatCompletionComplete` per returned choice
    choices: List[ChatCompletionComplete]
    # Typed as `Any`, so the shape of the usage payload is not constrained here
    usage: Any
 class ChatCompletionChunk(BaseModel):
    # NOTE(review): presumably one streamed chunk of a chat completion — confirm against the API schema.
    id: str
    object: str
    created: int
    model: str
    system_fingerprint: str
    # One `Choice` per entry in this chunk
    choices: List[Choice]
 class Parameters(BaseModel):
--- a/docs/openapi.json
+++ b/docs/openapi.json
@ -1121,6 +1121,15 @@
            "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
            "example": 0.95,
            "nullable": true
          },
          "stop": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "Up to 4 sequences where the API will stop generating further tokens.",
            "example": "null",
            "nullable": true
          }
        }
      },
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@ -3,8 +3,16 @@
    title: Text Generation Inference
  - local: quicktour
    title: Quick Tour
  - local: installation_nvidia
    title: Using TGI with Nvidia GPUs
  - local: installation_amd
    title: Using TGI with AMD GPUs
  - local: installation_gaudi
    title: Using TGI with Intel Gaudi
  - local: installation_inferentia
    title: Using TGI with AWS Inferentia
  - local: installation
-    title: Installation
+    title: Installation from source
  - local: supported_models
    title: Supported Models and Hardware
  - local: messages_api
@ -20,7 +28,7 @@
  - local: basic_tutorials/using_cli
    title: Using TGI CLI
  - local: basic_tutorials/launcher
-    title: All TGI CLI  options
+    title: All TGI CLI options
  - local: basic_tutorials/non_core_models
    title: Non-core Model Serving
  - local: basic_tutorials/safety
@ -29,6 +37,10 @@
    title: Using Guidance, JSON, tools
  - local: basic_tutorials/visual_language_models
    title: Visual Language Models
  - local: basic_tutorials/monitoring
    title: Monitoring TGI with Prometheus and Grafana
  - local: basic_tutorials/train_medusa
    title: Train Medusa
  title: Tutorials
 - sections:
  - local: conceptual/streaming
--- a/docs/source/basic_tutorials/gated_model_access.md
+++ b/docs/source/basic_tutorials/gated_model_access.md
@ -19,6 +19,6 @@ docker run --gpus all \
    --shm-size 1g \
    -e HUGGING_FACE_HUB_TOKEN=$token \
    -p 8080:80 \
-    -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 \
+    -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.3 \
    --model-id $model
 ```
--- a/docs/source/basic_tutorials/monitoring.md
+++ b/docs/source/basic_tutorials/monitoring.md
@ -0,0 +1,75 @@
 # Monitoring TGI server with Prometheus and Grafana dashboard
 A TGI server deployment can easily be monitored through a Grafana dashboard that consumes data collected by Prometheus. Examples of inspectable metrics include statistics on the effective batch sizes used by TGI, prefill/decode latencies, the number of generated tokens, etc.
 In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage.
 ![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png)
 ## Setup on the server machine
 First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by a Prometheus monitoring server.
 In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.
 On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing on a Linux machine:
 ```
 wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
 tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
 cd prometheus
 ```
 Prometheus needs to be configured to scrape TGI's port. To do so, in the Prometheus configuration file `prometheus.yml`, one needs to edit the lines:
 ```
    static_configs:
      - targets: ["0.0.0.0:80"]
 ```
 to use the correct IP address and port.
 We suggest trying `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to confirm that the IP address and port are correct.
 Once Prometheus is configured, Prometheus server can be launched on the same machine where TGI is launched:
 ```
 ./prometheus --config.file="prometheus.yml"
 ```
 In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward Prometheus port (by default 9090) to the local computer. To do so, we can for example:
 * Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example)
 * Use ngrok port tunneling
 For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel the Prometheus port from the TGI server to the outside world.
 For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
 ```bash
 ngrok http http://0.0.0.0:9090
 ```
 As a sanity check, one can make sure that Prometheus server can be accessed at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app) from a local machine.
 ## Setup on the monitoring machine
 Monitoring is typically done on a different machine from the server. We use a Grafana dashboard to monitor TGI's server usage.
 Two options are available:
 * Use Grafana Cloud for a hosted dashboard solution (https://grafana.com/products/cloud/).
 * Self-host a Grafana dashboard.
 In this tutorial, for simplicity, we will self-host the dashboard. We recommend installing the Grafana open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:
 ```bash
 wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
 tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
 cd grafana-11.0.0
 ./bin/grafana-server
 ```
 Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.
 Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added with the Ngrok address we got earlier, that exposes Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app).
 Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like.
 Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).
 Load your dashboard configuration, and your TGI dashboard should be ready to go!
--- a/docs/source/basic_tutorials/train_medusa.md
+++ b/docs/source/basic_tutorials/train_medusa.md
@ -0,0 +1,208 @@
 # Train Medusa
 This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation.md) for more information on how Medusa works and speculation in general.
 ## What are the benefits of training a Medusa model?
 Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training.
 One of the most important things is to have a good dataset (with similar data to what will be used in production) because Medusa has a much higher hit-rate when the generation is in-domain.
 If you train Medusa on a dataset that is very different from the one you will use in production then the model will not be able to predict the future tokens accurately and consequently the speedup will be minimal or non-existent.
 ## Self-distillation (Generating data for training)
 There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model.
 Essentially, you prompt the model with a similar input to what you will use in production and the model will generate the output.
 We'll use this output to help train the medusa heads to predict the `n+1`, `n+2`, `n+3`, etc tokens in the sequence.
 ## Training
 The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) and we'll follow a very similar process to train the model as described on the original repository.
 ### Getting Started
 There are two methods for training the model:
 - `torchrun` that is a wrapper around `torch.distributed.launch`
 - a forked version of `axolotl` that supports Medusa
 In this tutorial we'll use `torchrun` to train the model as it is the most straightforward way to train the model but similar steps can be followed to train the model using `axolotl` if you prefer.
 ### Training with `torchrun`
 ```bash
 mkdir medusa-training
 cd medusa-training
 pyenv install 3.10
 pyenv local 3.10
 uv venv -p 3.10
 source .venv/bin/activate
 ```
 Now let's clone the original `Medusa` repository and install the library.
 ```bash
 git clone https://github.com/FasterDecoding/Medusa.git
 cd Medusa
 pip install -e .
 ```
 Next we'll need some data to train on, we can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub.
 ```bash
 apt install git-lfs
 git lfs install
 git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered
 ```
 Currently our directory structure looks like this:
 ```bash
 .
 ├── assets
 ├── CITATION.cff
 ├── create_data.py
 ├── data_generation
 ├── deepspeed.json
 ├── last_run_prepared
 ├── LICENSE
 ├── llm_judge
 ├── medusa
 ├── medusa_llm.egg-info
 ├── mistral.json
 ├── notebooks
 ├── pyproject.toml
 ├── README.md
 ├── ROADMAP.md
 ├── scripts
 ├── ShareGPT_Vicuna_unfiltered
 │   ├── README.md
 │   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json
 │   └── ShareGPT_V4.3_unfiltered_cleaned_split.json
 ├── simple_gradio_interface.py
 ├── tiny-llama.json
 └── vicuna_7b_qlora_stage1
 ```
 ## Start Training
 Now let's generate the data and start training the model. This process will take a while since we are generating data from the model.
 First make sure you have an instance of TGI running with the model you want to use for self-distillation.
 ```bash
 model=HuggingFaceH4/zephyr-7b-beta
 volume=/home/ubuntu/.cache/huggingface/hub/
 docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
 ```
 Now we can generate the data using the `create_data.py` script.
 ```bash
 python create_data.py \
    --input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \
    --output-filename zephyr_self_distill.json
 ```
 At this point our terminal should look like this:
 <div class="flex justify-center">
    <img
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-large.gif"
        width="550"
    />
 </div>
 > Note: In the screenshot above we are only using the first 500 examples from the dataset to speed up the process, you should have a much larger dataset for training.
 Now we can finally get to the fun part and start training the model!
 Using `torchrun` we can easily launch the `medusa` training script with the `zephyr_self_distill.json` data file.
 > NOTE: If you just self-distilled you may still have the model running, make sure to stop it before starting the training in order to allow all of the resources to be used for training.
 ```bash
 WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \
    --model_name_or_path HuggingFaceH4/zephyr-7b-beta \
    --data_path zephyr_self_distill.json \
    --bf16 True \
    --output_dir zephyr_out \
    --num_train_epochs 5 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --evaluation_strategy "no" \
    --save_strategy "no" \
    --learning_rate 1e-3 \
    --weight_decay 0.0 \
    --warmup_ratio 0.1 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 2048 \
    --lazy_preprocess True \
    --medusa_num_heads 3 \
    --medusa_num_layers 1 \
    --deepspeed deepspeed.json
 ```
 <div class="flex justify-center">
    <img
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-heads-large.gif"
        width="550"
    />
 </div>
 If successful, you should see output similar to the one below:
 ```bash
 wandb: Run history:
 wandb:                    train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
 wandb:              train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
 wandb:            train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁
 wandb:                     train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁
 wandb:             train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
 wandb:             train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇
 wandb:             train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁
 wandb:             train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇
 wandb:             train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
 wandb:             train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇
 wandb:               train/total_flos ▁
 wandb:               train/train_loss ▁
 wandb:            train/train_runtime ▁
 wandb: train/train_samples_per_second ▁
 wandb:   train/train_steps_per_second ▁
 wandb:
 wandb: Run summary:
 wandb:                    train/epoch 2.0
 wandb:              train/global_step 16
 wandb:            train/learning_rate 0.0
 wandb:                     train/loss 14.8906
 wandb:             train/medusa0_loss 4.25
 wandb:             train/medusa0_top1 0.28809
 wandb:             train/medusa1_loss 4.8125
 wandb:             train/medusa1_top1 0.22727
 wandb:             train/medusa2_loss 5.5
 wandb:             train/medusa2_top1 0.17293
 wandb:               train/total_flos 0.0
 wandb:               train/train_loss 23.98242
 wandb:            train/train_runtime 396.9266
 wandb: train/train_samples_per_second 2.519
 wandb:   train/train_steps_per_second 0.04
 ```
 Last but most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects.
 ```bash
 python -m medusa.hf_utils \
    --folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \
    --repo drbh/zephyr_medusa_demo
 ```
 Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉
--- a/docs/source/basic_tutorials/using_guidance.md
+++ b/docs/source/basic_tutorials/using_guidance.md
@ -2,7 +2,7 @@
 Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs.
-These feature are available starting from version `1.4.3`. They are accessible via the [text_generation](https://pypi.org/project/text-generation/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
+These features are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
 _note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._
@ -74,6 +74,45 @@ curl localhost:3000/generate \
 ```
 ### Hugging Face Hub Python Library
 The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter.
 ```python
 from huggingface_hub import InferenceClient
 client = InferenceClient("http://localhost:3000")
 schema = {
    "properties": {
        "location": {"title": "Location", "type": "string"},
        "activity": {"title": "Activity", "type": "string"},
        "animals_seen": {
            "maximum": 5,
            "minimum": 1,
            "title": "Animals Seen",
            "type": "integer",
        },
        "animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"},
    },
    "required": ["location", "activity", "animals_seen", "animals"],
    "title": "Animals",
    "type": "object",
 }
 user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
 resp = client.text_generation(
    f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}",
    max_new_tokens=100,
    seed=42,
    grammar={"type": "json", "value": schema},
 )
 print(resp)
 # { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
 ```
 A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar.
 > Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster.
@ -83,134 +122,55 @@ A grammar can be defined using Pydantic models, JSON schemas, or regular express
 Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way.
 ```python
-import requests
+from huggingface_hub import InferenceClient
 from pydantic import BaseModel, conint
 from typing import List
 class Animals(BaseModel):
    location: str
    activity: str
    animals_seen: conint(ge=1, le=5)  # Constrained integer type
    animals: List[str]
 prompt = "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park"
-data = {
+client = InferenceClient("http://localhost:3000")
    "inputs": prompt,
    "parameters": {
        "repetition_penalty": 1.3,
        "grammar": {
            "type": "json",
            "value": Animals.schema()
        }
    }
 }
-headers = {
+user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
-    "Content-Type": "application/json",
+resp = client.text_generation(
-}
+    f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}",
-
+    max_new_tokens=100,
-response = requests.post(
+    seed=42,
-    'http://127.0.0.1:3000/generate',
+    grammar={"type": "json", "value": Animals.schema()},
    headers=headers,
    json=data
 )
-print(response.json())
+
-# {'generated_text': '{ "activity": "bike riding", "animals": ["puppy","cat","raccoon"],"animals_seen": 3, "location":"park" }'}
+print(resp)
 # { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
 ```
-### JSON Schema Integration
+defining a grammar as regular expressions
 If Pydantic's not your style, go raw with direct JSON Schema integration. This is similar to the first example but with programmatic control.
 ```python
-import requests
+from huggingface_hub import InferenceClient
-json_schema = {
+client = InferenceClient("http://localhost:3000")
-    "properties": {
+
-        "location": {
+regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"
-            "type": "string"
+
-        },
+resp = client.text_generation(
-        "activity": {
+    f"Whats Googles DNS? Please use the following regex: {regexp}",
-            "type": "string"
+    seed=42,
-        },
+    grammar={
-        "animals_seen": {
+        "type": "regex",
-            "type": "integer",
+        "value": regexp,
            "minimum": 1,
            "maximum": 5
        },
        "animals": {
            "type": "array",
            "items": {
                "type": "string"
            }
        }
    },
    "required": ["location", "activity", "animals_seen", "animals"]
 }
 data = {
    "inputs": "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park",
    "parameters": {
        "max_new_tokens": 200,
        "repetition_penalty": 1.3,
        "grammar": {
            "type": "json",
            "value": json_schema
        }
    }
 }
 headers = {
    "Content-Type": "application/json",
 }
 response = requests.post(
    'http://127.0.0.1:3000/generate',
    headers=headers,
    json=data
 )
 print(response.json())
 # {'generated_text': '{\n"activity": "biking",\n"animals": ["puppy","cat","raccoon"]\n  , "animals_seen": 3,\n   "location":"park"}'}
 ```
-### Using the client
+print(resp)
-
+# 7.1.1.1
 TGI provides a client library that makes it easy to send requests with all of the parameters we've discussed above. Here's an example of how to use the client to send a request with a grammar parameter.
 ```python
 from text_generation import AsyncClient
 from text_generation.types import GrammarType
 # NOTE: tools defined above and removed for brevity
 # Define an async function to encapsulate the async operation
 async def main():
    client = AsyncClient(base_url="http://localhost:3000")
    # Use 'await' to wait for the async method 'chat' to complete
    response = await client.generate(
        "Whats Googles DNS",
        max_new_tokens=10,
        decoder_input_details=True,
        seed=1,
        grammar={
            "type": GrammarType.Regex,
            "value": "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)",
        },
    )
    # Once the response is received, you can process it
    print(response.generated_text)
 # Ensure the main async function is run in the event loop
 if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
 # 118.8.0.84
 ```
@ -265,107 +225,87 @@ curl localhost:3000/v1/chat/completions \
 // {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}}
 ```
-### Text Generation Inference Client
+### Chat Completion with Tools
-TGI provides a client library to interact with the Messages API and Tool functions. The client library is available in both synchronous and asynchronous versions.
+Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter.
 ```python
-from text_generation import AsyncClient
+from huggingface_hub import InferenceClient
-# NOTE: tools defined above and removed for brevity
+client = InferenceClient("http://localhost:3000")
-# Define an async function to encapsulate the async operation
+tools = [
-async def main():
+    {
-    client = AsyncClient(base_url="http://localhost:3000")
+        "type": "function",
-
+        "function": {
-    # Use 'await' to wait for the async method 'chat' to complete
+            "name": "get_current_weather",
-    response = await client.chat(
+            "description": "Get the current weather",
-        max_tokens=100,
+            "parameters": {
-        seed=1,
+                "type": "object",
-        tools=tools,
+                "properties": {
-        presence_penalty=-1.1,
+                    "location": {
-        messages=[
+                        "type": "string",
-            {
+                        "description": "The city and state, e.g. San Francisco, CA",
-                "role": "system",
+                    },
-                "content": "You're a helpful assistant! Answer the users question best you can.",
+                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                },
                "required": ["location", "format"],
            },
-            {
+        },
-                "role": "user",
+    },
-                "content": "What is the weather like in Brooklyn, New York?",
+    {
        "type": "function",
        "function": {
            "name": "get_n_day_weather_forecast",
            "description": "Get an N-day weather forecast",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "format": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "The temperature unit to use. Infer this from the users location.",
                    },
                    "num_days": {
                        "type": "integer",
                        "description": "The number of days to forecast",
                    },
                },
                "required": ["location", "format", "num_days"],
            },
-        ],
+        },
-    )
+    },
 ]
-    # Once the response is received, you can process it
+chat = client.chat_completion(
-    print(response.choices[0].message.tool_calls)
+    messages=[
        {
            "role": "system",
            "content": "You're a helpful assistant! Answer the users question best you can.",
        },
        {
            "role": "user",
            "content": "What is the weather like in Brooklyn, New York?",
        },
    ],
    tools=tools,
    seed=42,
    max_tokens=100,
 )
-# Ensure the main async function is run in the event loop
+print(chat.choices[0].message.tool_calls)
-if __name__ == "__main__":
+# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')]
    import asyncio
    asyncio.run(main())
 # {"id":"","object":"text_completion","created":1709051942,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":20,"total_tokens":177}}
 ```
 <details>
  <summary>Tools used in example above</summary>
 ```python
  tools = [
      {
          "type": "function",
          "function": {
              "name": "get_current_weather",
              "description": "Get the current weather",
              "parameters": {
                  "type": "object",
                  "properties": {
                      "location": {
                          "type": "string",
                          "description": "The city and state, e.g. San Francisco, CA",
                      },
                      "format": {
                          "type": "string",
                          "enum": ["celsius", "fahrenheit"],
                          "description": "The temperature unit to use. Infer this from the users location.",
                      },
                  },
                  "required": ["location", "format"],
              },
          },
      },
      {
          "type": "function",
          "function": {
              "name": "get_n_day_weather_forecast",
              "description": "Get an N-day weather forecast",
              "parameters": {
                  "type": "object",
                  "properties": {
                      "location": {
                          "type": "string",
                          "description": "The city and state, e.g. San Francisco, CA",
                      },
                      "format": {
                          "type": "string",
                          "enum": ["celsius", "fahrenheit"],
                          "description": "The temperature unit to use. Infer this from the users location.",
                      },
                      "num_days": {
                          "type": "integer",
                          "description": "The number of days to forecast",
                      },
                  },
                  "required": ["location", "format", "num_days"],
              },
          },
      }
  ]
 ```
 </details>
 ### OpenAI integration
 TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
--- a/docs/source/basic_tutorials/visual_language_models.md
+++ b/docs/source/basic_tutorials/visual_language_models.md
@ -53,7 +53,67 @@ for token in client.text_generation(prompt, max_new_tokens=10, stream=True):
 # This is a picture of an anthropomorphic rabbit in a space suit.
 ```
-If you want additional details, you can add `details=True`. In this case, you get a `TextGenerationStreamResponse` which contains additional information such as the probabilities and the tokens. For the final response in the stream, it also returns the full generated text.
+or via the `chat_completion` endpoint:
 ```python
 from huggingface_hub import InferenceClient
 client = InferenceClient("http://127.0.0.1:3000")
 chat = client.chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Whats in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                    },
                },
            ],
        },
    ],
    seed=42,
    max_tokens=100,
 )
 print(chat)
 # ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043))
 ```
 or with OpenAI's library:
 ```python
 from openai import OpenAI
 # init the client but point it to TGI
 client = OpenAI(base_url="http://localhost:3000/v1", api_key="-")
 chat_completion = client.chat.completions.create(
    model="tgi",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Whats in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
                    },
                },
            ],
        },
    ],
    stream=False,
 )
 print(chat_completion)
 # ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. ', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027))
 ```
 ### Inference Through Sending `cURL` Requests
--- a/docs/source/conceptual/guidance.md
+++ b/docs/source/conceptual/guidance.md
@ -76,7 +76,7 @@ There are two main ways to use guidance; you can either use the `/generate` endp
 Under the hood tools are a special case of grammars that allows the model to choose one or none of the provided tools.
-Please refer to [using guidance](../basic_tutorial/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.
+Please refer to [using guidance](../basic_tutorials/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.
 ### Getting the most out of guidance
--- a/docs/source/conceptual/speculation.md
+++ b/docs/source/conceptual/speculation.md
@ -27,7 +27,7 @@ You can check a few existing  fine-tunes for popular models:
 - [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa)
-In order to create your own medusa heads for your own finetune, you should check own the original medusa repo. [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa)
+In order to create your own medusa heads for your own finetune, you should check out the original medusa repo. [../basic_tutorials/train_medusa.md](../basic_tutorials/train_medusa.md)
 In order to use medusa models in TGI, simply point to a medusa enabled model, and everything will load automatically.
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@ -1,6 +1,10 @@
-# Installation
+# Installation from source
-This section explains how to install the CLI tool as well as installing TGI from source. **The strongly recommended approach is to use Docker, as it does not require much setup. Check [the Quick Tour](./quicktour) to learn how to run TGI with Docker.**
+<Tip warning={true}>
 Installing TGI from source is not the recommended usage. We strongly recommend using TGI through Docker; check the [Quick Tour](./quicktour), [Installation for Nvidia GPUs](./installation_nvidia) and [Installation for AMD GPUs](./installation_amd) to learn how to use TGI with Docker.
 </Tip>
 ## Install CLI
--- a/docs/source/installation_amd.md
+++ b/docs/source/installation_amd.md
@ -0,0 +1,38 @@
 # Using TGI with AMD GPUs
 TGI is supported and tested on [AMD Instinct MI210](https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html), [MI250](https://www.amd.com/en/products/accelerators/instinct/mi200/mi250.html) and [MI300](https://www.amd.com/en/products/accelerators/instinct/mi300.html) GPUs. The support may be extended in the future. The recommended usage is through Docker. Make sure to check the [AMD documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html) on how to use Docker with AMD GPUs.
 On a server powered by AMD GPUs, TGI can be launched with the following command:
 ```bash
 model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
    --device=/dev/kfd --device=/dev/dri --group-add video \
    --ipc=host --shm-size 256g --net host -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:2.0.3-rocm \
    --model-id $model
 ```
 The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
 ## TunableOp
 TGI's docker image for AMD GPUs integrates [PyTorch's TunableOp](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable), which allows an additional warmup to select the best performing matrix multiplication (GEMM) kernel from rocBLAS or hipBLASLt.
 Experimentally, on MI300X, we noticed a 6-8% latency improvement when using TunableOp on top of ROCm 6.1 and PyTorch 2.3.
 TunableOp is enabled by default, the warmup may take 1-2 minutes. In case you would like to disable TunableOp, please pass `--env PYTORCH_TUNABLEOP_ENABLED="0"` when launching TGI's docker container.
 ## Flash attention implementation
 Two implementations of Flash Attention are available for ROCm, the first is [ROCm/flash-attention](https://github.com/ROCm/flash-attention) based on a [Composable Kernel](https://github.com/ROCm/composable_kernel) (CK) implementation, and the second is a [Triton implementation](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/utils/flash_attn_triton.py).
 By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, FA Triton implementation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container.
 ## Unsupported features
 The following features are currently not supported in the ROCm version of TGI, and support may be extended in the future:
 * Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
 * Kernel for sliding window attention (Mistral)
--- a/docs/source/installation_gaudi.md
+++ b/docs/source/installation_gaudi.md
@ -0,0 +1,3 @@
 # Using TGI with Intel Gaudi
 Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index).
--- a/docs/source/installation_inferentia.md
+++ b/docs/source/installation_inferentia.md
@ -0,0 +1,3 @@
 # Using TGI with Inferentia
 Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
--- a/docs/source/installation_nvidia.md
+++ b/docs/source/installation_nvidia.md
@ -0,0 +1,18 @@
 # Using TGI with Nvidia GPUs
 TGI optimized models are supported on NVIDIA [H100](https://www.nvidia.com/en-us/data-center/h100/), [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it.
 For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
 TGI can be used on NVIDIA GPUs through its official docker image:
 ```bash
 model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:2.0.3 \
    --model-id $model
 ```
 The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@ -2,30 +2,27 @@
 The easiest way of getting started is using the official Docker container. Install Docker following [their installation instructions](https://docs.docker.com/get-docker/).
-Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI. Here is an example on how to do that:
+## Launching TGI
 Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI on an Nvidia GPU. Here is an example on how to do that:
 ```bash
 model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:2.0.3 \
    --model-id $model
 ```
-<Tip warning={true}>
+### Supported hardware
-To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.
+TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.
-</Tip>
+## Consuming TGI
 TGI also supports ROCm-enabled AMD GPUs (only MI210 and MI250 are tested), details are available in the [Supported Hardware section](./supported_models#supported-hardware) and [AMD documentation](https://rocm.docs.amd.com/en/latest/deploy/docker.html). To launch TGI on ROCm GPUs, please use instead:
 ```bash
 docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4-rocm --model-id $model
 ```
 Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.
 <inferencesnippet>
 <python>
@ -91,7 +88,7 @@ curl 127.0.0.1:8080/generate \
 To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
 ```bash
-docker run ghcr.io/huggingface/text-generation-inference:1.4 --help
+docker run ghcr.io/huggingface/text-generation-inference:2.0.3 --help
 ```
 </Tip>
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@ -1,29 +1,36 @@
 # Supported Models and Hardware
 Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models and hardware are supported.
 ## Supported Models
-The following models are optimized and can be served with TGI, which uses custom CUDA kernels for better inference. You can add the flag `--disable-custom-kernels` at the end of the `docker run` command if you wish to disable them.
+- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
-
+- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
- [BLOOM](https://huggingface.co/bigscience/bloom)
+- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- [FLAN-T5](https://huggingface.co/google/flan-t5-xxl)
+- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
- [Galactica](https://huggingface.co/facebook/galactica-120b)
+- [Gemma](https://huggingface.co/google/gemma-7b)
- [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
+- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
- [Llama](https://github.com/facebookresearch/llama)
+- [Dbrx](https://huggingface.co/databricks/dbrx-instruct)
- [OPT](https://huggingface.co/facebook/opt-66b)
+- [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj)
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
 - [Starcoder](https://huggingface.co/bigcode/starcoder)
 - [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b)
 - [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
 - [MPT](https://huggingface.co/mosaicml/mpt-30b)
 - [Llama V2](https://huggingface.co/meta-llama)
 - [Code Llama](https://huggingface.co/codellama)
 - [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
+- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)
- [Phi](https://huggingface.co/microsoft/phi-2)
+- [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder)
- [Idefics](HuggingFaceM4/idefics-9b-instruct) (Multimodal)
+- [Phi](https://huggingface.co/microsoft/phi-1_5)
- [Llava-next](llava-hf/llava-v1.6-mistral-7b-hf) (Multimodal)
+- [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)
 - [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct)
 - [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1)
 - [Qwen 2](https://huggingface.co/Qwen/Qwen2-7B)
 - [Opt](https://huggingface.co/facebook/opt-6.7b)
 - [T5](https://huggingface.co/google/flan-t5-xxl)
 - [Galactica](https://huggingface.co/facebook/galactica-120b)
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
 - [Bloom](https://huggingface.co/bigscience/bloom-560m)
 - [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct)
 - [Gpt2](https://huggingface.co/openai-community/gpt2)
 - [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
 - [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)
 If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:
@ -38,18 +45,4 @@ If you wish to serve a supported model that already exists on a local folder, ju
 ```bash
 text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
-``````
+```
 ## Supported Hardware
 TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
 TGI also has support of ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention, GPTQ quantization, flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and support may be extended in the future:
 * Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
 * Flash [layer norm kernel](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm)
 * Kernel for sliding window attention (Mistral)
 TGI is also supported on the following AI hardware accelerators:
 - *Habana first-gen Gaudi and Gaudi2:* check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
 * *AWS Inferentia2:* check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
--- a/integration-tests/images/cow_beach.png
+++ b/integration-tests/images/cow_beach.png
--- a/integration-tests/models/snapshots/test_flash_gpt2/test_flash_gpt2.json
+++ b/integration-tests/models/snapshots/test_flash_gpt2/test_flash_gpt2.json
@ -0,0 +1,99 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 2061,
        "logprob": null,
        "text": "What"
      },
      {
        "id": 318,
        "logprob": -3.1835938,
        "text": " is"
      },
      {
        "id": 2769,
        "logprob": -9.171875,
        "text": " deep"
      },
      {
        "id": 4673,
        "logprob": -1.6425781,
        "text": " learning"
      },
      {
        "id": 30,
        "logprob": -0.7314453,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 198,
        "logprob": -0.68603516,
        "special": false,
        "text": "\n"
      },
      {
        "id": 198,
        "logprob": -0.005393982,
        "special": false,
        "text": "\n"
      },
      {
        "id": 29744,
        "logprob": -0.31079102,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 4673,
        "logprob": -0.08300781,
        "special": false,
        "text": " learning"
      },
      {
        "id": 318,
        "logprob": -0.58984375,
        "special": false,
        "text": " is"
      },
      {
        "id": 257,
        "logprob": -0.953125,
        "special": false,
        "text": " a"
      },
      {
        "id": 649,
        "logprob": -2.0957031,
        "special": false,
        "text": " new"
      },
      {
        "id": 2214,
        "logprob": -1.8095703,
        "special": false,
        "text": " field"
      },
      {
        "id": 286,
        "logprob": -1.0673828,
        "special": false,
        "text": " of"
      },
      {
        "id": 2267,
        "logprob": -0.9375,
        "special": false,
        "text": " research"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a new field of research"
 }
--- a/integration-tests/models/snapshots/test_flash_gpt2/test_flash_gpt2_load.json
+++ b/integration-tests/models/snapshots/test_flash_gpt2/test_flash_gpt2_load.json
@ -0,0 +1,398 @@
 [
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2061,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 318,
          "logprob": -3.1835938,
          "text": " is"
        },
        {
          "id": 2769,
          "logprob": -9.171875,
          "text": " deep"
        },
        {
          "id": 4673,
          "logprob": -1.6425781,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -0.7314453,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -0.68603516,
          "special": false,
          "text": "\n"
        },
        {
          "id": 198,
          "logprob": -0.005672455,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29744,
          "logprob": -0.3251953,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4673,
          "logprob": -0.08294678,
          "special": false,
          "text": " learning"
        },
        {
          "id": 318,
          "logprob": -0.5854492,
          "special": false,
          "text": " is"
        },
        {
          "id": 257,
          "logprob": -0.9423828,
          "special": false,
          "text": " a"
        },
        {
          "id": 649,
          "logprob": -2.0800781,
          "special": false,
          "text": " new"
        },
        {
          "id": 2214,
          "logprob": -1.8369141,
          "special": false,
          "text": " field"
        },
        {
          "id": 286,
          "logprob": -1.0683594,
          "special": false,
          "text": " of"
        },
        {
          "id": 2267,
          "logprob": -0.9711914,
          "special": false,
          "text": " research"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2061,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 318,
          "logprob": -3.1660156,
          "text": " is"
        },
        {
          "id": 2769,
          "logprob": -9.1796875,
          "text": " deep"
        },
        {
          "id": 4673,
          "logprob": -1.6376953,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -0.72216797,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -0.7089844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 198,
          "logprob": -0.0054779053,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29744,
          "logprob": -0.3190918,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4673,
          "logprob": -0.08319092,
          "special": false,
          "text": " learning"
        },
        {
          "id": 318,
          "logprob": -0.5839844,
          "special": false,
          "text": " is"
        },
        {
          "id": 257,
          "logprob": -0.9506836,
          "special": false,
          "text": " a"
        },
        {
          "id": 649,
          "logprob": -2.0878906,
          "special": false,
          "text": " new"
        },
        {
          "id": 2214,
          "logprob": -1.8496094,
          "special": false,
          "text": " field"
        },
        {
          "id": 286,
          "logprob": -1.0673828,
          "special": false,
          "text": " of"
        },
        {
          "id": 2267,
          "logprob": -0.9370117,
          "special": false,
          "text": " research"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2061,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 318,
          "logprob": -3.1660156,
          "text": " is"
        },
        {
          "id": 2769,
          "logprob": -9.1796875,
          "text": " deep"
        },
        {
          "id": 4673,
          "logprob": -1.6376953,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -0.72216797,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -0.7089844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 198,
          "logprob": -0.0054779053,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29744,
          "logprob": -0.3190918,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4673,
          "logprob": -0.08319092,
          "special": false,
          "text": " learning"
        },
        {
          "id": 318,
          "logprob": -0.5839844,
          "special": false,
          "text": " is"
        },
        {
          "id": 257,
          "logprob": -0.9506836,
          "special": false,
          "text": " a"
        },
        {
          "id": 649,
          "logprob": -2.0878906,
          "special": false,
          "text": " new"
        },
        {
          "id": 2214,
          "logprob": -1.8496094,
          "special": false,
          "text": " field"
        },
        {
          "id": 286,
          "logprob": -1.0673828,
          "special": false,
          "text": " of"
        },
        {
          "id": 2267,
          "logprob": -0.9370117,
          "special": false,
          "text": " research"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2061,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 318,
          "logprob": -3.1660156,
          "text": " is"
        },
        {
          "id": 2769,
          "logprob": -9.1796875,
          "text": " deep"
        },
        {
          "id": 4673,
          "logprob": -1.6376953,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -0.72216797,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 198,
          "logprob": -0.7089844,
          "special": false,
          "text": "\n"
        },
        {
          "id": 198,
          "logprob": -0.0054779053,
          "special": false,
          "text": "\n"
        },
        {
          "id": 29744,
          "logprob": -0.3190918,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 4673,
          "logprob": -0.08319092,
          "special": false,
          "text": " learning"
        },
        {
          "id": 318,
          "logprob": -0.5839844,
          "special": false,
          "text": " is"
        },
        {
          "id": 257,
          "logprob": -0.9506836,
          "special": false,
          "text": " a"
        },
        {
          "id": 649,
          "logprob": -2.0878906,
          "special": false,
          "text": " new"
        },
        {
          "id": 2214,
          "logprob": -1.8496094,
          "special": false,
          "text": " field"
        },
        {
          "id": 286,
          "logprob": -1.0673828,
          "special": false,
          "text": " of"
        },
        {
          "id": 2267,
          "logprob": -0.9370117,
          "special": false,
          "text": " research"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a new field of research"
  }
 ]
--- a/integration-tests/models/snapshots/test_flash_pali_gemma/test_flash_pali_gemma.json
+++ b/integration-tests/models/snapshots/test_flash_pali_gemma/test_flash_pali_gemma.json
@ -0,0 +1,25 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 2,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 54901,
        "logprob": -0.72753906,
        "special": false,
        "text": "beach"
      },
      {
        "id": 1,
        "logprob": -0.011009216,
        "special": true,
        "text": "<eos>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "beach"
 }
--- a/integration-tests/models/test_flash_gpt2.py
+++ b/integration-tests/models/test_flash_gpt2.py
@ -0,0 +1,44 @@
 import pytest
@pytest.fixture(scope="module")
 def flash_gpt2_handle(launcher):
     """Yield the handle that `launcher` provides for openai-community/gpt2 with two shards."""
     # The context manager stays open for as long as the fixture is in use.
     with launcher("openai-community/gpt2", num_shard=2) as server:
         yield server
@pytest.fixture(scope="module")
 async def flash_gpt2(flash_gpt2_handle):
     """Await the handle's health check (called with 300), then return its client."""
     server = flash_gpt2_handle
     await server.health(300)
     return server.client
@pytest.mark.asyncio
 async def test_flash_gpt2(flash_gpt2, response_snapshot):
     """Generate 10 new tokens for a fixed prompt and compare the response with the snapshot."""
     prompt = "What is deep learning?"
     result = await flash_gpt2.generate(
         prompt, max_new_tokens=10, decoder_input_details=True
     )
     # The generation must use the whole token budget before being compared.
     assert result.details.generated_tokens == 10
     assert result == response_snapshot
@pytest.mark.asyncio
 async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snapshot):
     """Issue the same prompt four times through `generate_load` and check the answers agree."""
     prompt = "What is deep learning?"
     responses = await generate_load(flash_gpt2, prompt, max_new_tokens=10, n=4)
     generated_texts = [reply.generated_text for reply in responses]
     assert len(generated_texts) == 4
     # Every reply must carry the same text as the first one.
     reference = generated_texts[0]
     assert all(text == reference for text in generated_texts), generated_texts
     assert responses == response_snapshot
--- a/integration-tests/models/test_flash_pali_gemma.py
+++ b/integration-tests/models/test_flash_pali_gemma.py
@ -0,0 +1,39 @@
 import pytest
 import requests
 import io
 import base64
@pytest.fixture(scope="module")
 def flash_pali_gemma_handle(launcher):
     """Yield the handle that `launcher` provides for google/paligemma-3b-pt-224."""
     launch_options = {
         "num_shard": 1,
         "revision": "float16",
         "max_input_length": 4000,
         "max_total_tokens": 4096,
     }
     # The context manager stays open for as long as the fixture is in use.
     with launcher("google/paligemma-3b-pt-224", **launch_options) as server:
         yield server
@pytest.fixture(scope="module")
 async def flash_pali_gemma(flash_pali_gemma_handle):
     """Await the handle's health check (called with 300), then return its client."""
     server = flash_pali_gemma_handle
     await server.health(300)
     return server.client
 def get_cow_beach():
     """Return the cow_beach.png test image encoded as a base64 PNG data URI.

     The image path is relative to the current working directory.
     """
     with open("integration-tests/images/cow_beach.png", "rb") as png:
         raw_bytes = png.read()
     payload = base64.b64encode(raw_bytes).decode("utf-8")
     return f"data:image/png;base64,{payload}"
@pytest.mark.asyncio
@pytest.mark.private
 async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
     """Ask where the cow in the test image stands and expect the answer "beach"."""
     image_uri = get_cow_beach()
     # The image is passed inline as a markdown image link ahead of the question.
     prompt = f"![]({image_uri})Where is the cow standing?\n"
     result = await flash_pali_gemma.generate(prompt, max_new_tokens=20)
     assert result.generated_text == "beach"
     assert result == response_snapshot
--- a/launcher/Cargo.toml
+++ b/launcher/Cargo.toml
@ -14,6 +14,7 @@ nix = { version = "0.28.0", features = ["signal"] }
 once_cell = "1.19.0"
 serde = { version = "1.0.188", features = ["derive"] }
 serde_json = "1.0.107"
 thiserror = "1.0.59"
 tracing = "0.1.37"
 tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@ -18,14 +18,33 @@ use std::thread;
 use std::thread::sleep;
 use std::time::{Duration, Instant};
 use std::{fs, io};
-use tracing_subscriber::EnvFilter;
+use thiserror::Error;
 use tracing_subscriber::{filter::LevelFilter, EnvFilter};
 mod env_runtime;
 #[derive(Deserialize)]
 struct RawConfig {
    max_position_embeddings: Option<usize>,
    n_positions: Option<usize>,
    max_seq_len: Option<usize>,
 }
 #[derive(Deserialize)]
 struct Config {
    max_position_embeddings: Option<usize>,
-    max_seq_len: Option<usize>,
+}
 impl From<RawConfig> for Config {
    fn from(other: RawConfig) -> Self {
        let max_position_embeddings = other
            .max_position_embeddings
            .or(other.max_seq_len)
            .or(other.n_positions);
        Config {
            max_position_embeddings,
        }
    }
 }
 #[derive(Clone, Copy, Debug, ValueEnum)]
@ -453,6 +472,7 @@ fn shard_manager(
    max_total_tokens: usize,
    max_batch_size: Option<usize>,
    otlp_endpoint: Option<String>,
    log_level: LevelFilter,
    status_sender: mpsc::Sender<ShardStatus>,
    shutdown: Arc<AtomicBool>,
    _shutdown_sender: mpsc::Sender<()>,
@ -475,7 +495,7 @@ fn shard_manager(
        "--uds-path".to_string(),
        uds_path,
        "--logger-level".to_string(),
-        "INFO".to_string(),
+        log_level.to_string().to_uppercase(),
        "--json-output".to_string(),
    ];
@ -755,13 +775,13 @@ struct PythonLogMessage {
 impl PythonLogMessage {
    fn trace(&self) {
        match self.record.level.name {
-            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text),
+            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text.trim_end()),
-            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text),
+            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text.trim_end()),
-            PythonLogLevelEnum::Info => tracing::info!("{}", self.text),
+            PythonLogLevelEnum::Info => tracing::info!("{}", self.text.trim_end()),
-            PythonLogLevelEnum::Success => tracing::info!("{}", self.text),
+            PythonLogLevelEnum::Success => tracing::info!("{}", self.text.trim_end()),
-            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text),
+            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text.trim_end()),
-            PythonLogLevelEnum::Error => tracing::error!("{}", self.text),
+            PythonLogLevelEnum::Error => tracing::error!("{}", self.text.trim_end()),
-            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text),
+            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text.trim_end()),
        }
    }
 }
@ -823,26 +843,26 @@ fn find_num_shards(
    Ok(num_shard)
 }
-#[derive(Debug)]
+#[derive(Debug, Error)]
 enum LauncherError {
    #[error("Invalid argument: {0}")]
    ArgumentValidation(String),
    #[error("not enough cuda devices: {0}")]
    NotEnoughCUDADevices(String),
    #[error("Download error")]
    DownloadError,
    #[error("Shard cannot start")]
    ShardCannotStart,
    #[error("Shard disconnected")]
    ShardDisconnected,
    #[error("Shard failed")]
    ShardFailed,
    #[error("Webserver failed")]
    WebserverFailed,
    #[error("Webserver cannot start")]
    WebserverCannotStart,
 }
 impl core::fmt::Display for LauncherError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "{self:?}")
    }
 }
 impl std::error::Error for LauncherError {}
 fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), LauncherError> {
    // Enter download tracing span
    let _span = tracing::span!(tracing::Level::INFO, "download").entered();
@ -981,6 +1001,7 @@ fn spawn_shards(
    args: &Args,
    cuda_graphs: Vec<usize>,
    max_total_tokens: usize,
    max_log_level: LevelFilter,
    shutdown: Arc<AtomicBool>,
    shutdown_receiver: &mpsc::Receiver<()>,
    shutdown_sender: mpsc::Sender<()>,
@ -1038,6 +1059,7 @@ fn spawn_shards(
                max_total_tokens,
                max_batch_size,
                otlp_endpoint,
                max_log_level,
                status_sender,
                shutdown,
                shutdown_sender,
@ -1278,8 +1300,22 @@ fn main() -> Result<(), LauncherError> {
    let args: Args = Args::parse();
    // Filter events with LOG_LEVEL
-    let env_filter =
+    let varname = "LOG_LEVEL";
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
        // Override to avoid simple logs to be spammed with tokio level informations
        let log_level = match &log_level[..] {
            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
            "info" => "text_generation_launcher=info,text_generation_router=info",
            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
            log_level => log_level,
        };
        EnvFilter::builder()
            .with_default_directive(LevelFilter::INFO.into())
            .parse_lossy(log_level)
    } else {
        EnvFilter::new("info")
    };
    let max_log_level = env_filter.max_level_hint().unwrap_or(LevelFilter::INFO);
    if args.json_output {
        tracing_subscriber::fmt()
@ -1322,33 +1358,30 @@ fn main() -> Result<(), LauncherError> {
        };
        let content = std::fs::read_to_string(filename)?;
-        let config: Config = serde_json::from_str(&content)?;
+        let config: RawConfig = serde_json::from_str(&content)?;
        let config: Config = config.into();
        // Quantization usually means you're even more RAM constrained.
        let max_default = 4096;
-        let max_position_embeddings = match (config.max_position_embeddings, config.max_seq_len) {
+        if let Some(max_position_embeddings) = config.max_position_embeddings {
-            (Some(max_position_embeddings), _) | (None, Some(max_position_embeddings)) => {
+            if max_position_embeddings > max_default {
-                if max_position_embeddings > max_default {
+                let max = max_position_embeddings;
-                    let max = max_position_embeddings;
+                if args.max_input_tokens.is_none()
-                    if args.max_input_tokens.is_none()
+                    && args.max_total_tokens.is_none()
-                        && args.max_total_tokens.is_none()
+                    && args.max_batch_prefill_tokens.is_none()
-                        && args.max_batch_prefill_tokens.is_none()
+                {
-                    {
+                    tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
                        tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
                    }
                    max_default
                } else {
                    max_position_embeddings
                }
                Ok(max_default)
            } else {
                Ok(max_position_embeddings)
            }
-            _ => {
+        } else {
-                return Err(Box::new(LauncherError::ArgumentValidation(
+            Err(Box::new(LauncherError::ArgumentValidation(
-                    "no max defined".to_string(),
+                "no max defined".to_string(),
-                )));
+            )))
-            }
+        }
        };
        Ok(max_position_embeddings)
    };
    let max_position_embeddings: usize = get_max_position_embeddings().unwrap_or(4096);
@ -1504,6 +1537,7 @@ fn main() -> Result<(), LauncherError> {
        &args,
        cuda_graphs,
        max_total_tokens,
        max_log_level,
        shutdown.clone(),
        &shutdown_receiver,
        shutdown_sender,
--- a/load_tests/Makefile
+++ b/load_tests/Makefile
@ -0,0 +1,9 @@
 # Download the raw ShareGPT dump; filter.py reads this exact file name from the current directory.
 ShareGPT_V3_unfiltered_cleaned_split.json:
 	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 # Build the load-test sample from the ShareGPT dump (the dump is fetched first if it is missing).
 # NOTE(review): no .PHONY declaration is visible for prepare_share / prepare_orca - confirm whether one is wanted.
 prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
 	python filter.py
 # Build the load-test sample from the OpenOrca dataset instead; orca.py fetches its own data.
 prepare_orca:
 	python orca.py
--- a/load_tests/filter.py
+++ b/load_tests/filter.py
@ -0,0 +1,26 @@
 import json
 def main(
    source="./ShareGPT_V3_unfiltered_cleaned_split.json",
    target="./small.json",
    limit=2000,
 ):
    """Extract a small single-turn sample from a ShareGPT dump.

    Reads the JSON list at ``source``, keeps the first ``limit`` records whose
    conversation opens with a ``"human"`` turn, trims each kept record to that
    opening turn, and writes the result to ``target`` as indented JSON.

    Args:
        source: Path of the ShareGPT JSON file to read.
        target: Path of the JSON file to write.
        limit: Maximum number of conversations to keep.
    """
    with open(source, "r", encoding="utf-8") as f:
        data = json.load(f)

    conversations = []
    for conversation in data:
        # Stop on the size of the *output list*. The previous code measured
        # the key count of a single record, so the cap was never reached.
        if len(conversations) >= limit:
            break
        turns = conversation.get("conversations")
        if turns and turns[0].get("from") == "human":
            # Keep only the opening prompt; the rest of the dialogue is not
            # needed for load testing.
            conversation["conversations"] = turns[:1]
            conversations.append(conversation)

    with open(target, "w", encoding="utf-8") as f:
        json.dump(conversations, f, indent=4)
 if __name__ == "__main__":
    main()
--- a/load_tests/orca.py
+++ b/load_tests/orca.py
@ -0,0 +1,27 @@
 import json
 import datasets
 import tqdm
 def main():
    """Dump the first 2000 OpenOrca questions to ./small.json.

    Each dataset row becomes a single-turn conversation whose only message
    is the row's ``question`` attributed to ``"human"``; the row ``id`` is
    carried over unchanged.
    """
    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
    # Cap the sample at 2k rows, or fewer when the split itself is smaller.
    limit = min(2000, len(dataset))
    conversations = []
    for row in tqdm.tqdm(dataset, total=limit):
        prompt = {"from": "human", "value": row["question"]}
        conversations.append({"conversations": [prompt], "id": row["id"]})
        if len(conversations) >= limit:
            break
    with open("./small.json", "w") as out:
        json.dump(conversations, out, indent=4)
 if __name__ == "__main__":
    main()
--- a/load_tests/starcoder_load.js
+++ b/load_tests/starcoder_load.js
@ -1,63 +0,0 @@
 import {check} from 'k6';
 import http from 'k6/http';
 import {Trend} from 'k6/metrics';
 // Target server as host:port; overridable through the HOST environment variable.
 const host = __ENV.HOST || '127.0.0.1:3000';
 // Custom k6 trends, filled by the default function from the X-* response headers.
 // NOTE(review): the second argument `true` presumably flags the trend as a time metric - confirm against the k6 Trend docs.
 const totalTime = new Trend('total_time', true);
 const validationTime = new Trend('validation_time', true);
 const queueTime = new Trend('queue_time', true);
 const inferenceTime = new Trend('inference_time', true);
 const timePerToken = new Trend('time_per_token', true);
 // Fixed request body sent on every iteration, plus the token count the response is expected to report.
 const example = {
    payload: JSON.stringify({
        // NOTE(review): the two literals are joined with no separator, so the prompt has no
        // newline between the comment sentence and `def fibonacci` - confirm this is intended.
        inputs: '# This is a fibonacci function written in the Python programming language.' +
            'def fibonacci',
        parameters: {
            details: true,
            max_new_tokens: 60,
            temperature: 0.2,
            top_p: 0.95,
            seed: 0,
        },
    }),
    // Matches max_new_tokens above; compared against details.generated_tokens in the default function.
    generated_tokens: 60
 };
 // k6 run configuration: pass/fail thresholds and a single constant-arrival-rate scenario.
 export const options = {
    thresholds: {
        // Any failed HTTP request fails the run.
        http_req_failed: ['rate==0'],
        // 95th-percentile bounds on the custom trends.
        // NOTE(review): units are presumably milliseconds - confirm against what the server puts in its X-* headers.
        time_per_token: ['p(95)<90'],
        queue_time: ['p(95)<1500'],
    },
    scenarios: {
        load_test: {
            // 10 iterations started per 1s time unit for 60s, drawing on 100 pre-allocated VUs.
            executor: 'constant-arrival-rate',
            duration: '60s',
            preAllocatedVUs: 100,
            rate: 10,
            timeUnit: '1s',
        },
    },
 };
 // One load-test iteration: POST the example payload to /generate, check the
 // response, and record the server-reported timings on success.
 export default function () {
    const response = http.post(`http://${host}/generate`, example.payload, {
        headers: {'Content-Type': 'application/json'},
    });
    const isOk = () => response.status === 200;

    check(response, {
        'Post status is 200': () => isOk(),
        // Only parse the body when the request succeeded.
        'Post response generated tokens': () => {
            if (!isOk()) {
                return false;
            }
            return response.json().details.generated_tokens === example.generated_tokens;
        },
    });

    if (!isOk()) {
        return;
    }

    // Feed each timing header into its matching trend, in the original order.
    const timings = [
        [totalTime, "X-Total-Time"],
        [validationTime, "X-Validation-Time"],
        [queueTime, "X-Queue-Time"],
        [inferenceTime, "X-Inference-Time"],
        [timePerToken, "X-Time-Per-Token"],
    ];
    for (const [trend, header] of timings) {
        trend.add(response.headers[header]);
    }
 }
--- a/router/client/src/client.rs
+++ b/router/client/src/client.rs
@ -110,6 +110,7 @@ impl Client {
        max_prefill_tokens: u32,
        max_total_tokens: u32,
        max_batch_size: Option<usize>,
        model_id: &str
    ) -> Result<Option<u32>> {
        let warmup_enabled: bool = env::var("WARMUP_ENABLED").ok().map_or(true, |value| value.to_lowercase() == "true");
        if !warmup_enabled {
@ -152,25 +153,76 @@ impl Client {
        let mut batch_counter: u64 = 0;
        let mut request_counter: u64 = 0;
-        for shape in shapes.iter() {
+        if model_id.contains("llava") {
-            let (batch_size, seq_length) = shape;
+            let mut n_tokens = 0;
-            let mut batches: Vec<Batch> = vec![
+            let mut requests = Vec::new();
-                self.create_warmup_batch(
+            // Create requests
-                    *shape,
+            while n_tokens < max_prefill_tokens {
-                    &mut batch_counter,
+                let truncate = cmp::min(max_input_length, max_prefill_tokens - n_tokens);
-                    &mut request_counter,
+
-                    max_input_length,
+                let mut inputs = String::new();
-                    max_total_tokens,
+                inputs.push_str("![](data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=)");
-                    seq_bucket_size,
+                inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
-                    false,
+
-                    None,
+                requests.push(Request {
-                )
+                    id: 0,
-            ];
+                    // We truncate the input on the server side to be sure that it has the correct size
-            // if possible, create second batch in order to trigger concatenate operation
+                    inputs,
-            if *batch_size < max_decode_batch_size {
+                    truncate,
-                batches.push(
+                    // Set sampling parameters to also take these ops into account in the max memory
                    parameters: Some(NextTokenChooserParameters {
                        temperature: 0.9,
                        top_k: 10,
                        top_p: 0.9,
                        typical_p: 0.9,
                        do_sample: false,
                        seed: 0,
                        repetition_penalty: 1.2,
                        frequency_penalty: 0.1,
                        watermark: true,
                        grammar: String::new(),
                        grammar_type: GrammarType::None as i32,
                    }),
                    stopping_parameters: Some(StoppingCriteriaParameters {
                        max_new_tokens: max_total_tokens - truncate,
                        stop_sequences: vec![],
                        ignore_eos_token: true,
                    }),
                    prefill_logprobs: true,
                    top_n_tokens: 20,
                });
                n_tokens += max_input_length;
                // Check max_batch_size
                if Some(requests.len()) == max_batch_size {
                    break;
                }
            }
            let mut batches = Vec::new();
            batches.push(Batch {
                id: 0,
                size: requests.len() as u32,
                requests,
                max_tokens: 0,
            });
            let request = tonic::Request::new(WarmupRequest {
                batches,
                max_input_length,
                max_prefill_tokens,
                max_total_tokens,
            })
            .inject_context();
            let response = self.stub.warmup(request).await?.into_inner();
            Ok(response.max_supported_total_tokens)
        } 
        else {
            for shape in shapes.iter() {
                let (batch_size, seq_length) = shape;
                let mut batches: Vec<Batch> = vec![
                    self.create_warmup_batch(
-                        (1, *seq_length),
+                        *shape,
                        &mut batch_counter,
                        &mut request_counter,
                        max_input_length,
@ -179,56 +231,45 @@ impl Client {
                        false,
                        None,
                    )
-                );
+                ];
                // if possible, create second batch in order to trigger concatenate operation
                if *batch_size < max_decode_batch_size {
                    batches.push(
                        self.create_warmup_batch(
                            (1, *seq_length),
                            &mut batch_counter,
                            &mut request_counter,
                            max_input_length,
                            max_total_tokens,
                            seq_bucket_size,
                            false,
                            None,
                        )
                    );
                }
                let request = tonic::Request::new(WarmupRequest {
                    batches,
                    max_input_length,
                    max_prefill_tokens,
                    max_total_tokens,
                }).inject_context();
                let _response = self.stub.warmup(request).await?.into_inner();
            }
-            let request = tonic::Request::new(WarmupRequest {
+            // send batches to warmup all possible decode shapes
-                batches,
+            if decode_batch_sizes.len() > 1 {
-                max_input_length,
+                let steps_per_bucket: u32 = if decode_bucket_size <= max_prefill_batch_size {
-                max_prefill_tokens,
+                    decode_bucket_size
-                max_total_tokens,
+                } else {
-            }).inject_context();
+                    decode_bucket_size.div_ceil(max_prefill_batch_size)
-            let _response = self.stub.warmup(request).await?.into_inner();
+                };
-        }
+                let max_new_tokens: u32 = 2 * decode_batch_sizes.len() as u32 * steps_per_bucket;
-        // send batches to warmup all possible decode shapes
+                let mut requests_send: u32 = cmp::min(max_prefill_batch_size, decode_bucket_size);
-        if decode_batch_sizes.len() > 1 {
+                let mut batches: Vec<Batch> = vec![
            let steps_per_bucket: u32 = if decode_bucket_size <= max_prefill_batch_size {
                decode_bucket_size
            } else {
                decode_bucket_size.div_ceil(max_prefill_batch_size)
            };
            let max_new_tokens: u32 = 2 * decode_batch_sizes.len() as u32 * steps_per_bucket;
            let mut requests_send: u32 = cmp::min(max_prefill_batch_size, decode_bucket_size);
            let mut batches: Vec<Batch> = vec![
                self.create_warmup_batch(
                    (requests_send, seq_bucket_size),
                    &mut batch_counter,
                    &mut request_counter,
                    max_input_length,
                    max_total_tokens,
                    seq_bucket_size,
                    false,
                    Some(max_new_tokens),
                )
            ];
            let get_current_decode_batch_size = |num: u32| -> u32 {
                decode_batch_sizes.iter()
                    .filter(|&&x| x >= num)
                    .min()
                    .copied()
                    .unwrap()
            };
            let mut current_decode_batch_size: u32 = get_current_decode_batch_size(requests_send);
            while current_decode_batch_size < max_decode_batch_size {
                let distance_to_next_bucket = current_decode_batch_size + decode_bucket_size - requests_send;
                let num_requests: u32 = cmp::min(distance_to_next_bucket, max_prefill_batch_size);
                batches.push(
                    self.create_warmup_batch(
-                        (num_requests, seq_bucket_size),
+                        (requests_send, seq_bucket_size),
                        &mut batch_counter,
                        &mut request_counter,
                        max_input_length,
@ -237,48 +278,74 @@ impl Client {
                        false,
                        Some(max_new_tokens),
                    )
-                );
+                ];
-                requests_send += num_requests;
+                let get_current_decode_batch_size = |num: u32| -> u32 {
-                current_decode_batch_size = get_current_decode_batch_size(requests_send);
+                    decode_batch_sizes.iter()
                        .filter(|&&x| x >= num)
                        .min()
                        .copied()
                        .unwrap()
                };
                let mut current_decode_batch_size: u32 = get_current_decode_batch_size(requests_send);
                while current_decode_batch_size < max_decode_batch_size {
                    let distance_to_next_bucket = current_decode_batch_size + decode_bucket_size - requests_send;
                    let num_requests: u32 = cmp::min(distance_to_next_bucket, max_prefill_batch_size);
                    batches.push(
                        self.create_warmup_batch(
                            (num_requests, seq_bucket_size),
                            &mut batch_counter,
                            &mut request_counter,
                            max_input_length,
                            max_total_tokens,
                            seq_bucket_size,
                            false,
                            Some(max_new_tokens),
                        )
                    );
                    requests_send += num_requests;
                    current_decode_batch_size = get_current_decode_batch_size(requests_send);
                }
                let request = tonic::Request::new(WarmupRequest {
                    batches,
                    max_input_length,
                    max_prefill_tokens,
                    max_total_tokens,
                }).inject_context();
                let _response = self.stub.warmup(request).await?.into_inner();
            }
-            let request = tonic::Request::new(WarmupRequest {
+            // send batches with default params to warm up Greedy search
-                batches,
+            let mut greedy_shapes: Vec<(u32, u32)> = Vec::with_capacity(prefill_batch_sizes.len());
-                max_input_length,
+            for batch_size in &prefill_batch_sizes {
-                max_prefill_tokens,
+                greedy_shapes.push((*batch_size, seq_bucket_size.clone()));
-                max_total_tokens,
+            }
-            }).inject_context();
+            for greedy_shape in greedy_shapes.iter() {
-            let _response = self.stub.warmup(request).await?.into_inner();
+                let batches: Vec<Batch> = vec![
-        }
+                    self.create_warmup_batch(
-
+                        *greedy_shape,
-        // send batches with default params to warm up Greedy search
+                        &mut batch_counter,
-        let mut greedy_shapes: Vec<(u32, u32)> = Vec::with_capacity(prefill_batch_sizes.len());
+                        &mut request_counter,
-        for batch_size in &prefill_batch_sizes {
+                        max_input_length,
-            greedy_shapes.push((*batch_size, seq_bucket_size.clone()));
+                        max_total_tokens,
-        }
+                        seq_bucket_size,
-        for greedy_shape in greedy_shapes.iter() {
+                        true,
-            let batches: Vec<Batch> = vec![
+                        None,
-                self.create_warmup_batch(
+                    )
-                    *greedy_shape,
+                ];
-                    &mut batch_counter,
+                let request = tonic::Request::new(WarmupRequest {
-                    &mut request_counter,
+                    batches,
                    max_input_length,
                    max_prefill_tokens,
                    max_total_tokens,
-                    seq_bucket_size,
+                }).inject_context();
-                    true,
+                let _response = self.stub.warmup(request).await?.into_inner();
-                    None,
+            }
-                )
+            Ok(None) // No support for maximum total tokens
            ];
            let request = tonic::Request::new(WarmupRequest {
                batches,
                max_input_length,
                max_prefill_tokens,
                max_total_tokens,
            }).inject_context();
            let _response = self.stub.warmup(request).await?.into_inner();
        }
        Ok(None) // No support for maximum total tokens
    }
    #[instrument(skip_all)]
--- a/router/client/src/sharded_client.rs
+++ b/router/client/src/sharded_client.rs
@ -100,6 +100,7 @@ impl ShardedClient {
        max_prefill_tokens: u32,
        max_total_tokens: u32,
        max_batch_size: Option<usize>,
        model_id: &str,
    ) -> Result<Option<u32>> {
        let futures: Vec<_> = self
            .clients
@ -110,6 +111,7 @@ impl ShardedClient {
                    max_prefill_tokens,
                    max_total_tokens,
                    max_batch_size,
                    model_id
                ))
            })
            .collect();
--- a/router/grpc-metadata/src/lib.rs
+++ b/router/grpc-metadata/src/lib.rs
@ -2,30 +2,9 @@
 //! Inspired by: https://github.com/open-telemetry/opentelemetry-rust gRPC examples
 use opentelemetry::global;
-use opentelemetry::propagation::{Extractor, Injector};
+use opentelemetry::propagation::Injector;
 use tracing_opentelemetry::OpenTelemetrySpanExt;
 /// Extract context metadata from a gRPC request's metadata
 struct MetadataExtractor<'a>(pub &'a tonic::metadata::MetadataMap);
 impl<'a> Extractor for MetadataExtractor<'a> {
    /// Look up `key` in the wrapped MetadataMap; yields None when the key is
    /// absent or its value cannot be viewed as a `&str`.
    fn get(&self, key: &str) -> Option<&str> {
        let value = self.0.get(key)?;
        value.to_str().ok()
    }
    /// List every key of the wrapped MetadataMap as a string slice.
    fn keys(&self) -> Vec<&str> {
        use tonic::metadata::KeyRef;

        let mut names = Vec::new();
        for key in self.0.keys() {
            // Both key flavours expose their name the same way.
            let name = match key {
                KeyRef::Ascii(k) => k.as_str(),
                KeyRef::Binary(k) => k.as_str(),
            };
            names.push(name);
        }
        names
    }
 }
 /// Inject context in the metadata of a gRPC request.
 struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap);
--- a/router/src/config.rs
+++ b/router/src/config.rs
@ -100,7 +100,6 @@ impl LlavaNext {
 }
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(tag = "model_type")]
 #[serde(rename_all = "snake_case")]
 pub struct ClipVisionModel {
    image_size: usize,
@ -108,7 +107,6 @@ pub struct ClipVisionModel {
 }
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(tag = "model_type")]
 #[serde(rename_all = "snake_case")]
 pub struct Idefics2 {}
@ -118,6 +116,24 @@ impl Idefics2 {
    }
 }
 /// Nested `text_config` section of a [`Paligemma`] configuration.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub struct PaliTextConfig {
    // Returned as-is by `Paligemma::get_number_of_features`.
    num_image_tokens: usize,
 }
 /// Payload carried by the `Config::Paligemma` variant.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub struct Paligemma {
    // Holds the image-token count that `get_number_of_features` reports.
    text_config: PaliTextConfig,
 }
 impl Paligemma {
    pub fn get_number_of_features(&self, _height: usize, _width: usize) -> usize {
        self.text_config.num_image_tokens
    }
 }
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(tag = "model_type")]
 #[serde(rename_all = "snake_case")]
@ -132,12 +148,15 @@ pub enum Config {
    Santacoder,
    Bloom,
    Mpt,
    Gpt2,
    GptNeox,
    Phi,
    #[serde(rename = "phi-msft")]
    PhiMsft,
    Phi3,
    Llama,
    Baichuan,
    Paligemma(Paligemma),
    Gemma,
    Cohere,
    Drbx,
--- a/router/src/infer.rs
+++ b/router/src/infer.rs
@ -4,7 +4,7 @@
 use crate::validation::{Validation, ValidationError};
 use crate::{
    ChatTemplateInputs, ChatTemplateVersions, Entry, GenerateRequest, GenerateStreamResponse,
-    HubTokenizerConfig, Message, PrefillToken, Queue, Token,
+    HubTokenizerConfig, Message, MessageChunk, PrefillToken, Queue, Text, TextMessage, Token,
 };
 use crate::{FunctionRef, FunctionsMap, GrammarType, Properties, Tool, ToolType, Tools};
 use futures::future::try_join_all;
@ -373,16 +373,15 @@ impl ChatTemplate {
        if self.use_default_tool_template {
            if let Some(last_message) = messages.last_mut() {
                if let Some((GrammarType::Json(tools), tool_prompt)) = grammar_with_prompt {
-                    last_message.content = Some(format!(
+                    last_message.content.push(MessageChunk::Text(Text {
-                        "{}\n---\n{}\n{}",
+                        text: format!("\n---\n{}\n{}", tool_prompt, tools),
-                        last_message.content.as_deref().unwrap_or_default(),
+                    }));
                        tool_prompt,
                        tools
                    ));
                }
            }
        }
        let messages: Vec<TextMessage> = messages.into_iter().map(|c| c.into()).collect();
        self.template
            .render(ChatTemplateInputs {
                messages,
@ -950,8 +949,7 @@ impl InferError {
 #[cfg(test)]
 mod tests {
    use crate::infer::raise_exception;
-    use crate::ChatTemplateInputs;
+    use crate::{ChatTemplateInputs, TextMessage};
    use crate::Message;
    use minijinja::Environment;
    #[test]
@ -985,29 +983,21 @@ mod tests {
        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("Hi!".to_string()),
+                    content: "Hi!".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "assistant".to_string(),
-                    content: Some("Hello how can I help?".to_string()),
+                    content: "Hello how can I help?".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("What is Deep Learning?".to_string()),
+                    content: "What is Deep Learning?".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "assistant".to_string(),
-                    content: Some("magic!".to_string()),
+                    content: "magic!".to_string(),
                    name: None,
                    tool_calls: None,
                },
            ],
            bos_token: Some("[BOS]"),
@ -1055,35 +1045,25 @@ mod tests {
        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("Hi!".to_string()),
+                    content: "Hi!".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("Hi again!".to_string()),
+                    content: "Hi again!".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "assistant".to_string(),
-                    content: Some("Hello how can I help?".to_string()),
+                    content: "Hello how can I help?".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("What is Deep Learning?".to_string()),
+                    content: "What is Deep Learning?".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "assistant".to_string(),
-                    content: Some("magic!".to_string()),
+                    content: "magic!".to_string(),
                    name: None,
                    tool_calls: None,
                },
            ],
            bos_token: Some("[BOS]"),
@ -1136,29 +1116,21 @@ mod tests {
        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("Hi!".to_string()),
+                    content: "Hi!".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "assistant".to_string(),
-                    content: Some("Hello how can I help?".to_string()),
+                    content: "Hello how can I help?".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("What is Deep Learning?".to_string()),
+                    content: "What is Deep Learning?".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "assistant".to_string(),
-                    content: Some("magic!".to_string()),
+                    content: "magic!".to_string(),
                    name: None,
                    tool_calls: None,
                },
            ],
            bos_token: Some("[BOS]"),
@ -1195,29 +1167,21 @@ mod tests {
        let chat_template_inputs = ChatTemplateInputs {
            messages: vec![
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("Hi!".to_string()),
+                    content: "Hi!".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "assistant".to_string(),
-                    content: Some("Hello how can I help?".to_string()),
+                    content: "Hello how can I help?".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "user".to_string(),
-                    content: Some("What is Deep Learning?".to_string()),
+                    content: "What is Deep Learning?".to_string(),
                    name: None,
                    tool_calls: None,
                },
-                Message {
+                TextMessage {
                    role: "assistant".to_string(),
-                    content: Some("magic!".to_string()),
+                    content: "magic!".to_string(),
                    name: None,
                    tool_calls: None,
                },
            ],
            bos_token: Some("[BOS]"),
@ -1240,34 +1204,24 @@ mod tests {
    #[test]
    fn test_many_chat_templates() {
        let example_chat = vec![
-            Message {
+            TextMessage {
                role: "user".to_string(),
-                content: Some("Hello, how are you?".to_string()),
+                content: "Hello, how are you?".to_string(),
                name: None,
                tool_calls: None,
            },
-            Message {
+            TextMessage {
                role: "assistant".to_string(),
-                content: Some("I'm doing great. How can I help you today?".to_string()),
+                content: "I'm doing great. How can I help you today?".to_string(),
                name: None,
                tool_calls: None,
            },
-            Message {
+            TextMessage {
                role: "user".to_string(),
-                content: Some("I'd like to show off how chat templating works!".to_string()),
+                content: "I'd like to show off how chat templating works!".to_string(),
                name: None,
                tool_calls: None,
            },
        ];
-        let example_chat_with_system = vec![Message {
+        let example_chat_with_system = [TextMessage {
            role: "system".to_string(),
-            content: Some(
+            content: "You are a friendly chatbot who always responds in the style of a pirate"
-                "You are a friendly chatbot who always responds in the style of a pirate"
+                .to_string(),
                    .to_string(),
            ),
            name: None,
            tool_calls: None,
        }]
        .iter()
        .chain(&example_chat)
@ -1384,7 +1338,7 @@ mod tests {
        {
            let mut env = Environment::new();
            env.add_function("raise_exception", raise_exception);
-            let tmpl = env.template_from_str(&chat_template);
+            let tmpl = env.template_from_str(chat_template);
            let result = tmpl.unwrap().render(input).unwrap();
            assert_eq!(result, target);
        }
@ -1407,17 +1361,13 @@ mod tests {
                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
                input: ChatTemplateInputs {
                    messages: vec![
-                        Message {
+                        TextMessage{
                            role: "system".to_string(),
-                            content: Some("You are a friendly chatbot who always responds in the style of a pirate".to_string()),
+                            content: "You are a friendly chatbot who always responds in the style of a pirate".to_string(),
                            name: None,
                            tool_calls: None,
                        },
-                        Message {
+                        TextMessage{
                            role: "user".to_string(),
-                            content: Some("How many helicopters can a human eat in one sitting?".to_string()),
+                            content: "How many helicopters can a human eat in one sitting?".to_string(),
                            name: None,
                            tool_calls: None,
                        },
                    ],
                    add_generation_prompt: true,
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -11,6 +11,7 @@ use queue::{Entry, Queue};
 use serde::{Deserialize, Serialize};
 use tokio::sync::OwnedSemaphorePermit;
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::warn;
 use utoipa::ToSchema;
 use validation::Validation;
@ -159,6 +160,8 @@ pub struct Info {
    #[schema(example = "32")]
    pub max_client_batch_size: usize,
    /// Router Info
    #[schema(example = "text-generation-router")]
    pub router: &'static str,
    #[schema(example = "0.5.0")]
    pub version: &'static str,
    #[schema(nullable = true, example = "null")]
@ -399,6 +402,11 @@ pub struct CompletionRequest {
    #[serde(default)]
    #[schema(example = "1.0")]
    pub frequency_penalty: Option<f32>,
    /// Up to 4 sequences where the API will stop generating further tokens.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stop: Option<Vec<String>>,
 }
 #[derive(Clone, Deserialize, Serialize, ToSchema, Default)]
@ -438,7 +446,7 @@ pub(crate) struct ChatCompletion {
 #[derive(Clone, Deserialize, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionComplete {
    pub index: u32,
-    pub message: Message,
+    pub message: OutputMessage,
    pub logprobs: Option<ChatCompletionLogprobs>,
    pub finish_reason: String,
 }
@ -531,6 +539,30 @@ impl ChatCompletion {
        return_logprobs: bool,
        tool_calls: Option<Vec<ToolCall>>,
    ) -> Self {
        let message = match (output, tool_calls) {
            (Some(content), None) => OutputMessage::ChatMessage(TextMessage {
                role: "assistant".into(),
                content,
            }),
            (None, Some(tool_calls)) => OutputMessage::ToolCall(ToolCallMessage {
                role: "assistant".to_string(),
                tool_calls,
            }),
            (Some(output), Some(_)) => {
                warn!("Received both chat and tool call");
                OutputMessage::ChatMessage(TextMessage {
                    role: "assistant".into(),
                    content: output,
                })
            }
            (None, None) => {
                warn!("Didn't receive an answer");
                OutputMessage::ChatMessage(TextMessage {
                    role: "assistant".into(),
                    content: "".to_string(),
                })
            }
        };
        Self {
            id: String::new(),
            object: "text_completion".into(),
@ -539,12 +571,7 @@ impl ChatCompletion {
            system_fingerprint,
            choices: vec![ChatCompletionComplete {
                index: 0,
-                message: Message {
+                message,
                    role: "assistant".into(),
                    content: output,
                    name: None,
                    tool_calls,
                },
                logprobs: return_logprobs
                    .then(|| ChatCompletionLogprobs::from((details.tokens, details.top_tokens))),
                finish_reason: details.finish_reason.to_string(),
@ -566,7 +593,8 @@ pub(crate) struct CompletionCompleteChunk {
    pub model: String,
    pub system_fingerprint: String,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema)]
+
 #[derive(Clone, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionChunk {
    pub id: String,
    pub object: String,
@ -578,7 +606,7 @@ pub(crate) struct ChatCompletionChunk {
    pub choices: Vec<ChatCompletionChoice>,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema)]
+#[derive(Clone, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionChoice {
    pub index: u32,
    pub delta: ChatCompletionDelta,
@ -586,21 +614,21 @@ pub(crate) struct ChatCompletionChoice {
    pub finish_reason: Option<String>,
 }
-#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)]
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
-pub(crate) struct ChatCompletionDelta {
+pub struct ToolCallDelta {
-    #[schema(example = "user")]
+    #[schema(example = "assistant")]
-    // TODO Modify this to a true enum.
+    role: String,
-    #[serde(default, skip_serializing_if = "Option::is_none")]
+    tool_calls: DeltaToolCall,
    pub role: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[schema(example = "What is Deep Learning?")]
    pub content: Option<String>,
    // default to None
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tool_calls: Option<DeltaToolCall>,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
+#[derive(Clone, Debug, Serialize, ToSchema)]
 #[serde(untagged)]
 enum ChatCompletionDelta {
    Chat(TextMessage),
    Tool(ToolCallDelta),
 }
 #[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
 pub(crate) struct DeltaToolCall {
    pub index: u32,
    pub id: String,
@ -608,7 +636,7 @@ pub(crate) struct DeltaToolCall {
    pub function: Function,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
+#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
 pub(crate) struct Function {
    pub name: Option<String>,
    pub arguments: String,
@ -626,15 +654,13 @@ impl ChatCompletionChunk {
        finish_reason: Option<String>,
    ) -> Self {
        let delta = match (delta, tool_calls) {
-            (Some(delta), _) => ChatCompletionDelta {
+            (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
-                role: Some("assistant".to_string()),
+                role: "assistant".to_string(),
-                content: Some(delta),
+                content: delta,
-                tool_calls: None,
+            }),
-            },
+            (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta {
-            (None, Some(tool_calls)) => ChatCompletionDelta {
+                role: "assistant".to_string(),
-                role: Some("assistant".to_string()),
+                tool_calls: DeltaToolCall {
                content: None,
                tool_calls: Some(DeltaToolCall {
                    index: 0,
                    id: String::new(),
                    r#type: "function".to_string(),
@ -642,13 +668,12 @@ impl ChatCompletionChunk {
                        name: None,
                        arguments: tool_calls[0].to_string(),
                    },
-                }),
+                },
-            },
+            }),
-            (None, None) => ChatCompletionDelta {
+            (None, None) => ChatCompletionDelta::Chat(TextMessage {
-                role: None,
+                role: "assistant".to_string(),
-                content: None,
+                content: "".to_string(),
-                tool_calls: None,
+            }),
            },
        };
        Self {
            id: String::new(),
@ -849,7 +874,7 @@ where
    state.end()
 }
-#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default)]
+#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default, PartialEq)]
 pub(crate) struct FunctionDefinition {
    #[serde(default)]
    pub description: Option<String>,
@ -869,7 +894,7 @@ pub(crate) struct Tool {
 #[derive(Clone, Serialize, Deserialize, Default)]
 pub(crate) struct ChatTemplateInputs<'a> {
-    messages: Vec<Message>,
+    messages: Vec<TextMessage>,
    bos_token: Option<&'a str>,
    eos_token: Option<&'a str>,
    add_generation_prompt: bool,
@ -877,88 +902,113 @@ pub(crate) struct ChatTemplateInputs<'a> {
    tools_prompt: Option<&'a str>,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]
+#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)]
 pub(crate) struct ToolCall {
-    pub id: u32,
+    pub id: String,
    pub r#type: String,
    pub function: FunctionDefinition,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
-pub(crate) struct Text {
+struct Url {
-    #[serde(default)]
+    url: String,
    pub text: String,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
-pub(crate) struct ImageUrl {
+struct ImageUrl {
-    #[serde(default)]
+    image_url: Url,
    pub url: String,
 }
-#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug)]
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
-pub(crate) struct Content {
+struct Text {
-    pub r#type: String,
+    text: String,
 }
 /// One part of a multi-part chat message `content` array.
 ///
 /// Serialized as an internally tagged enum: the `type` field selects the
 /// variant and, because of `rename_all = "snake_case"`, takes the values
 /// `"text"` or `"image_url"`. The remaining fields of the JSON object are
 /// those of the wrapped struct.
 #[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
 #[serde(tag = "type")]
 #[serde(rename_all = "snake_case")]
 enum MessageChunk {
    /// Plain text part, e.g. `{"type": "text", "text": "..."}`.
    Text(Text),
    /// Image reference part, e.g. `{"type": "image_url", "image_url": {"url": "..."}}`.
    ImageUrl(ImageUrl),
 }
 #[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
 pub struct Message {
    #[schema(example = "user")]
    role: String,
    #[schema(example = "My name is David and I")]
    #[serde(deserialize_with = "message_content_serde::deserialize")]
    content: Vec<MessageChunk>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub text: Option<String>,
+    #[schema(example = "\"David\"")]
-    #[serde(default, skip_serializing_if = "Option::is_none")]
+    name: Option<String>,
    pub image_url: Option<ImageUrl>,
 }
 mod message_content_serde {
    use super::*;
-    use serde::de;
+    use serde::{Deserialize, Deserializer};
    use serde::Deserializer;
    use serde_json::Value;
-    pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
+    pub fn deserialize<'de, D>(deserializer: D) -> Result<Vec<MessageChunk>, D::Error>
    where
        D: Deserializer<'de>,
    {
-        let value = Value::deserialize(deserializer)?;
+        #[derive(Deserialize)]
-        match value {
+        #[serde(untagged)]
-            Value::String(s) => Ok(Some(s)),
+        enum Message {
-            Value::Array(arr) => {
+            Text(String),
-                let results: Result<Vec<String>, _> = arr
+            Chunks(Vec<MessageChunk>),
-                    .into_iter()
+        }
-                    .map(|v| {
+        let message: Message = Deserialize::deserialize(deserializer)?;
-                        let content: Content =
+        let chunks = match message {
-                            serde_json::from_value(v).map_err(de::Error::custom)?;
+            Message::Text(text) => {
-                        match content.r#type.as_str() {
+                vec![MessageChunk::Text(Text { text })]
                            "text" => Ok(content.text.unwrap_or_default()),
                            "image_url" => {
                                if let Some(url) = content.image_url {
                                    Ok(format!("![]({})", url.url))
                                } else {
                                    Ok(String::new())
                                }
                            }
                            _ => Err(de::Error::custom("invalid content type")),
                        }
                    })
                    .collect();
                results.map(|strings| Some(strings.join("")))
            }
-            Value::Null => Ok(None),
+            Message::Chunks(s) => s,
-            _ => Err(de::Error::custom("invalid token format")),
+        };
        Ok(chunks)
    }
 }
 /// A chat message whose content is a single plain string.
 ///
 /// This is the flattened counterpart of `Message`: it is what the chat
 /// template receives (`ChatTemplateInputs::messages`) and what is returned
 /// as the text variant of `OutputMessage`.
 #[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
 pub struct TextMessage {
    /// Author of the message (e.g. "user", "system", "assistant").
    #[schema(example = "user")]
    pub role: String,
    /// Full textual content of the message.
    #[schema(example = "My name is David and I")]
    pub content: String,
 }
 impl From<Message> for TextMessage {
    fn from(value: Message) -> Self {
        TextMessage {
            role: value.role,
            content: value
                .content
                .into_iter()
                .map(|c| match c {
                    MessageChunk::Text(Text { text }) => text,
                    MessageChunk::ImageUrl(image) => {
                        let url = image.image_url.url;
                        format!("![]({url})")
                    }
                })
                .collect::<Vec<_>>()
                .join(""),
        }
    }
 }
-#[derive(Clone, Deserialize, ToSchema, Serialize, Debug)]
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
-pub(crate) struct Message {
+pub struct ToolCallMessage {
-    #[schema(example = "user")]
+    #[schema(example = "assistant")]
-    pub role: String,
+    role: String,
-    #[serde(skip_serializing_if = "Option::is_none")]
+    tool_calls: Vec<ToolCall>,
-    #[schema(example = "My name is David and I")]
+}
-    #[serde(deserialize_with = "message_content_serde::deserialize")]
+
-    pub content: Option<String>,
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
-    #[serde(default, skip_serializing_if = "Option::is_none")]
+#[serde(untagged)]
-    #[schema(example = "\"David\"")]
+pub(crate) enum OutputMessage {
-    pub name: Option<String>,
+    ChatMessage(TextMessage),
-    #[serde(default, skip_serializing_if = "Option::is_none")]
+    ToolCall(ToolCallMessage),
    pub tool_calls: Option<Vec<ToolCall>>,
 }
 #[derive(Clone, Debug, Deserialize, ToSchema)]
@ -1121,7 +1171,7 @@ pub(crate) struct ErrorResponse {
 #[cfg(test)]
 mod tests {
    use super::*;
-
+    use serde_json::json;
    use tokenizers::Tokenizer;
    pub(crate) async fn get_tokenizer() -> Tokenizer {
@ -1189,4 +1239,100 @@ mod tests {
        );
        assert_eq!(config.eos_token, Some("<｜end▁of▁sentence｜>".to_string()));
    }
    #[test]
    fn test_chat_simple_string() {
        // A bare string `content` must deserialize into a single text chunk.
        let payload = json!({
            "model": "",
            "messages": [{
                "role": "user",
                "content": "What is Deep Learning?"
            }]
        });
        let raw = payload.to_string();
        let parsed: ChatRequest = serde_json::from_str(&raw).unwrap();

        let expected = Message {
            role: "user".to_string(),
            content: vec![MessageChunk::Text(Text {
                text: "What is Deep Learning?".to_string(),
            })],
            name: None,
        };
        assert_eq!(parsed.messages[0], expected);
    }
    #[test]
    fn test_chat_request() {
        // An array `content` must deserialize chunk by chunk, preserving order
        // and distinguishing text parts from image parts via the `type` tag.
        let payload = json!({
            "model": "",
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": "Whats in this image?"},
                    {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"}},
                ]
            }]
        });
        let raw = payload.to_string();
        let parsed: ChatRequest = serde_json::from_str(&raw).unwrap();

        let text_chunk = MessageChunk::Text(Text {
            text: "Whats in this image?".to_string(),
        });
        let image_chunk = MessageChunk::ImageUrl(ImageUrl {
            image_url: Url {
                url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string(),
            },
        });
        let expected = Message {
            role: "user".to_string(),
            content: vec![text_chunk, image_chunk],
            name: None,
        };
        assert_eq!(parsed.messages[0], expected);
    }
    #[test]
    fn text_message_convert() {
        // Converting to `TextMessage` concatenates the text chunk with the
        // image rendered as a markdown image link.
        let text_chunk = MessageChunk::Text(Text {
            text: "Whats in this image?".to_string(),
        });
        let image_chunk = MessageChunk::ImageUrl(ImageUrl {
            image_url: Url {
                url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string(),
            },
        });
        let message = Message {
            role: "user".to_string(),
            content: vec![text_chunk, image_chunk],
            name: None,
        };

        let textmsg = TextMessage::from(message);
        assert_eq!(textmsg.content, "Whats in this image?![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)");
    }
    #[test]
    fn openai_output() {
        // Text answer: the untagged enum serializes as the bare inner struct.
        let chat = OutputMessage::ChatMessage(TextMessage {
            role: "assistant".to_string(),
            content: "This is the answer".to_string(),
        });
        let chat_json = serde_json::to_string(&chat).unwrap();
        assert_eq!(
            chat_json,
            r#"{"role":"assistant","content":"This is the answer"}"#
        );

        // Tool call answer: `tool_calls` replaces `content` in the output.
        let call = ToolCall {
            id: "0".to_string(),
            r#type: "function".to_string(),
            function: FunctionDefinition {
                description: None,
                name: "myfn".to_string(),
                arguments: json!({
                    "format": "csv"
                }),
            },
        };
        let tool = OutputMessage::ToolCall(ToolCallMessage {
            role: "assistant".to_string(),
            tool_calls: vec![call],
        });
        let tool_json = serde_json::to_string(&tool).unwrap();
        assert_eq!(
            tool_json,
            r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":{"format":"csv"}}}]}"#
        );
    }
 }
--- a/router/src/main.rs
+++ b/router/src/main.rs
@ -23,7 +23,7 @@ use tokenizers::Tokenizer;
 use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
-use tracing_subscriber::{EnvFilter, Layer};
+use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};
 /// App Configuration
 #[derive(Parser, Debug)]
@ -349,6 +349,7 @@ async fn main() -> Result<(), RouterError> {
            max_batch_prefill_tokens,
            max_total_tokens as u32,
            max_batch_size,
            &model_info.model_id
        )
        .await
        .map_err(RouterError::Warmup)?
@ -482,8 +483,21 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
    }
    // Filter events with LOG_LEVEL
-    let env_filter =
+    let varname = "LOG_LEVEL";
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
        // Override to avoid simple logs being spammed with tokio-level information
        let log_level = match &log_level[..] {
            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
            "info" => "text_generation_launcher=info,text_generation_router=info",
            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
            log_level => log_level,
        };
        EnvFilter::builder()
            .with_default_directive(LevelFilter::INFO.into())
            .parse_lossy(log_level)
    } else {
        EnvFilter::new("info")
    };
    tracing_subscriber::registry()
        .with(env_filter)
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -599,9 +599,22 @@ async fn completions(
    let span = tracing::Span::current();
    metrics::increment_counter!("tgi_request_count");
-    let stream = req.stream;
+    let CompletionRequest {
-    let max_new_tokens = req.max_tokens.or(Some(100));
+        max_tokens,
-    let seed = req.seed;
+        seed,
        stop,
        stream,
        temperature,
        ..
    } = req;
    let max_new_tokens = max_tokens.or(Some(100));
    let stop = stop.unwrap_or_default();
    // enable greedy only when temperature is 0
    let (do_sample, temperature) = match temperature {
        Some(temperature) if temperature == 0.0 => (false, None),
        other => (true, other),
    };
    // if suffix is present throw an error
    if req.suffix.is_some() {
@ -637,16 +650,16 @@ async fn completions(
            inputs: prompt.to_string(),
            parameters: GenerateParameters {
                best_of: None,
-                temperature: req.temperature,
+                temperature,
                repetition_penalty: req.repetition_penalty,
                frequency_penalty: req.frequency_penalty,
                top_k: None,
                top_p: req.top_p,
                typical_p: None,
-                do_sample: true,
+                do_sample,
                max_new_tokens,
                return_full_text: None,
-                stop: Vec::new(),
+                stop: stop.clone(),
                truncate: None,
                watermark: false,
                details: true,
@ -698,7 +711,7 @@ async fn completions(
                            model: model_id.clone(),
                            system_fingerprint: system_fingerprint.clone(),
                        })
-                        .map_or_else(|_e| Event::default(), |data| data)
+                        .unwrap_or_else(|_e| Event::default())
                };
                let (header_tx, header_rx) = oneshot::channel();
@ -990,7 +1003,6 @@ async fn chat_completions(
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    metrics::increment_counter!("tgi_request_count");
    let ChatRequest {
        logprobs,
        max_tokens,
@ -1124,13 +1136,10 @@ async fn chat_completions(
                    logprobs,
                    stream_token.details.map(|d| d.finish_reason.to_string()),
                ))
-                .map_or_else(
+                .unwrap_or_else(|e| {
-                    |e| {
+                    println!("Failed to serialize ChatCompletionChunk: {:?}", e);
-                        println!("Failed to serialize ChatCompletionChunk: {:?}", e);
+                    Event::default()
-                        Event::default()
+                })
                    },
                    |data| data,
                )
        };
        let (headers, response_stream) = generate_stream_internal(
@ -1165,7 +1174,7 @@ async fn chat_completions(
                    )
                })?;
            let tool_calls = vec![ToolCall {
-                id: 0,
+                id: "0".to_string(),
                r#type: "function".to_string(),
                function: FunctionDefinition {
                    description: None,
@ -1568,6 +1577,7 @@ pub async fn run(
        max_batch_size,
        validation_workers,
        max_client_batch_size,
        router: env!("CARGO_PKG_NAME"),
        version: env!("CARGO_PKG_VERSION"),
        sha: option_env!("VERGEN_GIT_SHA"),
        docker_label: option_env!("DOCKER_LABEL"),
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -565,6 +565,30 @@ fn prepare_input(
            inputs = modified_inputs;
            tokenizer_query
        }
        Some(Config::Paligemma(config)) => {
            // Build two strings in parallel from `inputs`:
            //  * `modified_inputs`: the prompt with each match of `RE` replaced
            //    by the URI returned from `fetch_image`;
            //  * `tokenizer_query`: the prompt with each match replaced by
            //    `<image>` repeated once per feature slot, as reported by
            //    `config.get_number_of_features(height, width)`.
            // Text between matches is copied verbatim into both.
            let mut modified_inputs = String::with_capacity(inputs.len());
            let mut tokenizer_query = String::with_capacity(inputs.len());
            // Byte offset of the first character not yet copied.
            let mut start = 0;
            for chunk in RE.find_iter(&inputs) {
                let chunk_start = chunk.start();
                let chunk_end = chunk.end();
                if chunk_start != start {
                    modified_inputs.push_str(&inputs[start..chunk_start]);
                    tokenizer_query.push_str(&inputs[start..chunk_start]);
                }
                let (image_uri, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?;
                let slots = config.get_number_of_features(height, width);
                tokenizer_query.push_str(&"<image>".repeat(slots));
                modified_inputs.push_str(&image_uri);
                start = chunk_end;
            }
            // Copy whatever follows the last match. The previous guard,
            // `start != inputs.len() - 1`, skipped the copy when exactly one
            // byte remained (dropping it) and underflowed on empty input.
            if start < inputs.len() {
                modified_inputs.push_str(&inputs[start..]);
                tokenizer_query.push_str(&inputs[start..]);
            }
            inputs = modified_inputs;
            tokenizer_query
        }
        Some(Config::Idefics2(config)) => {
            let mut modified_inputs = String::with_capacity(inputs.len());
            let mut tokenizer_query = String::with_capacity(inputs.len());
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@ -1,6 +1,5 @@
 [toolchain]
-# Released on: 28 December, 2023
+# Released on: 02 May, 2024
-# Branched from master on: 10 November, 2023
+# https://releases.rs/docs/1.78.0/
-# https://releases.rs/docs/1.75.0/
+channel = "1.78.0"
 channel = "1.75.0"
 components = ["rustfmt", "clippy"]
--- a/server/Makefile-flash-att-v2
+++ b/server/Makefile-flash-att-v2
@ -1,5 +1,5 @@
 flash_att_v2_commit_cuda := 23e8fa5a263d1c7122bc46a86ef32030ee7130f9
-flash_att_v2_commit_rocm := 8736558c287ff2ef28b24878e42828c595ac3e69
+flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6
 flash-attention-v2-cuda:
@ -18,12 +18,12 @@ install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
 flash-attention-v2-rocm:
  # Clone flash attention
 	pip install -U packaging ninja  --no-cache-dir
-	git clone https://github.com/fxmarty/flash-attention-rocm flash-attention-v2
+	git clone https://github.com/ROCm/flash-attention.git flash-attention-v2
 build-flash-attention-v2-rocm: flash-attention-v2-rocm
 	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm)
 	cd flash-attention-v2 && git submodule update --init --recursive
-	cd flash-attention-v2 && PYTORCH_ROCM_ARCH=gfx90a python setup.py build
+	cd flash-attention-v2 && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
 install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
 	cd flash-attention-v2 && git submodule update --init --recursive && python setup.py install
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@ -14,11 +14,11 @@ install-vllm-cuda: build-vllm-cuda
 vllm-rocm:
    # Clone vllm
 	pip install -U ninja packaging --no-cache-dir
-	git clone https://github.com/fxmarty/vllm-public.git vllm
+	git clone https://github.com/fxmarty/rocm-vllm.git vllm
 build-vllm-rocm: vllm-rocm
-	cd vllm && git fetch && git checkout ad9b7c4095ef54419a0533d254f2ad84bd2dfcae
+	cd vllm && git fetch && git checkout ca6913b3c2ffacdcb7d15e914dc34adbc6c89479
-	cd vllm && python setup.py build
+	cd vllm && PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install
 install-vllm-rocm: build-vllm-rocm
 	pip uninstall vllm -y || true
--- a/server/exllama_kernels/exllama_kernels/hip_compat.cuh
+++ b/server/exllama_kernels/exllama_kernels/hip_compat.cuh
@ -10,8 +10,9 @@ __device__ __forceinline__ __half __compat_hrcp(__half x) {
 }
 __device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
-    return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
+    return _Float16_2{
-        static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
+        _Float16_2{static_cast<_Float16>(1.0f),
            static_cast<_Float16>(1.0f)} / x.data};
 }
 #define hrcp __compat_hrcp
--- a/server/poetry.lock
+++ b/server/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 [[package]]
 name = "accelerate"
@ -194,13 +194,13 @@ files = [
 [[package]]
 name = "certifi"
-version = "2024.6.2"
+version = "2024.7.4"
 description = "Python package for providing Mozilla's CA Bundle."
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "certifi-2024.6.2-py3-none-any.whl", hash = "sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56"},
+    {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"},
-    {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"},
+    {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"},
 ]
 [[package]]
@ -474,13 +474,13 @@ files = [
 [[package]]
 name = "exceptiongroup"
-version = "1.2.1"
+version = "1.2.2"
 description = "Backport of PEP 654 (exception groups)"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"},
+    {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
-    {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"},
+    {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"},
 ]
 [package.extras]
@ -628,17 +628,17 @@ tqdm = ["tqdm"]
 [[package]]
 name = "googleapis-common-protos"
-version = "1.63.1"
+version = "1.63.2"
 description = "Common protobufs used in Google APIs"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "googleapis-common-protos-1.63.1.tar.gz", hash = "sha256:c6442f7a0a6b2a80369457d79e6672bb7dcbaab88e0848302497e3ec80780a6a"},
+    {file = "googleapis-common-protos-1.63.2.tar.gz", hash = "sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87"},
-    {file = "googleapis_common_protos-1.63.1-py2.py3-none-any.whl", hash = "sha256:0e1c2cdfcbc354b76e4a211a35ea35d6926a835cba1377073c4861db904a1877"},
+    {file = "googleapis_common_protos-1.63.2-py2.py3-none-any.whl", hash = "sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945"},
 ]
 [package.dependencies]
-protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
+protobuf = ">=3.20.2,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
 [package.extras]
 grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
@ -942,13 +942,13 @@ files = [
 [[package]]
 name = "importlib-metadata"
-version = "7.2.1"
+version = "8.0.0"
 description = "Read metadata from Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "importlib_metadata-7.2.1-py3-none-any.whl", hash = "sha256:ffef94b0b66046dd8ea2d619b701fe978d9264d38f3998bc4c27ec3b146a87c8"},
+    {file = "importlib_metadata-8.0.0-py3-none-any.whl", hash = "sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f"},
-    {file = "importlib_metadata-7.2.1.tar.gz", hash = "sha256:509ecb2ab77071db5137c655e24ceb3eee66e7bbc6574165d0d114d9fc4bbe68"},
+    {file = "importlib_metadata-8.0.0.tar.gz", hash = "sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812"},
 ]
 [package.dependencies]
@ -1025,13 +1025,13 @@ files = [
 [[package]]
 name = "jsonschema"
-version = "4.22.0"
+version = "4.23.0"
 description = "An implementation of JSON Schema validation for Python"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "jsonschema-4.22.0-py3-none-any.whl", hash = "sha256:ff4cfd6b1367a40e7bc6411caec72effadd3db0bbe5017de188f2d6108335802"},
+    {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"},
-    {file = "jsonschema-4.22.0.tar.gz", hash = "sha256:5b22d434a45935119af990552c862e5d6d564e8f6601206b305a61fdf661a2b7"},
+    {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"},
 ]
 [package.dependencies]
@ -1042,7 +1042,7 @@ rpds-py = ">=0.7.1"
 [package.extras]
 format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"]
-format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"]
+format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"]
 [[package]]
 name = "jsonschema-specifications"
@ -1580,13 +1580,13 @@ files = [
 [[package]]
 name = "nvidia-nvjitlink-cu12"
-version = "12.5.40"
+version = "12.5.82"
 description = "Nvidia JIT LTO Library"
 optional = false
 python-versions = ">=3"
 files = [
-    {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"},
+    {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
-    {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"},
+    {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
 ]
 [[package]]
@ -1756,24 +1756,24 @@ files = [
 [[package]]
 name = "optimum"
-version = "1.20.0"
+version = "1.21.2"
 description = "Optimum Library is an extension of the Hugging Face Transformers library, providing a framework to integrate third-party libraries from Hardware Partners and interface with their specific functionality."
 optional = false
 python-versions = ">=3.7.0"
 files = [
-    {file = "optimum-1.20.0-py3-none-any.whl", hash = "sha256:0c0d0746043c95e22cf3586946d7408d353f10c0486f1c7d2d11084a5cfc0ede"},
+    {file = "optimum-1.21.2-py3-none-any.whl", hash = "sha256:8b3633b9312413ceac5156294a2a0cd221268baf5a2c593f4d54ec20bff296d8"},
-    {file = "optimum-1.20.0.tar.gz", hash = "sha256:b64c7536fe738db9b56605105efe72006401ad2aa00cb499ae407f2e06f3043b"},
+    {file = "optimum-1.21.2.tar.gz", hash = "sha256:037e65d265237809fac69e9003215c60cf6de56e97c62ff7565abab4a94a64ce"},
 ]
 [package.dependencies]
 coloredlogs = "*"
 datasets = "*"
 huggingface-hub = ">=0.8.0"
-numpy = "*"
+numpy = "<2.0"
 packaging = "*"
 sympy = "*"
 torch = ">=1.11"
-transformers = {version = ">=4.26.0,<4.42.0", extras = ["sentencepiece"]}
+transformers = {version = ">=4.26.0,<4.43.0", extras = ["sentencepiece"]}
 [package.extras]
 amd = ["optimum-amd"]
@ -1786,15 +1786,16 @@ exporters-gpu = ["onnx", "onnxruntime-gpu", "timm"]
 exporters-tf = ["h5py", "numpy (<1.24.0)", "onnx", "onnxruntime", "tensorflow (>=2.4,<=2.12.1)", "tf2onnx", "timm", "transformers[sentencepiece] (>=4.26.0,<4.38.0)"]
 furiosa = ["optimum-furiosa"]
 graphcore = ["optimum-graphcore"]
-habana = ["optimum-habana", "transformers (>=4.38.0,<4.39.0)"]
+habana = ["optimum-habana", "transformers (>=4.40.0,<4.41.0)"]
-intel = ["optimum-intel (>=1.16.0)"]
+intel = ["optimum-intel (>=1.18.0)"]
-neural-compressor = ["optimum-intel[neural-compressor] (>=1.16.0)"]
+ipex = ["optimum-intel[ipex] (>=1.18.0)"]
 neural-compressor = ["optimum-intel[neural-compressor] (>=1.18.0)"]
 neuron = ["optimum-neuron[neuron] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"]
 neuronx = ["optimum-neuron[neuronx] (>=0.0.20)", "transformers (>=4.36.2,<4.42.0)"]
-nncf = ["optimum-intel[nncf] (>=1.16.0)"]
+nncf = ["optimum-intel[nncf] (>=1.18.0)"]
 onnxruntime = ["datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime (>=1.11.0)", "protobuf (>=3.20.1)"]
 onnxruntime-gpu = ["accelerate", "datasets (>=1.2.1)", "evaluate", "onnx", "onnxruntime-gpu (>=1.11.0)", "protobuf (>=3.20.1)"]
-openvino = ["optimum-intel[openvino] (>=1.16.0)"]
+openvino = ["optimum-intel[openvino] (>=1.18.0)"]
 quality = ["black (>=23.1,<24.0)", "ruff (==0.1.5)"]
 tests = ["Pillow", "accelerate", "diffusers (>=0.17.0)", "einops", "invisible-watermark", "parameterized", "pytest (<=8.0.0)", "pytest-xdist", "requests", "rjieba", "sacremoses", "scikit-learn", "timm", "torchaudio", "torchvision"]
@ -1970,84 +1971,95 @@ test = ["black", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameter
 [[package]]
 name = "pillow"
-version = "10.3.0"
+version = "10.4.0"
 description = "Python Imaging Library (Fork)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:90b9e29824800e90c84e4022dd5cc16eb2d9605ee13f05d47641eb183cd73d45"},
+    {file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"},
-    {file = "pillow-10.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2c405445c79c3f5a124573a051062300936b0281fee57637e706453e452746c"},
+    {file = "pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:543f3dc61c18dafb755773efc89aae60d06b6596a63914107f75459cf984164d"},
-    {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78618cdbccaa74d3f88d0ad6cb8ac3007f1a6fa5c6f19af64b55ca170bfa1edf"},
+    {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7928ecbf1ece13956b95d9cbcfc77137652b02763ba384d9ab508099a2eca856"},
-    {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261ddb7ca91fcf71757979534fb4c128448b5b4c55cb6152d280312062f69599"},
+    {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4d49b85c4348ea0b31ea63bc75a9f3857869174e2bf17e7aba02945cd218e6f"},
-    {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ce49c67f4ea0609933d01c0731b34b8695a7a748d6c8d186f95e7d085d2fe475"},
+    {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6c762a5b0997f5659a5ef2266abc1d8851ad7749ad9a6a5506eb23d314e4f46b"},
-    {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf"},
+    {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc"},
-    {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d33891be6df59d93df4d846640f0e46f1a807339f09e79a8040bc887bdcd7ed3"},
+    {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:812f7342b0eee081eaec84d91423d1b4650bb9828eb53d8511bcef8ce5aecf1e"},
-    {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b50811d664d392f02f7761621303eba9d1b056fb1868c8cdf4231279645c25f5"},
+    {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ac1452d2fbe4978c2eec89fb5a23b8387aba707ac72810d9490118817d9c0b46"},
-    {file = "pillow-10.3.0-cp310-cp310-win32.whl", hash = "sha256:ca2870d5d10d8726a27396d3ca4cf7976cec0f3cb706debe88e3a5bd4610f7d2"},
+    {file = "pillow-10.4.0-cp310-cp310-win32.whl", hash = "sha256:bcd5e41a859bf2e84fdc42f4edb7d9aba0a13d29a2abadccafad99de3feff984"},
-    {file = "pillow-10.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:f0d0591a0aeaefdaf9a5e545e7485f89910c977087e7de2b6c388aec32011e9f"},
+    {file = "pillow-10.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:ecd85a8d3e79cd7158dec1c9e5808e821feea088e2f69a974db5edf84dc53141"},
-    {file = "pillow-10.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:ccce24b7ad89adb5a1e34a6ba96ac2530046763912806ad4c247356a8f33a67b"},
+    {file = "pillow-10.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:ff337c552345e95702c5fde3158acb0625111017d0e5f24bf3acdb9cc16b90d1"},
-    {file = "pillow-10.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:5f77cf66e96ae734717d341c145c5949c63180842a545c47a0ce7ae52ca83795"},
+    {file = "pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c"},
-    {file = "pillow-10.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4b878386c4bf293578b48fc570b84ecfe477d3b77ba39a6e87150af77f40c57"},
+    {file = "pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be"},
-    {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27"},
+    {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3"},
-    {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9797a6c8fe16f25749b371c02e2ade0efb51155e767a971c61734b1bf6293994"},
+    {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6"},
-    {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9e91179a242bbc99be65e139e30690e081fe6cb91a8e77faf4c409653de39451"},
+    {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe"},
-    {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b87bd9d81d179bd8ab871603bd80d8645729939f90b71e62914e816a76fc6bd"},
+    {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319"},
-    {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:81d09caa7b27ef4e61cb7d8fbf1714f5aec1c6b6c5270ee53504981e6e9121ad"},
+    {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d"},
-    {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c"},
+    {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696"},
-    {file = "pillow-10.3.0-cp311-cp311-win32.whl", hash = "sha256:7161ec49ef0800947dc5570f86568a7bb36fa97dd09e9827dc02b718c5643f09"},
+    {file = "pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496"},
-    {file = "pillow-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:8eb0908e954d093b02a543dc963984d6e99ad2b5e36503d8a0aaf040505f747d"},
+    {file = "pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91"},
-    {file = "pillow-10.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e6f7d1c414191c1199f8996d3f2282b9ebea0945693fb67392c75a3a320941f"},
+    {file = "pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22"},
-    {file = "pillow-10.3.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:e46f38133e5a060d46bd630faa4d9fa0202377495df1f068a8299fd78c84de84"},
+    {file = "pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94"},
-    {file = "pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:50b8eae8f7334ec826d6eeffaeeb00e36b5e24aa0b9df322c247539714c6df19"},
+    {file = "pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597"},
-    {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d3bea1c75f8c53ee4d505c3e67d8c158ad4df0d83170605b50b64025917f338"},
+    {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80"},
-    {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19aeb96d43902f0a783946a0a87dbdad5c84c936025b8419da0a0cd7724356b1"},
+    {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca"},
-    {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74d28c17412d9caa1066f7a31df8403ec23d5268ba46cd0ad2c50fb82ae40462"},
+    {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef"},
-    {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a"},
+    {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a"},
-    {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d886f5d353333b4771d21267c7ecc75b710f1a73d72d03ca06df49b09015a9ef"},
+    {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b"},
-    {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b5ec25d8b17217d635f8935dbc1b9aa5907962fae29dff220f2659487891cd3"},
+    {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9"},
-    {file = "pillow-10.3.0-cp312-cp312-win32.whl", hash = "sha256:51243f1ed5161b9945011a7360e997729776f6e5d7005ba0c6879267d4c5139d"},
+    {file = "pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42"},
-    {file = "pillow-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:412444afb8c4c7a6cc11a47dade32982439925537e483be7c0ae0cf96c4f6a0b"},
+    {file = "pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a"},
-    {file = "pillow-10.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:798232c92e7665fe82ac085f9d8e8ca98826f8e27859d9a96b41d519ecd2e49a"},
+    {file = "pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9"},
-    {file = "pillow-10.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:4eaa22f0d22b1a7e93ff0a596d57fdede2e550aecffb5a1ef1106aaece48e96b"},
+    {file = "pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3"},
-    {file = "pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd5e14fbf22a87321b24c88669aad3a51ec052eb145315b3da3b7e3cc105b9a2"},
+    {file = "pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb"},
-    {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1530e8f3a4b965eb6a7785cf17a426c779333eb62c9a7d1bbcf3ffd5bf77a4aa"},
+    {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70"},
-    {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d512aafa1d32efa014fa041d38868fda85028e3f930a96f85d49c7d8ddc0383"},
+    {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be"},
-    {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:339894035d0ede518b16073bdc2feef4c991ee991a29774b33e515f1d308e08d"},
+    {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0"},
-    {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:aa7e402ce11f0885305bfb6afb3434b3cd8f53b563ac065452d9d5654c7b86fd"},
+    {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc"},
-    {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0ea2a783a2bdf2a561808fe4a7a12e9aa3799b701ba305de596bc48b8bdfce9d"},
+    {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a"},
-    {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c78e1b00a87ce43bb37642c0812315b411e856a905d58d597750eb79802aaaa3"},
+    {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309"},
-    {file = "pillow-10.3.0-cp38-cp38-win32.whl", hash = "sha256:72d622d262e463dfb7595202d229f5f3ab4b852289a1cd09650362db23b9eb0b"},
+    {file = "pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060"},
-    {file = "pillow-10.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:2034f6759a722da3a3dbd91a81148cf884e91d1b747992ca288ab88c1de15999"},
+    {file = "pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea"},
-    {file = "pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2ed854e716a89b1afcedea551cd85f2eb2a807613752ab997b9974aaa0d56936"},
+    {file = "pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d"},
-    {file = "pillow-10.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dc1a390a82755a8c26c9964d457d4c9cbec5405896cba94cf51f36ea0d855002"},
+    {file = "pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736"},
-    {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4203efca580f0dd6f882ca211f923168548f7ba334c189e9eab1178ab840bf60"},
+    {file = "pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b"},
-    {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3102045a10945173d38336f6e71a8dc71bcaeed55c3123ad4af82c52807b9375"},
+    {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2"},
-    {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6fb1b30043271ec92dc65f6d9f0b7a830c210b8a96423074b15c7bc999975f57"},
+    {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680"},
-    {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1dfc94946bc60ea375cc39cff0b8da6c7e5f8fcdc1d946beb8da5c216156ddd8"},
+    {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b"},
-    {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b09b86b27a064c9624d0a6c54da01c1beaf5b6cadfa609cf63789b1d08a797b9"},
+    {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd"},
-    {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3b2348a78bc939b4fed6552abfd2e7988e0f81443ef3911a4b8498ca084f6eb"},
+    {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84"},
-    {file = "pillow-10.3.0-cp39-cp39-win32.whl", hash = "sha256:45ebc7b45406febf07fef35d856f0293a92e7417ae7933207e90bf9090b70572"},
+    {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0"},
-    {file = "pillow-10.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:0ba26351b137ca4e0db0342d5d00d2e355eb29372c05afd544ebf47c0956ffeb"},
+    {file = "pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e"},
-    {file = "pillow-10.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:50fd3f6b26e3441ae07b7c979309638b72abc1a25da31a81a7fbd9495713ef4f"},
+    {file = "pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab"},
-    {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6b02471b72526ab8a18c39cb7967b72d194ec53c1fd0a70b050565a0f366d355"},
+    {file = "pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d"},
-    {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8ab74c06ffdab957d7670c2a5a6e1a70181cd10b727cd788c4dd9005b6a8acd9"},
+    {file = "pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b"},
-    {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2"},
+    {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd"},
-    {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ec1e921fd07c7cda7962bad283acc2f2a9ccc1b971ee4b216b75fad6f0463"},
+    {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:930044bb7679ab003b14023138b50181899da3f25de50e9dbee23b61b4de2126"},
-    {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c8e73e99da7db1b4cad7f8d682cf6abad7844da39834c288fbfa394a47bbced"},
+    {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c76e5786951e72ed3686e122d14c5d7012f16c8303a674d18cdcd6d89557fc5b"},
-    {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:16563993329b79513f59142a6b02055e10514c1a8e86dca8b48a893e33cf91e3"},
+    {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c"},
-    {file = "pillow-10.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd78700f5788ae180b5ee8902c6aea5a5726bac7c364b202b4b3e3ba2d293170"},
+    {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dbc6ae66518ab3c5847659e9988c3b60dc94ffb48ef9168656e0019a93dbf8a1"},
-    {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:aff76a55a8aa8364d25400a210a65ff59d0168e0b4285ba6bf2bd83cf675ba32"},
+    {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:06b2f7898047ae93fad74467ec3d28fe84f7831370e3c258afa533f81ef7f3df"},
-    {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b7bc2176354defba3edc2b9a777744462da2f8e921fbaf61e52acb95bafa9828"},
+    {file = "pillow-10.4.0-cp39-cp39-win32.whl", hash = "sha256:7970285ab628a3779aecc35823296a7869f889b8329c16ad5a71e4901a3dc4ef"},
-    {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:793b4e24db2e8742ca6423d3fde8396db336698c55cd34b660663ee9e45ed37f"},
+    {file = "pillow-10.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:961a7293b2457b405967af9c77dcaa43cc1a8cd50d23c532e62d48ab6cdd56f5"},
-    {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93480005693d247f8346bc8ee28c72a2191bdf1f6b5db469c096c0c867ac015"},
+    {file = "pillow-10.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:32cda9e3d601a52baccb2856b8ea1fc213c90b340c542dcef77140dfa3278a9e"},
-    {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83341b89884e2b2e55886e8fbbf37c3fa5efd6c8907124aeb72f285ae5696e5"},
+    {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5b4815f2e65b30f5fbae9dfffa8636d992d49705723fe86a3661806e069352d4"},
-    {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1a1d1915db1a4fdb2754b9de292642a39a7fb28f1736699527bb649484fb966a"},
+    {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8f0aef4ef59694b12cadee839e2ba6afeab89c0f39a3adc02ed51d109117b8da"},
-    {file = "pillow-10.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a0eaa93d054751ee9964afa21c06247779b90440ca41d184aeb5d410f20ff591"},
+    {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f4727572e2918acaa9077c919cbbeb73bd2b3ebcfe033b72f858fc9fbef0026"},
-    {file = "pillow-10.3.0.tar.gz", hash = "sha256:9d2455fbf44c914840c793e89aa82d0e1763a14253a000743719ae5946814b2d"},
+    {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff25afb18123cea58a591ea0244b92eb1e61a1fd497bf6d6384f09bc3262ec3e"},
    {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc3e2db6ba09ffd7d02ae9141cfa0ae23393ee7687248d46a7507b75d610f4f5"},
    {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:02a2be69f9c9b8c1e97cf2713e789d4e398c751ecfd9967c18d0ce304efbf885"},
    {file = "pillow-10.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0755ffd4a0c6f267cccbae2e9903d95477ca2f77c4fcf3a3a09570001856c8a5"},
    {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:a02364621fe369e06200d4a16558e056fe2805d3468350df3aef21e00d26214b"},
    {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1b5dea9831a90e9d0721ec417a80d4cbd7022093ac38a568db2dd78363b00908"},
    {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b885f89040bb8c4a1573566bbb2f44f5c505ef6e74cec7ab9068c900047f04b"},
    {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87dd88ded2e6d74d31e1e0a99a726a6765cda32d00ba72dc37f0651f306daaa8"},
    {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:2db98790afc70118bd0255c2eeb465e9767ecf1f3c25f9a1abb8ffc8cfd1fe0a"},
    {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f7baece4ce06bade126fb84b8af1c33439a76d8a6fd818970215e0560ca28c27"},
    {file = "pillow-10.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfdd747216947628af7b259d274771d84db2268ca062dd5faf373639d00113a3"},
    {file = "pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06"},
 ]
 [package.extras]
-docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
+docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"]
 fpx = ["olefile"]
 mic = ["olefile"]
 tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
@ -2156,52 +2168,42 @@ files = [
 [[package]]
 name = "pyarrow"
-version = "16.1.0"
+version = "17.0.0"
 description = "Python library for Apache Arrow"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"},
+    {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"},
-    {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"},
+    {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"},
-    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"},
+    {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"},
-    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"},
+    {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"},
-    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"},
+    {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"},
-    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"},
+    {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"},
-    {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"},
+    {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"},
-    {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"},
+    {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"},
-    {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"},
+    {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"},
-    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"},
+    {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"},
-    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"},
+    {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"},
-    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"},
+    {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"},
-    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"},
+    {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"},
-    {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"},
+    {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"},
-    {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"},
+    {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"},
-    {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"},
+    {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"},
-    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"},
+    {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"},
-    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"},
+    {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"},
-    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"},
+    {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"},
-    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"},
+    {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"},
-    {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"},
+    {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"},
-    {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"},
+    {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"},
-    {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"},
+    {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"},
    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"},
    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"},
    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"},
    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"},
    {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"},
    {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"},
    {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"},
    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"},
    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"},
    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"},
    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"},
    {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"},
    {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"},
 ]
 [package.dependencies]
 numpy = ">=1.16.6"
 [package.extras]
 test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]
 [[package]]
 name = "pyarrow-hotfix"
 version = "0.6"
@ -2215,109 +2217,119 @@ files = [
 [[package]]
 name = "pydantic"
-version = "2.7.4"
+version = "2.8.2"
 description = "Data validation using Python type hints"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "pydantic-2.7.4-py3-none-any.whl", hash = "sha256:ee8538d41ccb9c0a9ad3e0e5f07bf15ed8015b481ced539a1759d8cc89ae90d0"},
+    {file = "pydantic-2.8.2-py3-none-any.whl", hash = "sha256:73ee9fddd406dc318b885c7a2eab8a6472b68b8fb5ba8150949fc3db939f23c8"},
-    {file = "pydantic-2.7.4.tar.gz", hash = "sha256:0c84efd9548d545f63ac0060c1e4d39bb9b14db8b3c0652338aecc07b5adec52"},
+    {file = "pydantic-2.8.2.tar.gz", hash = "sha256:6f62c13d067b0755ad1c21a34bdd06c0c12625a22b0fc09c6b149816604f7c2a"},
 ]
 [package.dependencies]
 annotated-types = ">=0.4.0"
-pydantic-core = "2.18.4"
+pydantic-core = "2.20.1"
-typing-extensions = ">=4.6.1"
+typing-extensions = {version = ">=4.6.1", markers = "python_version < \"3.13\""}
 [package.extras]
 email = ["email-validator (>=2.0.0)"]
 [[package]]
 name = "pydantic-core"
-version = "2.18.4"
+version = "2.20.1"
 description = "Core functionality for Pydantic validation and serialization"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3acae97ffd19bf091c72df4d726d552c473f3576409b2a7ca36b2f535ffff4a3"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:41f4c96227a67a013e7de5ff8f20fb496ce573893b7f4f2707d065907bffdbd6"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f239eb799a2081495ea659d8d4a43a8f42cd1fe9ff2e7e436295c38a10c286a"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53e431da3fc53360db73eedf6f7124d1076e1b4ee4276b36fb25514544ceb4a3"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1f62b2413c3a0e846c3b838b2ecd6c7a19ec6793b2a522745b0869e37ab5bc1"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d41e6daee2813ecceea8eda38062d69e280b39df793f5a942fa515b8ed67953"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d482efec8b7dc6bfaedc0f166b2ce349df0011f5d2f1f25537ced4cfc34fd98"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e93e1a4b4b33daed65d781a57a522ff153dcf748dee70b40c7258c5861e1768a"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e7c4ea22b6739b162c9ecaaa41d718dfad48a244909fe7ef4b54c0b530effc5a"},
-    {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"},
+    {file = "pydantic_core-2.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4f2790949cf385d985a31984907fecb3896999329103df4e4983a4a41e13e840"},
-    {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"},
+    {file = "pydantic_core-2.20.1-cp310-none-win32.whl", hash = "sha256:5e999ba8dd90e93d57410c5e67ebb67ffcaadcea0ad973240fdfd3a135506250"},
-    {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"},
+    {file = "pydantic_core-2.20.1-cp310-none-win_amd64.whl", hash = "sha256:512ecfbefef6dac7bc5eaaf46177b2de58cdf7acac8793fe033b24ece0b9566c"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d2a8fa9d6d6f891f3deec72f5cc668e6f66b188ab14bb1ab52422fe8e644f312"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:175873691124f3d0da55aeea1d90660a6ea7a3cfea137c38afa0a5ffabe37b88"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37eee5b638f0e0dcd18d21f59b679686bbd18917b87db0193ae36f9c23c355fc"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:25e9185e2d06c16ee438ed39bf62935ec436474a6ac4f9358524220f1b236e43"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:150906b40ff188a3260cbee25380e7494ee85048584998c1e66df0c7a11c17a6"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ad4aeb3e9a97286573c03df758fc7627aecdd02f1da04516a86dc159bf70121"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3f3ed29cd9f978c604708511a1f9c2fdcb6c38b9aae36a51905b8811ee5cbf1"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b0dae11d8f5ded51699c74d9548dcc5938e0804cc8298ec0aa0da95c21fff57b"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:faa6b09ee09433b87992fb5a2859efd1c264ddc37280d2dd5db502126d0e7f27"},
-    {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"},
+    {file = "pydantic_core-2.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9dc1b507c12eb0481d071f3c1808f0529ad41dc415d0ca11f7ebfc666e66a18b"},
-    {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"},
+    {file = "pydantic_core-2.20.1-cp311-none-win32.whl", hash = "sha256:fa2fddcb7107e0d1808086ca306dcade7df60a13a6c347a7acf1ec139aa6789a"},
-    {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"},
+    {file = "pydantic_core-2.20.1-cp311-none-win_amd64.whl", hash = "sha256:40a783fb7ee353c50bd3853e626f15677ea527ae556429453685ae32280c19c2"},
-    {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:595ba5be69b35777474fa07f80fc260ea71255656191adb22a8c53aba4479231"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a4f55095ad087474999ee28d3398bae183a66be4823f753cd7d67dd0153427c9"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9aa05d09ecf4c75157197f27cdc9cfaeb7c5f15021c6373932bf3e124af029f"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e97fdf088d4b31ff4ba35db26d9cc472ac7ef4a2ff2badeabf8d727b3377fc52"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc633a9fe1eb87e250b5c57d389cf28998e4292336926b0b6cdaee353f89a237"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d573faf8eb7e6b1cbbcb4f5b247c60ca8be39fe2c674495df0eb4318303137fe"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26dc97754b57d2fd00ac2b24dfa341abffc380b823211994c4efac7f13b9e90e"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:33499e85e739a4b60c9dac710c20a08dc73cb3240c9a0e22325e671b27b70d24"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bebb4d6715c814597f85297c332297c6ce81e29436125ca59d1159b07f423eb1"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"},
+    {file = "pydantic_core-2.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:516d9227919612425c8ef1c9b869bbbee249bc91912c8aaffb66116c0b447ebd"},
-    {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"},
+    {file = "pydantic_core-2.20.1-cp312-none-win32.whl", hash = "sha256:469f29f9093c9d834432034d33f5fe45699e664f12a13bf38c04967ce233d688"},
-    {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"},
+    {file = "pydantic_core-2.20.1-cp312-none-win_amd64.whl", hash = "sha256:035ede2e16da7281041f0e626459bcae33ed998cca6a0a007a5ebb73414ac72d"},
-    {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:0827505a5c87e8aa285dc31e9ec7f4a17c81a813d45f70b1d9164e03a813a686"},
-    {file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:19c0fa39fa154e7e0b7f82f88ef85faa2a4c23cc65aae2f5aea625e3c13c735a"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa223cd1e36b642092c326d694d8bf59b71ddddc94cdb752bbbb1c5c91d833b"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c336a6d235522a62fef872c6295a42ecb0c4e1d0f1a3e500fe949415761b8a19"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7eb6a0587eded33aeefea9f916899d42b1799b7b14b8f8ff2753c0ac1741edac"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70c8daf4faca8da5a6d655f9af86faf6ec2e1768f4b8b9d0226c02f3d6209703"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9fa4c9bf273ca41f940bceb86922a7667cd5bf90e95dbb157cbb8441008482c"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:11b71d67b4725e7e2a9f6e9c0ac1239bbc0c48cce3dc59f98635efc57d6dac83"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:270755f15174fb983890c49881e93f8f1b80f0b5e3a3cc1394a255706cabd203"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"},
+    {file = "pydantic_core-2.20.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c81131869240e3e568916ef4c307f8b99583efaa60a8112ef27a366eefba8ef0"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"},
+    {file = "pydantic_core-2.20.1-cp313-none-win32.whl", hash = "sha256:b91ced227c41aa29c672814f50dbb05ec93536abf8f43cd14ec9521ea09afe4e"},
-    {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"},
+    {file = "pydantic_core-2.20.1-cp313-none-win_amd64.whl", hash = "sha256:65db0f2eefcaad1a3950f498aabb4875c8890438bc80b19362cf633b87a8ab20"},
-    {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4745f4ac52cc6686390c40eaa01d48b18997cb130833154801a442323cc78f91"},
-    {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a8ad4c766d3f33ba8fd692f9aa297c9058970530a32c728a2c4bfd2616d3358b"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41e81317dd6a0127cabce83c0c9c3fbecceae981c8391e6f1dec88a77c8a569a"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04024d270cf63f586ad41fff13fde4311c4fc13ea74676962c876d9577bcc78f"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eaad4ff2de1c3823fddf82f41121bdf453d922e9a238642b1dedb33c4e4f98ad"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26ab812fa0c845df815e506be30337e2df27e88399b985d0bb4e3ecfe72df31c"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c5ebac750d9d5f2706654c638c041635c385596caf68f81342011ddfa1e5598"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2aafc5a503855ea5885559eae883978c9b6d8c8993d67766ee73d82e841300dd"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4868f6bd7c9d98904b748a2653031fc9c2f85b6237009d475b1008bfaeb0a5aa"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"},
+    {file = "pydantic_core-2.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa2f457b4af386254372dfa78a2eda2563680d982422641a85f271c859df1987"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"},
+    {file = "pydantic_core-2.20.1-cp38-none-win32.whl", hash = "sha256:225b67a1f6d602de0ce7f6c1c3ae89a4aa25d3de9be857999e9124f15dab486a"},
-    {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"},
+    {file = "pydantic_core-2.20.1-cp38-none-win_amd64.whl", hash = "sha256:6b507132dcfc0dea440cce23ee2182c0ce7aba7054576efc65634f080dbe9434"},
-    {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:b03f7941783b4c4a26051846dea594628b38f6940a2fdc0df00b221aed39314c"},
-    {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1eedfeb6089ed3fad42e81a67755846ad4dcc14d73698c120a82e4ccf0f1f9f6"},
-    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:635fee4e041ab9c479e31edda27fcf966ea9614fff1317e280d99eb3e5ab6fe2"},
-    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:77bf3ac639c1ff567ae3b47f8d4cc3dc20f9966a2a6dd2311dcc055d3d04fb8a"},
-    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ed1b0132f24beeec5a78b67d9388656d03e6a7c837394f99257e2d55b461611"},
-    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6514f963b023aeee506678a1cf821fe31159b925c4b76fe2afa94cc70b3222b"},
-    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10d4204d8ca33146e761c79f83cc861df20e7ae9f6487ca290a97702daf56006"},
-    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2d036c7187b9422ae5b262badb87a20a49eb6c5238b2004e96d4da1231badef1"},
-    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ebfef07dbe1d93efb94b4700f2d278494e9162565a54f124c404a5656d7ff09"},
-    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"},
+    {file = "pydantic_core-2.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6b9d9bb600328a1ce523ab4f454859e9d439150abb0906c5a1983c146580ebab"},
-    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"},
+    {file = "pydantic_core-2.20.1-cp39-none-win32.whl", hash = "sha256:784c1214cb6dd1e3b15dd8b91b9a53852aed16671cc3fbe4786f4f1db07089e2"},
-    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"},
+    {file = "pydantic_core-2.20.1-cp39-none-win_amd64.whl", hash = "sha256:d2fe69c5434391727efa54b47a1e7986bb0186e72a41b203df8f5b0a19a4f669"},
-    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a45f84b09ac9c3d35dfcf6a27fd0634d30d183205230a0ebe8373a0e8cfa0906"},
-    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d02a72df14dfdbaf228424573a07af10637bd490f0901cee872c4f434a735b94"},
-    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2b27e6af28f07e2f195552b37d7d66b150adbaa39a6d327766ffd695799780f"},
-    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:084659fac3c83fd674596612aeff6041a18402f1e1bc19ca39e417d554468482"},
-    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:242b8feb3c493ab78be289c034a1f659e8826e2233786e36f2893a950a719bb6"},
-    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:38cf1c40a921d05c5edc61a785c0ddb4bed67827069f535d794ce6bcded919fc"},
-    {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"},
+    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e0bbdd76ce9aa5d4209d65f2b27fc6e5ef1312ae6c5333c26db3f5ade53a1e99"},
    {file = "pydantic_core-2.20.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:254ec27fdb5b1ee60684f91683be95e5133c994cc54e86a0b0963afa25c8f8a6"},
    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:407653af5617f0757261ae249d3fba09504d7a71ab36ac057c938572d1bc9331"},
    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:c693e916709c2465b02ca0ad7b387c4f8423d1db7b4649c551f27a529181c5ad"},
    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b5ff4911aea936a47d9376fd3ab17e970cc543d1b68921886e7f64bd28308d1"},
    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:177f55a886d74f1808763976ac4efd29b7ed15c69f4d838bbd74d9d09cf6fa86"},
    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:964faa8a861d2664f0c7ab0c181af0bea66098b1919439815ca8803ef136fc4e"},
    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4dd484681c15e6b9a977c785a345d3e378d72678fd5f1f3c0509608da24f2ac0"},
    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f6d6cff3538391e8486a431569b77921adfcdef14eb18fbf19b7c0a5294d4e6a"},
    {file = "pydantic_core-2.20.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a6d511cc297ff0883bc3708b465ff82d7560193169a8b93260f74ecb0a5e08a7"},
    {file = "pydantic_core-2.20.1.tar.gz", hash = "sha256:26ca695eeee5f9f1aeeb211ffc12f10bcb6f71e2989988fda61dabd65db878d4"},
 ]
 [package.dependencies]
@ -2567,110 +2579,110 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 [[package]]
 name = "rpds-py"
-version = "0.18.1"
+version = "0.19.0"
 description = "Python bindings to Rust's persistent data structures (rpds)"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "rpds_py-0.18.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:d31dea506d718693b6b2cffc0648a8929bdc51c70a311b2770f09611caa10d53"},
+    {file = "rpds_py-0.19.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:fb37bd599f031f1a6fb9e58ec62864ccf3ad549cf14bac527dbfa97123edcca4"},
-    {file = "rpds_py-0.18.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:732672fbc449bab754e0b15356c077cc31566df874964d4801ab14f71951ea80"},
+    {file = "rpds_py-0.19.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3384d278df99ec2c6acf701d067147320b864ef6727405d6470838476e44d9e8"},
-    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a98a1f0552b5f227a3d6422dbd61bc6f30db170939bd87ed14f3c339aa6c7c9"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e54548e0be3ac117595408fd4ca0ac9278fde89829b0b518be92863b17ff67a2"},
-    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f1944ce16401aad1e3f7d312247b3d5de7981f634dc9dfe90da72b87d37887d"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8eb488ef928cdbc05a27245e52de73c0d7c72a34240ef4d9893fdf65a8c1a955"},
-    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38e14fb4e370885c4ecd734f093a2225ee52dc384b86fa55fe3f74638b2cfb09"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5da93debdfe27b2bfc69eefb592e1831d957b9535e0943a0ee8b97996de21b5"},
-    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08d74b184f9ab6289b87b19fe6a6d1a97fbfea84b8a3e745e87a5de3029bf944"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79e205c70afddd41f6ee79a8656aec738492a550247a7af697d5bd1aee14f766"},
-    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d70129cef4a8d979caa37e7fe957202e7eee8ea02c5e16455bc9808a59c6b2f0"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:959179efb3e4a27610e8d54d667c02a9feaa86bbabaf63efa7faa4dfa780d4f1"},
-    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce0bb20e3a11bd04461324a6a798af34d503f8d6f1aa3d2aa8901ceaf039176d"},
+    {file = "rpds_py-0.19.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a6e605bb9edcf010f54f8b6a590dd23a4b40a8cb141255eec2a03db249bc915b"},
-    {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81c5196a790032e0fc2464c0b4ab95f8610f96f1f2fa3d4deacce6a79852da60"},
+    {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9133d75dc119a61d1a0ded38fb9ba40a00ef41697cc07adb6ae098c875195a3f"},
-    {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f3027be483868c99b4985fda802a57a67fdf30c5d9a50338d9db646d590198da"},
+    {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd36b712d35e757e28bf2f40a71e8f8a2d43c8b026d881aa0c617b450d6865c9"},
-    {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d44607f98caa2961bab4fa3c4309724b185b464cdc3ba6f3d7340bac3ec97cc1"},
+    {file = "rpds_py-0.19.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:354f3a91718489912f2e0fc331c24eaaf6a4565c080e00fbedb6015857c00582"},
-    {file = "rpds_py-0.18.1-cp310-none-win32.whl", hash = "sha256:c273e795e7a0f1fddd46e1e3cb8be15634c29ae8ff31c196debb620e1edb9333"},
+    {file = "rpds_py-0.19.0-cp310-none-win32.whl", hash = "sha256:ebcbf356bf5c51afc3290e491d3722b26aaf5b6af3c1c7f6a1b757828a46e336"},
-    {file = "rpds_py-0.18.1-cp310-none-win_amd64.whl", hash = "sha256:8352f48d511de5f973e4f2f9412736d7dea76c69faa6d36bcf885b50c758ab9a"},
+    {file = "rpds_py-0.19.0-cp310-none-win_amd64.whl", hash = "sha256:75a6076289b2df6c8ecb9d13ff79ae0cad1d5fb40af377a5021016d58cd691ec"},
-    {file = "rpds_py-0.18.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6b5ff7e1d63a8281654b5e2896d7f08799378e594f09cf3674e832ecaf396ce8"},
+    {file = "rpds_py-0.19.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6d45080095e585f8c5097897313def60caa2046da202cdb17a01f147fb263b81"},
-    {file = "rpds_py-0.18.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8927638a4d4137a289e41d0fd631551e89fa346d6dbcfc31ad627557d03ceb6d"},
+    {file = "rpds_py-0.19.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5c9581019c96f865483d031691a5ff1cc455feb4d84fc6920a5ffc48a794d8a"},
-    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:154bf5c93d79558b44e5b50cc354aa0459e518e83677791e6adb0b039b7aa6a7"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1540d807364c84516417115c38f0119dfec5ea5c0dd9a25332dea60b1d26fc4d"},
-    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07f2139741e5deb2c5154a7b9629bc5aa48c766b643c1a6750d16f865a82c5fc"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9e65489222b410f79711dc3d2d5003d2757e30874096b2008d50329ea4d0f88c"},
-    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c7672e9fba7425f79019db9945b16e308ed8bc89348c23d955c8c0540da0a07"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9da6f400eeb8c36f72ef6646ea530d6d175a4f77ff2ed8dfd6352842274c1d8b"},
-    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:489bdfe1abd0406eba6b3bb4fdc87c7fa40f1031de073d0cfb744634cc8fa261"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37f46bb11858717e0efa7893c0f7055c43b44c103e40e69442db5061cb26ed34"},
-    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c20f05e8e3d4fc76875fc9cb8cf24b90a63f5a1b4c5b9273f0e8225e169b100"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:071d4adc734de562bd11d43bd134330fb6249769b2f66b9310dab7460f4bf714"},
-    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:967342e045564cef76dfcf1edb700b1e20838d83b1aa02ab313e6a497cf923b8"},
+    {file = "rpds_py-0.19.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9625367c8955e4319049113ea4f8fee0c6c1145192d57946c6ffcd8fe8bf48dd"},
-    {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cc7c1a47f3a63282ab0f422d90ddac4aa3034e39fc66a559ab93041e6505da7"},
+    {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e19509145275d46bc4d1e16af0b57a12d227c8253655a46bbd5ec317e941279d"},
-    {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f7afbfee1157e0f9376c00bb232e80a60e59ed716e3211a80cb8506550671e6e"},
+    {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4d438e4c020d8c39961deaf58f6913b1bf8832d9b6f62ec35bd93e97807e9cbc"},
-    {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9e6934d70dc50f9f8ea47081ceafdec09245fd9f6032669c3b45705dea096b88"},
+    {file = "rpds_py-0.19.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90bf55d9d139e5d127193170f38c584ed3c79e16638890d2e36f23aa1630b952"},
-    {file = "rpds_py-0.18.1-cp311-none-win32.whl", hash = "sha256:c69882964516dc143083d3795cb508e806b09fc3800fd0d4cddc1df6c36e76bb"},
+    {file = "rpds_py-0.19.0-cp311-none-win32.whl", hash = "sha256:8d6ad132b1bc13d05ffe5b85e7a01a3998bf3a6302ba594b28d61b8c2cf13aaf"},
-    {file = "rpds_py-0.18.1-cp311-none-win_amd64.whl", hash = "sha256:70a838f7754483bcdc830444952fd89645569e7452e3226de4a613a4c1793fb2"},
+    {file = "rpds_py-0.19.0-cp311-none-win_amd64.whl", hash = "sha256:7ec72df7354e6b7f6eb2a17fa6901350018c3a9ad78e48d7b2b54d0412539a67"},
-    {file = "rpds_py-0.18.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3dd3cd86e1db5aadd334e011eba4e29d37a104b403e8ca24dcd6703c68ca55b3"},
+    {file = "rpds_py-0.19.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:5095a7c838a8647c32aa37c3a460d2c48debff7fc26e1136aee60100a8cd8f68"},
-    {file = "rpds_py-0.18.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:05f3d615099bd9b13ecf2fc9cf2d839ad3f20239c678f461c753e93755d629ee"},
+    {file = "rpds_py-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f2f78ef14077e08856e788fa482107aa602636c16c25bdf59c22ea525a785e9"},
-    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35b2b771b13eee8729a5049c976197ff58a27a3829c018a04341bcf1ae409b2b"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b7cc6cb44f8636fbf4a934ca72f3e786ba3c9f9ba4f4d74611e7da80684e48d2"},
-    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee17cd26b97d537af8f33635ef38be873073d516fd425e80559f4585a7b90c43"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf902878b4af334a09de7a45badbff0389e7cf8dc2e4dcf5f07125d0b7c2656d"},
-    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b646bf655b135ccf4522ed43d6902af37d3f5dbcf0da66c769a2b3938b9d8184"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:688aa6b8aa724db1596514751ffb767766e02e5c4a87486ab36b8e1ebc1aedac"},
-    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19ba472b9606c36716062c023afa2484d1e4220548751bda14f725a7de17b4f6"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57dbc9167d48e355e2569346b5aa4077f29bf86389c924df25c0a8b9124461fb"},
-    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e30ac5e329098903262dc5bdd7e2086e0256aa762cc8b744f9e7bf2a427d3f8"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b4cf5a9497874822341c2ebe0d5850fed392034caadc0bad134ab6822c0925b"},
-    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d58ad6317d188c43750cb76e9deacf6051d0f884d87dc6518e0280438648a9ac"},
+    {file = "rpds_py-0.19.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8a790d235b9d39c70a466200d506bb33a98e2ee374a9b4eec7a8ac64c2c261fa"},
-    {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e1735502458621921cee039c47318cb90b51d532c2766593be6207eec53e5c4c"},
+    {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1d16089dfa58719c98a1c06f2daceba6d8e3fb9b5d7931af4a990a3c486241cb"},
-    {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f5bab211605d91db0e2995a17b5c6ee5edec1270e46223e513eaa20da20076ac"},
+    {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:bc9128e74fe94650367fe23f37074f121b9f796cabbd2f928f13e9661837296d"},
-    {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2fc24a329a717f9e2448f8cd1f960f9dac4e45b6224d60734edeb67499bab03a"},
+    {file = "rpds_py-0.19.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c8f77e661ffd96ff104bebf7d0f3255b02aa5d5b28326f5408d6284c4a8b3248"},
-    {file = "rpds_py-0.18.1-cp312-none-win32.whl", hash = "sha256:1805d5901779662d599d0e2e4159d8a82c0b05faa86ef9222bf974572286b2b6"},
+    {file = "rpds_py-0.19.0-cp312-none-win32.whl", hash = "sha256:5f83689a38e76969327e9b682be5521d87a0c9e5a2e187d2bc6be4765f0d4600"},
-    {file = "rpds_py-0.18.1-cp312-none-win_amd64.whl", hash = "sha256:720edcb916df872d80f80a1cc5ea9058300b97721efda8651efcd938a9c70a72"},
+    {file = "rpds_py-0.19.0-cp312-none-win_amd64.whl", hash = "sha256:06925c50f86da0596b9c3c64c3837b2481337b83ef3519e5db2701df695453a4"},
-    {file = "rpds_py-0.18.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:c827576e2fa017a081346dce87d532a5310241648eb3700af9a571a6e9fc7e74"},
+    {file = "rpds_py-0.19.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:52e466bea6f8f3a44b1234570244b1cff45150f59a4acae3fcc5fd700c2993ca"},
-    {file = "rpds_py-0.18.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:aa3679e751408d75a0b4d8d26d6647b6d9326f5e35c00a7ccd82b78ef64f65f8"},
+    {file = "rpds_py-0.19.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e21cc693045fda7f745c790cb687958161ce172ffe3c5719ca1764e752237d16"},
-    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0abeee75434e2ee2d142d650d1e54ac1f8b01e6e6abdde8ffd6eeac6e9c38e20"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b31f059878eb1f5da8b2fd82480cc18bed8dcd7fb8fe68370e2e6285fa86da6"},
-    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed402d6153c5d519a0faf1bb69898e97fb31613b49da27a84a13935ea9164dfc"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1dd46f309e953927dd018567d6a9e2fb84783963650171f6c5fe7e5c41fd5666"},
-    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:338dee44b0cef8b70fd2ef54b4e09bb1b97fc6c3a58fea5db6cc083fd9fc2724"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34a01a4490e170376cd79258b7f755fa13b1a6c3667e872c8e35051ae857a92b"},
-    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7750569d9526199c5b97e5a9f8d96a13300950d910cf04a861d96f4273d5b104"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bcf426a8c38eb57f7bf28932e68425ba86def6e756a5b8cb4731d8e62e4e0223"},
-    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:607345bd5912aacc0c5a63d45a1f73fef29e697884f7e861094e443187c02be5"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68eea5df6347d3f1378ce992d86b2af16ad7ff4dcb4a19ccdc23dea901b87fb"},
-    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:207c82978115baa1fd8d706d720b4a4d2b0913df1c78c85ba73fe6c5804505f0"},
+    {file = "rpds_py-0.19.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dab8d921b55a28287733263c0e4c7db11b3ee22aee158a4de09f13c93283c62d"},
-    {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6d1e42d2735d437e7e80bab4d78eb2e459af48c0a46e686ea35f690b93db792d"},
+    {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6fe87efd7f47266dfc42fe76dae89060038f1d9cb911f89ae7e5084148d1cc08"},
-    {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5463c47c08630007dc0fe99fb480ea4f34a89712410592380425a9b4e1611d8e"},
+    {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:535d4b52524a961d220875688159277f0e9eeeda0ac45e766092bfb54437543f"},
-    {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:06d218939e1bf2ca50e6b0ec700ffe755e5216a8230ab3e87c059ebb4ea06afc"},
+    {file = "rpds_py-0.19.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8b1a94b8afc154fbe36978a511a1f155f9bd97664e4f1f7a374d72e180ceb0ae"},
-    {file = "rpds_py-0.18.1-cp38-none-win32.whl", hash = "sha256:312fe69b4fe1ffbe76520a7676b1e5ac06ddf7826d764cc10265c3b53f96dbe9"},
+    {file = "rpds_py-0.19.0-cp38-none-win32.whl", hash = "sha256:7c98298a15d6b90c8f6e3caa6457f4f022423caa5fa1a1ca7a5e9e512bdb77a4"},
-    {file = "rpds_py-0.18.1-cp38-none-win_amd64.whl", hash = "sha256:9437ca26784120a279f3137ee080b0e717012c42921eb07861b412340f85bae2"},
+    {file = "rpds_py-0.19.0-cp38-none-win_amd64.whl", hash = "sha256:b0da31853ab6e58a11db3205729133ce0df26e6804e93079dee095be3d681dc1"},
-    {file = "rpds_py-0.18.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:19e515b78c3fc1039dd7da0a33c28c3154458f947f4dc198d3c72db2b6b5dc93"},
+    {file = "rpds_py-0.19.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:5039e3cef7b3e7a060de468a4a60a60a1f31786da94c6cb054e7a3c75906111c"},
-    {file = "rpds_py-0.18.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a7b28c5b066bca9a4eb4e2f2663012debe680f097979d880657f00e1c30875a0"},
+    {file = "rpds_py-0.19.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ab1932ca6cb8c7499a4d87cb21ccc0d3326f172cfb6a64021a889b591bb3045c"},
-    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:673fdbbf668dd958eff750e500495ef3f611e2ecc209464f661bc82e9838991e"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2afd2164a1e85226fcb6a1da77a5c8896c18bfe08e82e8ceced5181c42d2179"},
-    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d960de62227635d2e61068f42a6cb6aae91a7fe00fca0e3aeed17667c8a34611"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b1c30841f5040de47a0046c243fc1b44ddc87d1b12435a43b8edff7e7cb1e0d0"},
-    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:352a88dc7892f1da66b6027af06a2e7e5d53fe05924cc2cfc56495b586a10b72"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f757f359f30ec7dcebca662a6bd46d1098f8b9fb1fcd661a9e13f2e8ce343ba1"},
-    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e0ee01ad8260184db21468a6e1c37afa0529acc12c3a697ee498d3c2c4dcaf3"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15e65395a59d2e0e96caf8ee5389ffb4604e980479c32742936ddd7ade914b22"},
-    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4c39ad2f512b4041343ea3c7894339e4ca7839ac38ca83d68a832fc8b3748ab"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb0f6eb3a320f24b94d177e62f4074ff438f2ad9d27e75a46221904ef21a7b05"},
-    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aaa71ee43a703c321906813bb252f69524f02aa05bf4eec85f0c41d5d62d0f4c"},
+    {file = "rpds_py-0.19.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b228e693a2559888790936e20f5f88b6e9f8162c681830eda303bad7517b4d5a"},
-    {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6cd8098517c64a85e790657e7b1e509b9fe07487fd358e19431cb120f7d96338"},
+    {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2575efaa5d949c9f4e2cdbe7d805d02122c16065bfb8d95c129372d65a291a0b"},
-    {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4adec039b8e2928983f885c53b7cc4cda8965b62b6596501a0308d2703f8af1b"},
+    {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:5c872814b77a4e84afa293a1bee08c14daed1068b2bb1cc312edbf020bbbca2b"},
-    {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:32b7daaa3e9389db3695964ce8e566e3413b0c43e3394c05e4b243a4cd7bef26"},
+    {file = "rpds_py-0.19.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:850720e1b383df199b8433a20e02b25b72f0fded28bc03c5bd79e2ce7ef050be"},
-    {file = "rpds_py-0.18.1-cp39-none-win32.whl", hash = "sha256:2625f03b105328729f9450c8badda34d5243231eef6535f80064d57035738360"},
+    {file = "rpds_py-0.19.0-cp39-none-win32.whl", hash = "sha256:ce84a7efa5af9f54c0aa7692c45861c1667080814286cacb9958c07fc50294fb"},
-    {file = "rpds_py-0.18.1-cp39-none-win_amd64.whl", hash = "sha256:bf18932d0003c8c4d51a39f244231986ab23ee057d235a12b2684ea26a353590"},
+    {file = "rpds_py-0.19.0-cp39-none-win_amd64.whl", hash = "sha256:1c26da90b8d06227d7769f34915913911222d24ce08c0ab2d60b354e2d9c7aff"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cbfbea39ba64f5e53ae2915de36f130588bba71245b418060ec3330ebf85678e"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:75969cf900d7be665ccb1622a9aba225cf386bbc9c3bcfeeab9f62b5048f4a07"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3d456ff2a6a4d2adcdf3c1c960a36f4fd2fec6e3b4902a42a384d17cf4e7a65"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8445f23f13339da640d1be8e44e5baf4af97e396882ebbf1692aecd67f67c479"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7700936ef9d006b7ef605dc53aa364da2de5a3aa65516a1f3ce73bf82ecfc7ae"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5a7c1062ef8aea3eda149f08120f10795835fc1c8bc6ad948fb9652a113ca55"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:51584acc5916212e1bf45edd17f3a6b05fe0cbb40482d25e619f824dccb679de"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:462b0c18fbb48fdbf980914a02ee38c423a25fcc4cf40f66bacc95a2d2d73bc8"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:942695a206a58d2575033ff1e42b12b2aece98d6003c6bc739fbf33d1773b12f"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3208f9aea18991ac7f2b39721e947bbd752a1abbe79ad90d9b6a84a74d44409b"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b906b5f58892813e5ba5c6056d6a5ad08f358ba49f046d910ad992196ea61397"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3444fe52b82f122d8a99bf66777aed6b858d392b12f4c317da19f8234db4533"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6f8e3fecca256fefc91bb6765a693d96692459d7d4c644660a9fff32e517843"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb4bac7185a9f0168d38c01d7a00addece9822a52870eee26b8d5b61409213"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7732770412bab81c5a9f6d20aeb60ae943a9b36dcd990d876a773526468e7163"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b130bd4163c93798a6b9bb96be64a7c43e1cec81126ffa7ffaa106e1fc5cef5"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bd1105b50ede37461c1d51b9698c4f4be6e13e69a908ab7751e3807985fc0346"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:a707b158b4410aefb6b054715545bbb21aaa5d5d0080217290131c49c2124a6e"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:618916f5535784960f3ecf8111581f4ad31d347c3de66d02e728de460a46303c"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dc9ac4659456bde7c567107556ab065801622396b435a3ff213daef27b495388"},
-    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:17c6d2155e2423f7e79e3bb18151c686d40db42d8645e7977442170c360194d4"},
+    {file = "rpds_py-0.19.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:81ea573aa46d3b6b3d890cd3c0ad82105985e6058a4baed03cf92518081eec8c"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6c4c4c3f878df21faf5fac86eda32671c27889e13570645a9eea0a1abdd50922"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f148c3f47f7f29a79c38cc5d020edcb5ca780020fab94dbc21f9af95c463581"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:fab6ce90574645a0d6c58890e9bcaac8d94dff54fb51c69e5522a7358b80ab64"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b0906357f90784a66e89ae3eadc2654f36c580a7d65cf63e6a616e4aec3a81be"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:531796fb842b53f2695e94dc338929e9f9dbf473b64710c28af5a160b2a8927d"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f629ecc2db6a4736b5ba95a8347b0089240d69ad14ac364f557d52ad68cf94b0"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:740884bc62a5e2bbb31e584f5d23b32320fd75d79f916f15a788d527a5e83644"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c6feacd1d178c30e5bc37184526e56740342fd2aa6371a28367bad7908d454fc"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:998125738de0158f088aef3cb264a34251908dd2e5d9966774fdab7402edfab7"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae8b6068ee374fdfab63689be0963333aa83b0815ead5d8648389a8ded593378"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2be6e9dd4111d5b31ba3b74d17da54a8319d8168890fbaea4b9e5c3de630ae5"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78d57546bad81e0da13263e4c9ce30e96dcbe720dbff5ada08d2600a3502e526"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0cee71bc618cd93716f3c1bf56653740d2d13ddbd47673efa8bf41435a60daa"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8b6683a37338818646af718c9ca2a07f89787551057fae57c4ec0446dc6224b"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2c3caec4ec5cd1d18e5dd6ae5194d24ed12785212a90b37f5f7f06b8bedd7139"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e8481b946792415adc07410420d6fc65a352b45d347b78fec45d8f8f0d7496f0"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:27bba383e8c5231cd559affe169ca0b96ec78d39909ffd817f28b166d7ddd4d8"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bec35eb20792ea64c3c57891bc3ca0bedb2884fbac2c8249d9b731447ecde4fa"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:a888e8bdb45916234b99da2d859566f1e8a1d2275a801bb8e4a9644e3c7e7909"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:aa5476c3e3a402c37779e95f7b4048db2cb5b0ed0b9d006983965e93f40fe05a"},
-    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6031b25fb1b06327b43d841f33842b383beba399884f8228a6bb3df3088485ff"},
+    {file = "rpds_py-0.19.0-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:19d02c45f2507b489fd4df7b827940f1420480b3e2e471e952af4d44a1ea8e34"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48c2faaa8adfacefcbfdb5f2e2e7bdad081e5ace8d182e5f4ade971f128e6bb3"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a3e2fd14c5d49ee1da322672375963f19f32b3d5953f0615b175ff7b9d38daed"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d85164315bd68c0806768dc6bb0429c6f95c354f87485ee3593c4f6b14def2bd"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:93a91c2640645303e874eada51f4f33351b84b351a689d470f8108d0e0694210"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6afd80f6c79893cfc0574956f78a0add8c76e3696f2d6a15bca2c66c415cf2d4"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5b9fc03bf76a94065299d4a2ecd8dfbae4ae8e2e8098bbfa6ab6413ca267709"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa242ac1ff583e4ec7771141606aafc92b361cd90a05c30d93e343a0c2d82a89"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5a4b07cdf3f84310c08c1de2c12ddadbb7a77568bcb16e95489f9c81074322ed"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21be4770ff4e08698e1e8e0bce06edb6ea0626e7c8f560bc08222880aca6a6f"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba0ed0dc6763d8bd6e5de5cf0d746d28e706a10b615ea382ac0ab17bb7388633"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c45a639e93a0c5d4b788b2613bd637468edd62f8f95ebc6fcc303d58ab3f0a8"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:474bc83233abdcf2124ed3f66230a1c8435896046caa4b0b5ab6013c640803cc"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:910e71711d1055b2768181efa0a17537b2622afeb0424116619817007f8a2b10"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329c719d31362355a96b435f4653e3b4b061fcc9eba9f91dd40804ca637d914e"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b9bb1f182a97880f6078283b3505a707057c42bf55d8fca604f70dedfdc0772a"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef9101f3f7b59043a34f1dccbb385ca760467590951952d6701df0da9893ca0c"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1d54f74f40b1f7aaa595a02ff42ef38ca654b1469bef7d52867da474243cc633"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:0121803b0f424ee2109d6e1f27db45b166ebaa4b32ff47d6aa225642636cd834"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8d2e182c9ee01135e11e9676e9a62dfad791a7a467738f06726872374a83db49"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8344127403dea42f5970adccf6c5957a71a47f522171fafaf4c6ddb41b61703a"},
-    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:636a15acc588f70fda1661234761f9ed9ad79ebed3f2125d44be0862708b666e"},
+    {file = "rpds_py-0.19.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:443cec402ddd650bb2b885113e1dcedb22b1175c6be223b14246a714b61cd521"},
-    {file = "rpds_py-0.18.1.tar.gz", hash = "sha256:dc48b479d540770c811fbd1eb9ba2bb66951863e448efec2e2c102625328e92f"},
+    {file = "rpds_py-0.19.0.tar.gz", hash = "sha256:4fdc9afadbeb393b4bbbad75481e0ea78e4469f2e1d713a90811700830b553a9"},
 ]
 [[package]]
@ -2893,18 +2905,18 @@ files = [
 [[package]]
 name = "setuptools"
-version = "70.1.0"
+version = "70.3.0"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-70.1.0-py3-none-any.whl", hash = "sha256:d9b8b771455a97c8a9f3ab3448ebe0b29b5e105f1228bba41028be116985a267"},
+    {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"},
-    {file = "setuptools-70.1.0.tar.gz", hash = "sha256:01a1e793faa5bd89abc851fa15d0a0db26f160890c7102cd8dce643e886b47f5"},
+    {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"},
 ]
 [package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 [[package]]
 name = "six"
@ -2919,17 +2931,20 @@ files = [
 [[package]]
 name = "sympy"
-version = "1.12.1"
+version = "1.13.0"
 description = "Computer algebra system (CAS) in Python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"},
+    {file = "sympy-1.13.0-py3-none-any.whl", hash = "sha256:6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92"},
-    {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"},
+    {file = "sympy-1.13.0.tar.gz", hash = "sha256:3b6af8f4d008b9a1a6a4268b335b984b23835f26d1d60b0526ebc71d48a25f57"},
 ]
 [package.dependencies]
-mpmath = ">=1.1.0,<1.4.0"
+mpmath = ">=1.1.0,<1.4"
 [package.extras]
 dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]
 [[package]]
 name = "tbb"
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "2.0.2"
+version = "2.0.4"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
--- a/server/requirements_cuda.txt
+++ b/server/requirements_cuda.txt
@ -6,14 +6,14 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13"
-fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13"
 googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13"
 idna==3.7 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
@ -32,17 +32,17 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
 protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13"
 py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
 pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
-regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13"
-requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13"
 safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13"
-scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
-tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
--- a/server/requirements_rocm.txt
+++ b/server/requirements_rocm.txt
@ -6,14 +6,14 @@ colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_p
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13"
-fsspec==2024.3.1 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13"
 googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
 grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13"
 idna==3.7 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
@ -32,17 +32,17 @@ prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
 protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13"
 py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
 pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
-regex==2024.4.28 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13"
-requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13"
 safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13"
-scipy==1.13.0 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
-tqdm==4.66.2 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.40.1 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.11.0 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
--- a/server/tests/utils/test_layers.py
+++ b/server/tests/utils/test_layers.py
@ -1,5 +1,5 @@
 import torch
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelEmbedding,
 )
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@ -200,31 +200,27 @@ def download_weights(
        try:
            import json
-            medusa_head = hf_hub_download(
+            config = hf_hub_download(
                model_id, revision=revision, filename="medusa_lm_head.safetensors"
            )
            medusa_config = hf_hub_download(
                model_id, revision=revision, filename="config.json"
            )
-            with open(medusa_config, "r") as f:
+            with open(config, "r") as f:
                config = json.load(f)
-            model_id = config["base_model_name_or_path"]
+            base_model_id = config.get("base_model_name_or_path", None)
-            revision = "main"
+            if base_model_id and base_model_id != model_id:
-            try:
+                try:
-                utils.weight_files(model_id, revision, extension)
+                    logger.info(f"Downloading parent model {base_model_id}")
-                logger.info(
+                    download_weights(
-                    f"Files for parent {model_id} are already present on the host. "
+                        model_id=base_model_id,
-                    "Skipping download."
+                        revision="main",
-                )
+                        extension=extension,
-                return
+                        auto_convert=auto_convert,
-            # Local files not found
+                        logger_level=logger_level,
-            except (
+                        json_output=json_output,
-                utils.LocalEntryNotFoundError,
+                        trust_remote_code=trust_remote_code,
-                FileNotFoundError,
+                    )
-                utils.EntryNotFoundError,
+                except Exception:
-            ):
+                    pass
                pass
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass
@ -241,31 +237,6 @@ def download_weights(
            if not extension == ".safetensors" or not auto_convert:
                raise e
    elif (Path(model_id) / "medusa_lm_head.safetensors").exists():
        # Try to load as a local Medusa model
        try:
            import json
            medusa_head = Path(model_id) / "medusa_lm_head.safetensors"
            medusa_config = Path(model_id) / "config.json"
            with open(medusa_config, "r") as f:
                config = json.load(f)
            model_id = config["base_model_name_or_path"]
            revision = "main"
            try:
                utils.weight_files(model_id, revision, extension)
                logger.info(
                    f"Files for parent {model_id} are already present on the host. "
                    "Skipping download."
                )
                return
            # Local files not found
            except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
                pass
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass
    elif (Path(model_id) / "adapter_config.json").exists():
        # Try to load as a local PEFT model
        try:
@ -276,14 +247,43 @@ def download_weights(
            return
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass
    elif (Path(model_id) / "config.json").exists():
        # Try to load as a local Medusa model
        try:
            import json
            config = Path(model_id) / "config.json"
            with open(config, "r") as f:
                config = json.load(f)
            base_model_id = config.get("base_model_name_or_path", None)
            if base_model_id:
                try:
                    logger.info(f"Downloading parent model {base_model_id}")
                    download_weights(
                        model_id=base_model_id,
                        revision="main",
                        extension=extension,
                        auto_convert=auto_convert,
                        logger_level=logger_level,
                        json_output=json_output,
                        trust_remote_code=trust_remote_code,
                    )
                except Exception:
                    pass
        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
            pass
    # Try to see if there are local pytorch weights
    try:
        # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE
-        local_pt_files = utils.weight_files(model_id, revision, ".bin")
+        try:
            local_pt_files = utils.weight_files(model_id, revision, ".bin")
        except Exception:
            local_pt_files = utils.weight_files(model_id, revision, ".pt")
    # No local pytorch weights
-    except utils.LocalEntryNotFoundError:
+    except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
        if extension == ".safetensors":
            logger.warning(
                f"No safetensors weights found for model {model_id} at revision {revision}. "
--- a/server/text_generation_server/layers/init.py
+++ b/server/text_generation_server/layers/init.py
@ -0,0 +1,14 @@
 from text_generation_server.layers.tensor_parallel import (
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
    TensorParallelEmbedding,
 )
 from text_generation_server.layers.linear import (
    get_linear,
    FastLinear,
 )
 from text_generation_server.layers.speculative import SpeculativeHead
 # Just to add the `load` methods.
 from text_generation_server.layers.layernorm import load_layer_norm
 from text_generation_server.layers.conv import load_conv2d
--- a/server/text_generation_server/layers/awq/conversion_utils.py
+++ b/server/text_generation_server/layers/awq/conversion_utils.py
--- a/server/text_generation_server/layers/awq/quantize/qmodule.py
+++ b/server/text_generation_server/layers/awq/quantize/qmodule.py
--- a/server/text_generation_server/layers/bnb.py
+++ b/server/text_generation_server/layers/bnb.py
@ -0,0 +1,106 @@
 import torch
 from loguru import logger
 from functools import lru_cache
 import bitsandbytes as bnb
 from bitsandbytes.nn import Int8Params, Params4bit
@lru_cache(1)
 def warn_deprecate_bnb():
    """Emit the bitsandbytes 8-bit deprecation notice through the loguru logger.

    The decorator line directly above wraps this function in ``lru_cache(1)``,
    so repeated calls reuse the cached result instead of logging again.
    """
    notice = "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performnce"
    logger.warning(notice)
 class Linear8bitLt(torch.nn.Module):
    """Linear layer whose weight is held as a bitsandbytes ``Int8Params``.

    The matmul is delegated to ``bnb.matmul`` and the per-layer quantization
    state lives in a ``bnb.MatmulLtState`` stored on ``self.state``.
    """
    def __init__(
        self,
        weight,
        bias,
        has_fp16_weights=True,
        memory_efficient_backward=False,
        threshold=0.0,
        index=None,
    ):
        """Wrap ``weight`` in ``Int8Params`` and set up the matmul state.

        Args:
            weight: source weight; its ``.data`` is wrapped and the result is
                moved with ``.cuda(weight.device)``.
            bias: stored as-is on ``self.bias``; ``forward`` accepts ``None``.
            has_fp16_weights: copied onto the state and passed to
                ``Int8Params`` (also used as its ``requires_grad``).
            memory_efficient_backward: must be falsy, otherwise the assert
                below fails.
            threshold: copied onto the state; a value above 0 combined with
                ``has_fp16_weights=False`` turns on ``state.use_pool``.
            index: stored on ``self.index``; not used elsewhere in this class.
        """
        super().__init__()
        assert (
            not memory_efficient_backward
        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
        self.state = bnb.MatmulLtState()
        self.index = index
        # Necessary for stacked layers
        self.state.threshold = threshold
        self.state.has_fp16_weights = has_fp16_weights
        self.state.memory_efficient_backward = memory_efficient_backward
        if threshold > 0.0 and not has_fp16_weights:
            self.state.use_pool = True
        self.weight = Int8Params(
            weight.data,
            has_fp16_weights=has_fp16_weights,
            requires_grad=has_fp16_weights,
        )
        # NOTE(review): presumably this is the call that triggers the int8
        # quantization inside Int8Params - confirm against bitsandbytes.
        self.weight.cuda(weight.device)
        self.bias = bias
    def init_8bit_state(self):
        """Move ``CB`` and ``SCB`` from the weight onto the matmul state.

        Both attributes are set to ``None`` on the weight afterwards, which is
        what stops ``forward`` from calling this method again.
        """
        self.state.CB = self.weight.CB
        self.state.SCB = self.weight.SCB
        self.weight.CB = None
        self.weight.SCB = None
    def forward(self, x: torch.Tensor):
        """Return ``bnb.matmul(x, weight, bias, state)`` for input ``x``."""
        self.state.is_training = self.training
        # The weight still carries CB only until the state has been initialised.
        if self.weight.CB is not None:
            self.init_8bit_state()
        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        if self.bias is not None and self.bias.dtype != x.dtype:
            self.bias.data = self.bias.data.to(x.dtype)
        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
        if not self.state.has_fp16_weights:
            if self.state.CB is not None and self.state.CxB is not None:
                # we converted 8-bit row major to turing/ampere format in the first inference pass
                # we no longer need the row-major weight
                del self.state.CB
                self.weight.data = self.state.CxB
        return out
 class Linear4bit(torch.nn.Module):
    """Linear layer whose weight is held as a bitsandbytes ``Params4bit``.

    The matmul is delegated to ``bnb.matmul_4bit`` using the weight's
    ``quant_state``. ``compute_dtype`` starts as ``None``; when a caller sets
    it to a dtype, the input is cast to it before the matmul. The output is
    always cast back to the dtype of the incoming tensor.
    """
    def __init__(self, weight, bias, quant_type):
        """Wrap ``weight`` in ``Params4bit`` and keep ``bias`` unchanged.

        Args:
            weight: source weight; its ``.data`` is wrapped and the result is
                moved with ``.cuda(weight.device)``.
            bias: stored as-is on ``self.bias``; may be ``None``.
            quant_type: forwarded to ``Params4bit`` as ``quant_type``.
        """
        super().__init__()
        self.weight = Params4bit(
            weight.data,
            requires_grad=False,
            compress_statistics=True,
            quant_type=quant_type,
        )
        self.compute_dtype = None
        self.weight.cuda(weight.device)
        self.bias = bias
    def forward(self, x: torch.Tensor):
        """Return the 4-bit matmul of ``x`` with the weight, in ``x``'s dtype."""
        # the weight is handled by Params4bit, but the bias has to be cast manually
        if self.bias is not None and self.bias.dtype != x.dtype:
            self.bias.data = self.bias.data.to(x.dtype)
        if getattr(self.weight, "quant_state", None) is None:
            # Report through the module logger instead of print() so the
            # message goes through the same channel as the server's other logs.
            logger.warning(
                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
            )
        inp_dtype = x.dtype
        if self.compute_dtype is not None:
            x = x.to(self.compute_dtype)
        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
        out = bnb.matmul_4bit(
            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
        )
        out = out.to(inp_dtype)
        return out
--- a/server/text_generation_server/layers/conv.py
+++ b/server/text_generation_server/layers/conv.py
@ -0,0 +1,41 @@
 from accelerate import init_empty_weights
 import torch
@classmethod
 def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
    """Build a ``cls`` convolution whose weight and bias come from ``weights``.

    Args:
        cls: class to instantiate; it is called with the four keyword
            arguments below.
        prefix: tensor name prefix; ``{prefix}.weight`` and ``{prefix}.bias``
            are fetched through ``weights.get_tensor``.
        weights: object exposing ``get_tensor(name)``.
        in_channels, out_channels, kernel_size, stride: forwarded to ``cls``.

    Returns:
        The new instance with ``weight`` and ``bias`` replaced by
        ``torch.nn.Parameter`` wrappers around the fetched tensors.
    """
    loaded_weight = weights.get_tensor(f"{prefix}.weight")
    loaded_bias = weights.get_tensor(f"{prefix}.bias")
    conv_kwargs = dict(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
        stride=stride,
    )
    # Constructed inside accelerate's init_empty_weights(); the parameters the
    # constructor creates are replaced right below.
    with init_empty_weights():
        layer = cls(**conv_kwargs)
    layer.weight = torch.nn.Parameter(loaded_weight)
    layer.bias = torch.nn.Parameter(loaded_bias)
    return layer
@classmethod
 def load_conv2d_no_bias(
    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
 ):
    """Build a bias-free ``cls`` convolution whose weight comes from ``weights``.

    Args:
        cls: class to instantiate; it is called with the four keyword
            arguments below.
        prefix: tensor name prefix; only ``{prefix}.weight`` is fetched
            through ``weights.get_tensor``.
        weights: object exposing ``get_tensor(name)``.
        in_channels, out_channels, kernel_size, stride: forwarded to ``cls``.

    Returns:
        The new instance with ``weight`` replaced by a ``torch.nn.Parameter``
        around the fetched tensor and ``bias`` set to ``None``.
    """
    loaded_weight = weights.get_tensor(f"{prefix}.weight")
    conv_kwargs = dict(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
        stride=stride,
    )
    # Constructed inside accelerate's init_empty_weights(); the weight the
    # constructor creates is replaced right below.
    with init_empty_weights():
        layer = cls(**conv_kwargs)
    layer.weight = torch.nn.Parameter(loaded_weight)
    layer.bias = None
    return layer
 # Attach the two loader classmethods defined in this module to torch.nn.Conv2d
 # itself, so importing this module makes `torch.nn.Conv2d.load(...)` and
 # `torch.nn.Conv2d.load_no_bias(...)` available.
 torch.nn.Conv2d.load = load_conv2d
 torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
--- a/server/text_generation_server/layers/eetq.py
+++ b/server/text_generation_server/layers/eetq.py
@ -0,0 +1,25 @@
 import torch
 from EETQ import quant_weights, w8_a16_gemm
 class EETQLinear(torch.nn.Module):
    """Linear layer quantized to int8 with EETQ's ``quant_weights``.

    The quantized weight and its scale are kept on the original weight's
    device and multiplied with ``w8_a16_gemm`` in ``forward``.
    """
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        """Quantize ``weight`` and move weight, scale and bias to its device.

        Args:
            weight: source weight; converted to float16 when it has another
                dtype, then transposed and quantized on the CPU.
            bias: optional bias; moved with ``.cuda(device)`` when present.
        """
        super().__init__()
        target_device = weight.device
        fp16_weight = (
            weight if weight.dtype == torch.float16 else weight.to(dtype=torch.float16)
        )
        # quant_weights is fed the transposed weight as a contiguous CPU tensor
        transposed = torch.t(fp16_weight).contiguous().cpu()
        qweight, qscale = quant_weights(transposed, torch.int8, False)
        self.weight = qweight.cuda(target_device)
        self.scale = qscale.cuda(target_device)
        if bias is None:
            self.bias = None
        else:
            self.bias = bias.cuda(target_device)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Return ``w8_a16_gemm(input, weight, scale)`` plus the bias, if any."""
        result = w8_a16_gemm(input, self.weight, self.scale)
        if self.bias is not None:
            result = result + self.bias
        return result
--- a/server/text_generation_server/layers/fp8.py
+++ b/server/text_generation_server/layers/fp8.py
@ -0,0 +1,43 @@
 import torch
 def fp8_quantize(weight, qdtype=torch.float8_e4m3fn):
    device = weight.device
    # weight, scale = quant_weights(weight, torch.int8, False)
    finfo = torch.finfo(qdtype)
    # Calculate the scale as dtype max divided by absmax
    scale = finfo.max / weight.abs().max().clamp(min=1e-12)
    # scale and clamp the tensor to bring it to
    # the representative range of float8 data type
    # (as default cast is unsaturated)
    qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max)
    # Return both float8 data and the inverse scale (as float),
    # as both required as inputs to torch._scaled_mm
    qweight = qweight.to(qdtype)
    scale = scale.float().reciprocal()
    return qweight, scale
 class Fp8Linear(torch.nn.Module):
    """Linear layer that stores its weight quantized by ``fp8_quantize``.

    ``forward`` quantizes the input the same way and multiplies both with
    ``torch._scaled_mm``, asking for the original weight dtype as output.
    """
    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        """Quantize ``weight`` and remember its original dtype and the bias.

        Args:
            weight: source weight; its dtype becomes the matmul output dtype.
            bias: optional bias, stored unchanged (``None`` stays ``None``).
        """
        super().__init__()
        self.dtype = weight.dtype
        quantized, inv_scale = fp8_quantize(weight)
        self.qweight = quantized
        self.scale = inv_scale
        self.bias = bias
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Return the scaled fp8 matmul of ``input`` with the stored weight."""
        qinput, input_scale = fp8_quantize(input)
        result = torch._scaled_mm(
            qinput,
            self.qweight.t(),
            out_dtype=self.dtype,
            scale_a=input_scale,
            scale_b=self.scale,
            bias=self.bias,
        )
        # The call result is unpacked into two values; only the first is used.
        output, _ = result
        return output
--- a/server/text_generation_server/layers/gptq/init.py
+++ b/server/text_generation_server/layers/gptq/init.py
@ -0,0 +1,39 @@
 import os
 import torch
 from text_generation_server.utils.import_utils import (
    SYSTEM,
 )
 # Query the CUDA compute capability; any failure (for example no usable
 # device) falls back to major = 1, which keeps CAN_EXLLAMA False unless
 # SYSTEM is "rocm".
 try:
    major, _minor = torch.cuda.get_device_capability()
 except Exception:
    major = 1
 # HAS_EXLLAMA is False when no exllama kernel was imported, otherwise the
 # string "1" or "2" naming the imported kernel version.
 HAS_EXLLAMA = False
 CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
 # EXLLAMA_VERSION defaults to "2"; any other value selects the v1 kernels.
 V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
 # Only the exact string "True" in DISABLE_EXLLAMA disables the kernels.
 if os.getenv("DISABLE_EXLLAMA") == "True":
    HAS_EXLLAMA = False
 elif CAN_EXLLAMA:
    try:
        if V2:
            from text_generation_server.layers.gptq.exllamav2 import (
                QuantLinear as ExllamaQuantLinear,
                create_exllama_buffers,
                set_device,
            )
            HAS_EXLLAMA = "2"
        else:
            from text_generation_server.layers.gptq.exllama import (
                Ex4bitLinear as ExllamaQuantLinear,
                create_exllama_buffers,
                set_device,
            )
            HAS_EXLLAMA = "1"
    except ImportError:
        # Kernels could not be imported: HAS_EXLLAMA stays False and the
        # names ExllamaQuantLinear / create_exllama_buffers / set_device are
        # left undefined in this module.
        pass
 from text_generation_server.layers.gptq.quant_linear import QuantLinear
--- a/server/text_generation_server/layers/gptq/custom_autotune.py
+++ b/server/text_generation_server/layers/gptq/custom_autotune.py
--- a/server/text_generation_server/layers/gptq/exllama.py
+++ b/server/text_generation_server/layers/gptq/exllama.py
--- a/server/text_generation_server/layers/gptq/exllamav2.py
+++ b/server/text_generation_server/layers/gptq/exllamav2.py
--- a/server/text_generation_server/layers/gptq/exllamav2.py.rej
+++ b/server/text_generation_server/layers/gptq/exllamav2.py.rej
@ -0,0 +1,10 @@
 diff a/server/text_generation_server/layers/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py	(rejected hunks)
@@ -119,6 +119,8 @@ def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
                 none_tensor,
                 temp_dq,
             )
 +    else:
 +        RuntimeError("Cannot create handle")
 DEVICE = None
--- a/server/text_generation_server/layers/gptq/quant_linear.py
+++ b/server/text_generation_server/layers/gptq/quant_linear.py
@ -0,0 +1,356 @@
 import math
 import numpy as np
 import torch
 import torch.nn as nn
 from torch.cuda.amp import custom_fwd
 import triton
 import triton.language as tl
 from . import custom_autotune
 # code based https://github.com/fpgaminer/GPTQ-triton
@custom_autotune.autotune(
    configs=[
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 256,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 128,
                "BLOCK_SIZE_N": 128,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 128,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 128,
                "BLOCK_SIZE_N": 32,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 64,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=4,
            num_warps=4,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 128,
                "BLOCK_SIZE_K": 32,
                "GROUP_SIZE_M": 8,
            },
            num_stages=2,
            num_warps=8,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 64,
                "BLOCK_SIZE_N": 64,
                "BLOCK_SIZE_K": 64,
                "GROUP_SIZE_M": 8,
            },
            num_stages=3,
            num_warps=8,
        ),
        triton.Config(
            {
                "BLOCK_SIZE_M": 32,
                "BLOCK_SIZE_N": 32,
                "BLOCK_SIZE_K": 128,
                "GROUP_SIZE_M": 8,
            },
            num_stages=2,
            num_warps=4,
        ),
    ],
    key=["M", "N", "K"],
    nearest_power_of_two=True,
    prune_configs_by={
        "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
        "perf_model": None,
        "top_k": None,
    },
 )
@triton.jit
 def matmul_248_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    scales_ptr,
    zeros_ptr,
    g_ptr,
    M,
    N,
    K,
    bits,
    maxq,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_scales,
    stride_zeros,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
 ):
    """
    Compute the matrix multiplication C = A x B, unpacking and dequantizing B
    on the fly as ``(b - zero) * scale``.

    A is of shape (M, K) float16
    B is of shape (K // (32 // bits), N) int32, i.e. (K//8, N) for 4-bit weights
    C is of shape (M, N) float16
    scales is of shape (G, N) float16
    zeros is indexed as packed words holding (32 // bits) values each along N
    (NOTE(review): this docstring used to say "(G, N) float16", but the code
    below shifts and masks the loaded zeros like packed integers -- confirm
    the expected dtype/shape against the caller)
    g_ptr is of shape (K) int32

    Each program instance accumulates one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of
    C in float32 and stores it masked on M and N.
    """
    # Number of quantized values packed into one 32-bit word.
    infearure_per_bits = 32 // bits
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
    # Map the 1-D program id onto a (pid_m, pid_n) output tile; tiles are
    # visited in groups of up to GROUP_SIZE_M rows of tiles.
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    # The last group may contain fewer than GROUP_SIZE_M rows of tiles.
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + (
        offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
    )  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
    # Only rows are masked when loading A; there is no mask along K.
    a_mask = offs_am[:, None] < M
    # b_ptrs is set up such that it repeats elements along the K axis
    # (32 // bits) times (8 times for 4-bit weights)
    b_ptrs = b_ptr + (
        (offs_k[:, None] // infearure_per_bits) * stride_bk
        + offs_bn[None, :] * stride_bn
    )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
    g_ptrs = g_ptr + offs_k
    # Column offsets into scales / packed zeros; the row (group) offset is
    # added inside the loop from g_idx.
    scales_ptrs = scales_ptr + offs_bn[None, :]
    zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
    # shifter is used to extract the N bits of each element in the 32-bit word from B
    shifter = (offs_k % infearure_per_bits) * bits
    # Same idea for the zero points, which are packed along the N axis.
    zeros_shifter = (offs_bn % infearure_per_bits) * bits
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    # NOTE(review): loads of g_idx, scales, zeros and B are unmasked, so this
    # loop presumably assumes K is a multiple of BLOCK_SIZE_K and N a multiple
    # of BLOCK_SIZE_N -- TODO confirm against the config pruner / callers.
    for k in range(0, num_pid_k):
        # Group index of each of the BLOCK_SIZE_K input features in this slice.
        g_idx = tl.load(g_ptrs)
        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
        scales = tl.load(
            scales_ptrs + g_idx[:, None] * stride_scales
        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
        zeros = tl.load(
            zeros_ptrs + g_idx[:, None] * stride_zeros
        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
        zeros = (zeros >> zeros_shifter[None, :]) & maxq
        # QuantLinear.pack stores `zeros - 1`, so add the 1 back here.
        zeros = (zeros + 1) & maxq  # eventually avoid overflow
        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
        # Now we need to unpack b (which is N-bit values) into 32-bit values
        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
        b = (b - zeros) * scales  # Scale and shift
        accumulator += tl.dot(a, b)
        # Advance to the next K slice.
        # NOTE(review): a_ptrs moves by BLOCK_SIZE_K elements without using
        # stride_ak, which looks like it assumes stride_ak == 1 -- confirm.
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
        g_ptrs += BLOCK_SIZE_K
    c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
    # Guard the store against tiles that overhang the M or N edge.
    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)
 def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
    """Launch ``matmul_248_kernel`` and return ``input @ dequantized(qweight)``.

    Args:
        input: 2-D activation tensor; its two dimensions are passed to the
            kernel as M and K.
        qweight: packed weight tensor; ``qweight.shape[1]`` is passed as N.
        scales, qzeros, g_idx: quantization metadata forwarded to the kernel.
        bits: number of bits per quantized weight.
        maxq: mask applied by the kernel after shifting (``2**bits - 1`` when
            called through ``QuantLinear``).

    Returns:
        A freshly allocated float16 tensor of shape
        ``(input.shape[0], qweight.shape[1])`` on ``input.device``.
    """
    rows, inner = input.shape[0], input.shape[1]
    cols = qweight.shape[1]

    def launch_grid(meta):
        # 1-D launch: one program per (BLOCK_SIZE_M x BLOCK_SIZE_N) output tile.
        tiles_m = triton.cdiv(rows, meta["BLOCK_SIZE_M"])
        tiles_n = triton.cdiv(cols, meta["BLOCK_SIZE_N"])
        return (tiles_m * tiles_n,)

    # Make the input's device current while allocating and launching.
    with torch.cuda.device(input.device):
        result = torch.empty((rows, cols), device=input.device, dtype=torch.float16)
        matmul_248_kernel[launch_grid](
            input,
            qweight,
            result,
            scales,
            qzeros,
            g_idx,
            rows,
            cols,
            inner,
            bits,
            maxq,
            input.stride(0),
            input.stride(1),
            qweight.stride(0),
            qweight.stride(1),
            result.stride(0),
            result.stride(1),
            scales.stride(0),
            qzeros.stride(0),
        )
        return result
 class QuantLinearFunction(torch.autograd.Function):
    """Autograd wrapper around ``matmul248``; only ``forward`` is defined."""

    @staticmethod
    @custom_fwd(cast_inputs=torch.float16)
    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
        """Run the quantized matmul; ``ctx`` is not used."""
        return matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
 class QuantLinear(nn.Module):
    """Linear layer whose weights are stored as packed 2/4/8-bit integers.

    Buffers:
        qweight: int32, ``(infeatures // 32 * bits, outfeatures)``; each word
            packs ``32 // bits`` consecutive input features.
        qzeros: int32, ``(groups, outfeatures // 32 * bits)``; each word packs
            ``32 // bits`` consecutive output features (stored as ``zero - 1``).
        scales: float16, ``(groups, outfeatures)``.
        g_idx: int32, ``(infeatures,)``; group index of every input feature.
        bias: optional bias, or ``None``.
    """

    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
        """Wrap already-packed tensors.

        Raises:
            NotImplementedError: if ``bits`` is not 2, 4 or 8.
        """
        super().__init__()
        self.register_buffer("qweight", qweight)
        self.register_buffer("qzeros", qzeros)
        self.register_buffer("scales", scales)
        self.register_buffer("g_idx", g_idx)
        if bias is not None:
            self.register_buffer("bias", bias)
        else:
            self.bias = None
        if bits not in [2, 4, 8]:
            raise NotImplementedError("Only 2,4,8 bits are supported.")
        self.bits = bits
        self.maxq = 2**self.bits - 1
        self.groupsize = groupsize

        # Feature counts are derived from the packed weight's shape.
        self.outfeatures = qweight.shape[1]
        self.infeatures = qweight.shape[0] * 32 // bits

    @classmethod
    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
        """Build a zero-initialized layer of the requested geometry.

        ``bias`` is a flag: truthy allocates a float16 zero bias, falsy means
        no bias.

        Raises:
            NotImplementedError: if ``bits`` is not 2, 4 or 8.
        """
        if bits not in [2, 4, 8]:
            raise NotImplementedError("Only 2,4,8 bits are supported.")

        n_groups = math.ceil(infeatures / groupsize)
        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
        qzeros = torch.zeros((n_groups, outfeatures // 32 * bits), dtype=torch.int32)
        scales = torch.zeros((n_groups, outfeatures), dtype=torch.float16)
        # Input feature i belongs to group i // groupsize.
        g_idx = torch.tensor(
            [i // groupsize for i in range(infeatures)], dtype=torch.int32
        )
        bias = torch.zeros((outfeatures), dtype=torch.float16) if bias else None
        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)

    def pack(self, linear, scales, zeros, g_idx=None):
        """Quantize ``linear`` and store the packed result on this module.

        Args:
            linear: module exposing ``weight`` (``(outfeatures, infeatures)``)
                and ``bias`` (tensor or ``None``).
            scales: per-(output feature, group) scales; transposed internally.
            zeros: per-(output feature, group) zero points; transposed
                internally. The caller's tensor is never modified.
            g_idx: optional group index per input feature; when omitted the
                module's current ``g_idx`` is kept.

        Raises:
            NotImplementedError: if ``self.bits`` is not 2, 4 or 8.
        """
        # Checked once up front instead of on every packing iteration.
        if self.bits not in [2, 4, 8]:
            raise NotImplementedError("Only 2,4,8 bits are supported.")
        if g_idx is not None:
            self.g_idx = g_idx.clone()

        scales = scales.t().contiguous()
        zeros = zeros.t().contiguous()
        scale_zeros = zeros * scales
        self.scales = scales.clone().half()
        if linear.bias is not None:
            self.bias = linear.bias.clone().half()

        # Quantize one input feature (weight column) at a time, using the
        # scale / zero point of the group that feature belongs to.
        columns = []
        for idx in range(self.infeatures):
            group = self.g_idx[idx]
            columns.append(
                torch.round(
                    (linear.weight.data[:, idx] + scale_zeros[group])
                    / self.scales[group]
                ).to(torch.int)[:, None]
            )
        intweight = torch.cat(columns, dim=1)
        intweight = intweight.t().contiguous()
        intweight = intweight.numpy().astype(np.uint32)

        per_word = 32 // self.bits

        # Pack `per_word` consecutive input features into each int32 row,
        # lowest bits first.
        qweight = np.zeros(
            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
        )
        for row in range(qweight.shape[0]):
            base = row * per_word
            for offset in range(per_word):
                qweight[row] |= intweight[base + offset] << (self.bits * offset)
        self.qweight = torch.from_numpy(qweight.astype(np.int32))

        # Zero points are stored minus one. This must NOT be done in place:
        # `zeros.t().contiguous()` returns a view of the caller's tensor when
        # the transpose is already contiguous (e.g. a single group), and an
        # in-place subtraction would silently change the caller's data.
        zeros = (zeros - 1).numpy().astype(np.uint32)

        # Pack `per_word` consecutive output features into each int32 column.
        qzeros = np.zeros(
            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
        )
        for col in range(qzeros.shape[1]):
            base = col * per_word
            for offset in range(per_word):
                qzeros[:, col] |= zeros[:, base + offset] << (self.bits * offset)
        self.qzeros = torch.from_numpy(qzeros.astype(np.int32))

    def forward(self, x):
        """Apply the quantized linear map over the last dimension of ``x``."""
        out_shape = x.shape[:-1] + (self.outfeatures,)
        # The kernel wrapper works on a 2-D (rows, infeatures) view.
        out = QuantLinearFunction.apply(
            x.reshape(-1, x.shape[-1]),
            self.qweight,
            self.scales,
            self.qzeros,
            self.g_idx,
            self.bits,
            self.maxq,
        )
        if self.bias is not None:
            out = out + self.bias
        return out.reshape(out_shape)
--- a/server/text_generation_server/layers/gptq/quantize.py
+++ b/server/text_generation_server/layers/gptq/quantize.py
--- a/server/text_generation_server/layers/layernorm.py
+++ b/server/text_generation_server/layers/layernorm.py
@ -0,0 +1,185 @@
 import torch
 from torch import nn
 from accelerate import init_empty_weights
 from text_generation_server.utils.import_utils import (
    SYSTEM,
 )
 # Monkey patching
@classmethod
 def load_layer_norm(cls, prefix, weights, eps):
    weight = weights.get_tensor(f"{prefix}.weight")
    bias = weights.get_tensor(f"{prefix}.bias")
    with init_empty_weights():
        ln = cls(weight.shape, eps=eps)
    ln.weight = torch.nn.Parameter(weight)
    ln.bias = torch.nn.Parameter(bias)
    return ln
@classmethod
 def load_layer_norm_no_bias(cls, prefix, weights, eps):
    weight = weights.get_tensor(f"{prefix}.weight")
    with init_empty_weights():
        ln = cls(weight.shape, eps=eps)
    ln.weight = torch.nn.Parameter(weight)
    ln.bias = None
    return ln
 # Attach the two loaders to torch.nn.LayerNorm itself, so that LayerNorm (and
 # every subclass) exposes `load(prefix, weights, eps)` and
 # `load_no_bias(prefix, weights, eps)` as classmethods.
 torch.nn.LayerNorm.load = load_layer_norm
 torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
 # FastLayerNorm is defined once per supported backend. Every variant takes
 # (hidden_states, residual=None) and returns (normalized, residual_out).
 # There is no fallback branch: for any other SYSTEM value the name
 # FastLayerNorm is left undefined by this module.
 if SYSTEM == "cuda":
    import dropout_layer_norm
    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            """Return ``(layer_norm(hidden_states + residual), residual_out)``."""
            if hidden_states.shape[-1] > 8192:
                # Hidden sizes above 8192 skip the fused kernel and use the
                # stock LayerNorm forward. The residual is added IN PLACE, so
                # the caller's `hidden_states` tensor is modified.
                if residual is not None:
                    hidden_states += residual
                residual = hidden_states
                return super(FastLayerNorm, self).forward(hidden_states), residual
            else:
                # Fused residual-add + layer norm. NOTE(review): the arguments
                # are purely positional and must match the compiled
                # `dropout_add_ln_fwd` signature -- verify against the
                # extension before reordering anything. Compared with the
                # FastRMSNorm call in this file, this one passes a bias and
                # False as the last argument.
                (
                    normed_hidden_states,
                    residual,
                    *rest,
                ) = dropout_layer_norm.dropout_add_ln_fwd(
                    hidden_states,
                    residual,
                    self.weight,
                    self.bias,
                    None,
                    None,
                    None,
                    None,
                    0.0,
                    self.eps,
                    1.0,
                    0,
                    None,
                    False,
                    False,
                )
                # When the kernel hands back no residual, the input itself is
                # returned as the residual stream.
                if residual is None:
                    residual = hidden_states
                return normed_hidden_states, residual
 elif SYSTEM == "rocm":
    # `ops` is not used by FastLayerNorm below; FastRMSNorm.forward in this
    # module calls `ops.rms_norm` on ROCm.
    from vllm._C import ops
    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            """Return ``(layer_norm(hidden_states + residual), residual_out)``."""
            # In-place residual add (mutates the caller's tensor), then the
            # stock LayerNorm forward.
            if residual is not None:
                hidden_states += residual
            residual = hidden_states
            return super().forward(hidden_states), residual
 elif SYSTEM == "xpu":
    import intel_extension_for_pytorch as ipex
    class FastLayerNorm(nn.LayerNorm):
        def forward(self, hidden_states, residual=None):
            """Return ``(add_layer_norm(residual, hidden_states), residual_out)``."""
            res_out = hidden_states
            # NOTE(review): the trailing True presumably makes the kernel
            # accumulate hidden_states into `residual` in place, which is why
            # `residual` itself is returned below -- confirm with the IPEX docs.
            out = ipex.llm.functional.add_layer_norm(
                residual, hidden_states, self.weight, self.bias, self.eps, True
            )
            if residual is not None:
                res_out = residual
            return out, res_out
 class FastRMSNorm(nn.Module):
    """RMS normalization with a fused residual add, dispatched per backend.

    ``forward`` returns ``(normalized, residual_out)``. Which implementation
    runs depends on the module-level ``SYSTEM`` value and on the hidden size.
    """

    def __init__(self, weight: torch.Tensor, eps: float):
        super().__init__()
        self.weight = nn.Parameter(weight)
        self.variance_epsilon = eps

    @classmethod
    def load(cls, prefix, weights, eps=1e-6):
        """Build the layer from the ``{prefix}.weight`` tensor."""
        return cls(weights.get_tensor(f"{prefix}.weight"), eps)

    def forward(self, hidden_states, residual=None):
        """Normalize ``hidden_states`` (plus ``residual`` when given).

        Raises:
            ValueError: when no branch matches the current ``SYSTEM``.
        """
        # The checks below are evaluated in this exact order: xpu first, then
        # the large-hidden-size fallback, then cuda, then rocm.
        if SYSTEM == "xpu":
            fused = ipex.llm.functional.add_rms_norm(
                residual,
                hidden_states,
                self.weight,
                None,
                self.variance_epsilon,
                True,
            )
            return fused, (hidden_states if residual is None else residual)

        if hidden_states.shape[-1] > 8192:
            # Plain PyTorch path; the residual add is done in place.
            if residual is not None:
                hidden_states += residual
            residual = hidden_states

            upcast = hidden_states.to(torch.float32)
            mean_square = upcast.pow(2).mean(-1, keepdim=True)
            normed = upcast * torch.rsqrt(mean_square + self.variance_epsilon)

            # convert into half-precision if necessary
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                normed = normed.to(self.weight.dtype)
            return self.weight * normed, residual

        if SYSTEM == "cuda":
            # faster post attention rms norm
            normed, res, *_ = dropout_layer_norm.dropout_add_ln_fwd(
                hidden_states,
                residual,
                self.weight,
                None,
                None,
                None,
                None,
                None,
                0.0,
                self.variance_epsilon,
                1.0,
                0,
                None,
                False,
                True,  # Activate RMSNorm
            )
            return normed, (hidden_states if res is None else res)

        if SYSTEM == "rocm":
            # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
            if residual is not None:
                hidden_states += residual
            out = torch.empty_like(hidden_states)
            ops.rms_norm(
                out,
                hidden_states,
                self.weight.data,
                self.variance_epsilon,
            )
            return out, hidden_states

        raise ValueError(
            "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
        )
--- a/server/text_generation_server/layers/linear.py
+++ b/server/text_generation_server/layers/linear.py
@ -0,0 +1,216 @@
 import torch
 from torch.nn import functional as F
 from text_generation_server.utils.import_utils import SYSTEM
 if SYSTEM == "rocm":
    # FastLinearROCm relies on the custom vLLM kernels; fail at import time
    # with a clear message, keeping the original exception as the explicit
    # cause so the real failure stays visible in the traceback.
    try:
        from vllm import _custom_C
    except Exception as e:
        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}") from e
 class FastLinear(torch.nn.Module):
    """Inference-only linear layer built from existing tensors.

    ``weight`` and the optional ``bias`` are wrapped as parameters with
    ``requires_grad=False``.
    """

    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(weight, requires_grad=False)
        self.bias = (
            None if bias is None else torch.nn.Parameter(bias, requires_grad=False)
        )

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        """Build the layer from ``{prefix}.weight`` (and ``{prefix}.bias`` if ``bias``).

        ``config`` is accepted for interface compatibility and is not used.
        """
        weight = weights.get_tensor(f"{prefix}.weight")
        bias_tensor = weights.get_tensor(f"{prefix}.bias") if bias else None
        return cls(weight, bias_tensor)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Return ``input @ weight.T + bias``."""
        return F.linear(input, self.weight, self.bias)
 class FastLinearROCm(torch.nn.Module):
    """Linear layer that uses the custom ``LLMM1`` kernel for single-row inputs.

    On ROCm, an input holding exactly one row (``numel / last_dim == 1``) is
    routed to ``_custom_C.LLMM1`` when the weight geometry matches one of the
    supported cases; everything else goes through ``F.linear``.
    """

    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(weight)
        if bias is not None:
            self.bias = torch.nn.Parameter(bias)
        else:
            self.bias = None

    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        """Build the layer from ``{prefix}.weight`` (and ``{prefix}.bias`` if ``bias``).

        ``config`` is accepted for interface compatibility and is not used.
        """
        weight = weights.get_tensor(f"{prefix}.weight")
        if bias:
            bias = weights.get_tensor(f"{prefix}.bias")
        else:
            bias = None
        return cls(weight, bias)

    def forward(self, inp: torch.Tensor) -> torch.Tensor:
        """Return ``inp @ weight.T + bias`` with the same leading dimensions as ``inp``."""
        weight = self.weight
        bias = self.bias

        if SYSTEM == "rocm" and inp.numel() // inp.shape[-1] == 1:
            inp_shape = inp.shape
            # Collapse every leading dimension so the kernel always sees a
            # (1, k) matrix; m and k are read from the flattened tensor so
            # they are correct for batched (3-D) inputs as well.
            flat = inp.view(-1, inp_shape[-1])
            m, k = weight.shape[0], flat.shape[-1]

            # Fourth LLMM1 argument (8 or 4) selected from the weight
            # geometry; None means the kernel is not applicable.
            if (k == 8192 and (m == 1280 or m == 7168)) or (k == 3584 and m == 8192):
                llmm1_param = 8
            elif k <= 8192 and k % 8 == 0 and m % 4 == 0:
                llmm1_param = 4
            else:
                llmm1_param = None

            if llmm1_param is None:
                out = F.linear(flat, weight)
            else:
                # Allocate on the input's device rather than the current
                # default "cuda" device.
                out = torch.empty(
                    flat.shape[0], m, dtype=flat.dtype, device=flat.device
                )
                _custom_C.LLMM1(weight, flat, out, llmm1_param)

            # `Tensor.view` returns a new tensor, so the result has to be
            # assigned for the original leading dimensions to be restored.
            out = out.view(*inp_shape[:-1], out.shape[-1])
            if bias is not None:
                out = out + bias
            return out
        return F.linear(inp, self.weight, self.bias)
 def get_linear(weight, bias, quantize):
    """Build the linear layer matching the requested quantization scheme.

    Args:
        weight: a plain tensor for the unquantized / eetq / fp8 / bitsandbytes
            schemes; for ``"gptq"`` and ``"awq"`` a 7-tuple
            ``(qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)``.
        bias: bias tensor or ``None``.
        quantize: ``None`` or one of ``"eetq"``, ``"fp8"``, ``"bitsandbytes"``,
            ``"bitsandbytes-fp4"``, ``"bitsandbytes-nf4"``, ``"gptq"``, ``"awq"``.

    Returns:
        The constructed linear module.

    Raises:
        ImportError: EETQ is requested but cannot be imported.
        NotImplementedError: unknown scheme, incompatible ``weight`` payload,
            missing optional kernels, or AWQ requested on ROCm.

    Backend modules are imported lazily so that optional dependencies are only
    required for the scheme actually in use. Re-raised errors keep the
    original exception as their cause.
    """
    if quantize is None:
        if SYSTEM == "rocm":
            linear = FastLinearROCm(weight, bias)
        else:
            linear = FastLinear(weight, bias)
    elif quantize == "eetq":
        try:
            from text_generation_server.layers.eetq import EETQLinear

            linear = EETQLinear(weight, bias)
        except ImportError as err:
            raise ImportError(
                "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
            ) from err
    elif quantize == "fp8":
        from text_generation_server.layers.fp8 import Fp8Linear

        linear = Fp8Linear(weight, bias)
    elif quantize == "bitsandbytes":
        try:
            from text_generation_server.layers.bnb import (
                warn_deprecate_bnb,
                Linear8bitLt,
            )
        except ImportError as err:
            raise NotImplementedError(
                "Bitsandbytes is missing install it with `pip install bitsandbytes`."
            ) from err
        warn_deprecate_bnb()
        linear = Linear8bitLt(
            weight,
            bias,
            has_fp16_weights=False,
            threshold=6.0,
        )
        if bias is not None:
            # `nn` is not imported in this module; go through `torch.nn`.
            linear.bias = torch.nn.Parameter(bias)
    elif quantize == "bitsandbytes-fp4":
        try:
            from text_generation_server.layers.bnb import Linear4bit
        except ImportError as err:
            raise NotImplementedError(
                "Bitsandbytes is missing install it with `pip install bitsandbytes`."
            ) from err
        linear = Linear4bit(
            weight,
            bias,
            quant_type="fp4",
        )
    elif quantize == "bitsandbytes-nf4":
        try:
            from text_generation_server.layers.bnb import Linear4bit
        except ImportError as err:
            raise NotImplementedError(
                "Bitsandbytes is missing install it with `pip install bitsandbytes`."
            ) from err
        linear = Linear4bit(
            weight,
            bias,
            quant_type="nf4",
        )
    elif quantize == "gptq":
        try:
            qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight
        except Exception as err:
            raise NotImplementedError(
                "The passed weight is not `gptq` compatible, loader needs to be updated."
            ) from err

        if use_exllama:
            try:
                from text_generation_server.layers.gptq import (
                    ExllamaQuantLinear,
                )
            except ImportError as err:
                raise NotImplementedError(
                    "Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`"
                ) from err

            linear = ExllamaQuantLinear(
                qweight, qzeros, scales, g_idx, bias, bits, groupsize
            )
        else:
            from text_generation_server.layers.gptq.quant_linear import QuantLinear

            linear = QuantLinear(
                qweight,
                qzeros,
                scales,
                g_idx,
                bias,
                bits,
                groupsize,
            )
    elif quantize == "awq":
        try:
            qweight, qzeros, scales, _, bits, groupsize, _ = weight
        except Exception as err:
            raise NotImplementedError(
                "The passed weight is not `awq` compatible, loader needs to be updated."
            ) from err
        if SYSTEM == "rocm":
            raise NotImplementedError(
                "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
                "to use Exllama/GPTQ kernels for AWQ inference."
            )
        try:
            from text_generation_server.layers.awq.quantize.qmodule import WQLinear

            # WQLinear receives a flag telling whether a bias exists, not the
            # bias tensor itself.
            linear = WQLinear(
                w_bit=bits,
                group_size=groupsize,
                qweight=qweight,
                qzeros=qzeros,
                scales=scales,
                bias=bias is not None,
            )
        except ImportError as err:
            raise NotImplementedError(
                "You do not seem to have awq installed, either install it (cd server &&  make install-awq), or try using GPTQ `--quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
            ) from err
    else:
        raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
    return linear
--- a/server/text_generation_server/layers/medusa.py
+++ b/server/text_generation_server/layers/medusa.py
@ -0,0 +1,189 @@
 import torch
 from torch import nn
 from typing import Tuple, Optional
 from text_generation_server.utils.speculate import get_speculate
 from text_generation_server.layers.linear import FastLinear
 from text_generation_server.layers.tensor_parallel import (
    TensorParallelHead,
    TensorParallelColumnLinear,
 )
 class ResBlock(torch.nn.Module):
    """Residual block computing ``x + SiLU(linear(x))``."""

    def __init__(self, config, prefix, weights):
        super().__init__()
        linear_prefix = f"{prefix}.linear"
        self.linear = FastLinear.load(
            config, prefix=linear_prefix, weights=weights, bias=True
        )
        self.act = torch.nn.SiLU()

    def forward(self, x):
        """Add the activated projection of ``x`` back onto ``x``."""
        update = self.act(self.linear(x))
        return x + update
 class MedusaModel(torch.nn.Module):
    """Collection of ``get_speculate()`` Medusa heads evaluated on the same input."""

    def __init__(self, config, medusa_config, weights):
        super().__init__()
        heads = []
        for head_idx in range(get_speculate()):
            # Each head loads its weights under the prefix "<head index>".
            heads.append(
                MedusaHead(config, medusa_config, prefix=f"{head_idx}", weights=weights)
            )
        self.heads = torch.nn.ModuleList(heads)

    def forward(self, x):
        """Run every head on ``x`` and stack the outputs along a new dim 1."""
        per_head = [head(x) for head in self.heads]
        return torch.stack(per_head, dim=1)
 class MedusaHead(torch.nn.Module):
    """One Medusa head: a stack of ``ResBlock`` layers followed by a projection."""

    def __init__(self, config, medusa_config, prefix, weights):
        super().__init__()
        num_layers = medusa_config["medusa_num_layers"]
        self.blocks = torch.nn.ModuleList(
            [
                ResBlock(config, prefix=f"{prefix}.{layer}", weights=weights)
                for layer in range(num_layers)
            ]
        )
        # The output projection is stored right after the last block, i.e.
        # under the index equal to the number of blocks.
        n = len(self.blocks)
        self.out = FastLinear.load(
            config, prefix=f"{prefix}.{n}", weights=weights, bias=False
        )

    def forward(self, x):
        """Apply the residual blocks in order, then the output projection."""
        hidden = x
        for block in self.blocks:
            hidden = block(hidden)
        return self.out(hidden)
 class MedusaHeadV1(nn.Module):
    """LM head paired with a ``MedusaModel`` producing speculative logits."""

    def __init__(self, lm_head, medusa):
        super().__init__()
        self.lm_head = lm_head
        self.medusa = medusa

    @staticmethod
    def load(config, prefix: str, weights):
        """Load the Medusa config and weights, then build the combined head.

        Every tensor key found in the speculator files listed in
        ``config.speculator["model_paths"]`` is registered in
        ``weights.routing`` so later lookups resolve to the right file.

        Raises:
            RuntimeError: if a key is already routed to a different file.
        """
        from pathlib import Path
        from safetensors import safe_open
        import json

        speculator = config.speculator
        path = speculator["path"]

        # Read the config exactly once, before iterating over the weight
        # files: loading it inside the loop would rebind the path variable to
        # the parsed dict and break on the second file.
        medusa_config_path = str(Path(path) / "config.json")
        with open(medusa_config_path, "r") as f:
            medusa_config = json.load(f)

        for fname in speculator["model_paths"]:
            filename = str(Path(path) / fname)
            routing = weights.routing
            with safe_open(filename, framework="pytorch") as f:
                for k in f.keys():
                    if k in routing and routing[k] != filename:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename

        medusa = MedusaModel(config, medusa_config, weights)
        lm_head = TensorParallelHead.load(config, prefix, weights)
        return MedusaHeadV1(lm_head, medusa)

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Return ``(logits, speculative_logits)``.

        ``speculative_logits`` is ``None`` when ``input`` has more than 128 rows.
        """
        logits = self.lm_head(input)
        # If we have too many tokens, we skip speculative logits
        if input.shape[0] > 128:
            return logits, None

        speculative_logits = self.medusa(input)
        return logits, speculative_logits
 class MedusaHeadV2(nn.Module):
    """Medusa head that evaluates all single-layer heads with one sharded matmul.

    The per-head residual outputs are computed on each rank's slice of the
    hidden dimension, gathered across the process group, and then fed through
    the LM head together with the original hidden states.
    """

    def __init__(self, config, prefix, weights):
        super().__init__()
        from pathlib import Path
        from safetensors import safe_open
        import json

        speculator_dir = Path(config.speculator["path"])
        config_path = str(speculator_dir / "config.json")
        filename = str(speculator_dir / "medusa_lm_head.safetensors")

        with open(config_path, "r") as f:
            medusa_config = json.load(f)

        # Route every key of the Medusa checkpoint to its file, refusing keys
        # that are already routed elsewhere.
        routing = weights.routing
        with safe_open(filename, framework="pytorch") as f:
            for k in f.keys():
                if k in routing and routing[k] != filename:
                    raise RuntimeError(
                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                    )
                routing[k] = filename

        self.n_medusa_heads = get_speculate()

        # Only single-layer heads can be fused into one column-parallel linear.
        assert medusa_config["medusa_num_layers"] == 1
        head_prefixes = [f"{i}.0.linear" for i in range(self.n_medusa_heads)]
        self.linear = TensorParallelColumnLinear.load_multi(
            config,
            prefixes=head_prefixes,
            dim=0,
            weights=weights,
            bias=True,
        )

        self.process_group = weights.process_group
        self.world_size = self.process_group.size()
        self.rank = self.process_group.rank()

        self.act = torch.nn.SiLU()
        self.lm_head = TensorParallelHead.load(config, prefix, weights)

    def forward(self, x):
        """Return ``(logits, speculative_logits)``; the latter is ``None`` above 128 rows."""
        # If we have too many tokens, we skip speculative logits
        if x.shape[0] > 128:
            return self.lm_head(x), None

        # Slice of the hidden dimension owned by this rank (ceil division).
        hidden = x.shape[-1]
        shard = (hidden + self.world_size - 1) // self.world_size
        lo = self.rank * shard
        hi = (self.rank + 1) * shard
        x_shard = x[:, lo:hi]

        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
        heads_out = self.act(self.linear(x)).reshape(
            *x_shard.shape[:-1], self.n_medusa_heads, x_shard.shape[-1]
        )

        # Apply all residual medusa heads
        local = x_shard.unsqueeze(-2) + heads_out

        # Gather medusa heads
        gathered = [torch.empty_like(local) for _ in range(self.process_group.size())]
        torch.distributed.all_gather(gathered, local, group=self.process_group)
        full = torch.cat(gathered, dim=-1)

        # Stack x and medusa residual x
        stacked = torch.cat([x.unsqueeze(-2), full], dim=-2)

        # Compute lm head on x + medusa residual x
        all_logits = self.lm_head(stacked)

        # Finally, split logits from speculative logits
        logits, speculative_logits = torch.split(
            all_logits, [1, self.n_medusa_heads], dim=-2
        )
        # Squeeze added dimension
        return logits.squeeze(-2), speculative_logits
--- a/server/text_generation_server/layers/mlp.py
+++ b/server/text_generation_server/layers/mlp.py
@ -0,0 +1,176 @@
 import torch
 import math
 from torch import nn
 from torch.nn import functional as F
 from typing import Optional, Tuple
 from text_generation_server.layers import TensorParallelEmbedding, FastLinear
 from text_generation_server.layers.tensor_parallel import TensorParallelHead
 from text_generation_server.utils.speculate import get_speculate
 class MLPSpeculatorLayerNorm(nn.Module):
    """
    Root-mean-square normalization over the last axis, followed by a learned
    elementwise scale and shift.
    ...
    Args
    ----
    prefix : str
        Weight-name prefix; ``{prefix}.weight`` and ``{prefix}.bias`` are loaded.
    config : object
        Accepted for interface compatibility; not used.
    weights : object
        Provider exposing ``get_tensor(name)``.
    eps : float
        Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8).
    """

    def __init__(
        self,
        prefix,
        config,
        weights,
        eps=1e-06,
    ):
        super().__init__()
        # Stored as plain tensors exactly as returned by the weight provider.
        self.weight = weights.get_tensor(f"{prefix}.weight")
        self.bias = weights.get_tensor(f"{prefix}.bias")
        self.eps = eps

    def forward(self, x):
        """Return ``weight * x / rms(x) + bias`` with ``rms`` taken over the last axis."""
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        normed = (x * inv_rms).type_as(x)
        scaled = self.weight * normed
        return scaled + self.bias
 class MLPSpeculatorModel(torch.nn.Module):
    """Chain of ``get_speculate()`` prediction heads producing speculative distributions.

    Head ``i`` owns an embedding, a state projection, a layer norm and an
    output projection, loaded under ``{prefix}.emb.{i}``, ``{prefix}.proj.{i}``,
    ``{prefix}.ln.{i}`` and ``{prefix}.head.{i}``.
    """
    def __init__(self, config, prefix, weights):
        super().__init__()
        self.config = config
        self.n_predict = get_speculate()
        self.hidden_size = config.hidden_size
        self.emb = nn.ModuleList(
            [
                TensorParallelEmbedding(f"{prefix}.emb.{i}", weights)
                for i in range(self.n_predict)
            ]
        )
        # NOTE(review): unlike emb / head / ln, the projections are kept in a
        # plain Python list rather than an nn.ModuleList, so they are not
        # registered as submodules -- confirm whether that is intentional.
        self.proj = [
            FastLinear.load(
                config,
                prefix=f"{prefix}.proj.{i}",
                weights=weights,
                bias=False,
            )
            for i in range(self.n_predict)
        ]
        self.head = nn.ModuleList(
            [
                FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False)
                for i in range(self.n_predict)
            ]
        )
        self.ln = nn.ModuleList(
            [
                MLPSpeculatorLayerNorm(
                    prefix=f"{prefix}.ln.{i}",
                    config=config,
                    weights=weights,
                )
                for i in range(self.n_predict)
            ]
        )
        # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation
        self.state_weight = 0.5 ** (0.5 / self.n_predict)
        self.emb_weight = math.sqrt(1 - self.state_weight**2)
        self.activation = nn.GELU()
        # TODO
        self.vsize = config.vocab_size
        self.inner_dim = config.speculator_config["inner_dim"]
        # Exactly one candidate token is kept per head.
        self.top_k_tokens_per_head = [1] * self.n_predict
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_ids: torch.Tensor,
    ):
        """Return the per-head probabilities, shaped ``(b, n_predict, vocab_size)``.

        ``b`` is ``hidden_states.size(0)``. Each head consumes the running
        state and the token picked by the previous step, starting from
        ``input_ids``.
        """
        top_k_tokens_per_head = self.top_k_tokens_per_head
        # k indicates # of candidates
        # h indicates # of generated tokens
        state = hidden_states
        b = state.size(0)
        # Add a leading axis of size 1 in front of the token ids.
        ind = input_ids.unsqueeze(0)
        # Filled head by head below; allocated with the default dtype.
        all_probs = torch.empty(
            b, self.n_predict, self.vsize, device=state.device
        )  # b k h v
        assert (
            len(top_k_tokens_per_head) == self.n_predict
        ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)"
        for i in range(self.n_predict):
            # Project and predict
            z = self.emb[i](ind)
            z = z.mul(self.emb_weight * math.sqrt(self.inner_dim / 2))  # b k d
            state = self.proj[i](state) * self.state_weight + z
            state = self.activation(self.ln[i](state))  # b k d
            probs = F.log_softmax(self.head[i](state), dim=-1)  # b k v
            _probs, preds = probs.topk(top_k_tokens_per_head[i], dim=-1)  # b k k'
            # Update candidate set with new predictions
            # Update distribution set with new logits
            # Log-probabilities are exponentiated, so probabilities are stored.
            all_probs[:, i] = probs.exp()
            # Update state, log_probs and ind for new predictions
            state = state.unsqueeze(2).expand(
                -1, -1, top_k_tokens_per_head[i], -1
            )  # b k k' d
            state = state.reshape(-1, b, state.size(3))  # b kk' d
            ind = preds.view(-1, b)  # b kk'
        speculative_logits = all_probs
        return speculative_logits
 class MLPSpeculatorHead(nn.Module):
    """LM head paired with an MLP speculator producing speculative logits."""

    def __init__(self, lm_head, mlp_speculator):
        super().__init__()
        self.lm_head = lm_head
        self.mlp_speculator = mlp_speculator

    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Return ``(logits, speculative_logits)``.

        The speculator is seeded with the argmax token of ``logits``; it is
        skipped (``None``) when ``input`` has more than 128 rows.
        """
        logits = self.lm_head(input)

        speculative_logits = None
        # If we have too many tokens, we skip speculative logits
        if input.shape[0] <= 128:
            greedy_ids = logits.argmax(dim=-1)
            speculative_logits = self.mlp_speculator(input, greedy_ids)
        return logits, speculative_logits

    @staticmethod
    def load(config, prefix: str, weights):
        """Register the speculator checkpoint files and build the combined head.

        Raises:
            RuntimeError: if a tensor key is already routed to a different file.
        """
        from pathlib import Path
        from safetensors import safe_open

        speculator_path = config.speculator["path"]

        for fname in config.speculator["model_paths"]:
            filename = str(Path(speculator_path) / fname)
            routing = weights.routing
            with safe_open(filename, framework="pytorch") as f:
                # Point every key of this file at it, refusing duplicates.
                for k in f.keys():
                    if k in routing and routing[k] != filename:
                        raise RuntimeError(
                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
                        )
                    routing[k] = filename

        speculator = MLPSpeculatorModel(config, "speculator", weights)
        head = TensorParallelHead.load(config, prefix, weights)
        return MLPSpeculatorHead(head, speculator)
--- a/server/text_generation_server/layers/rotary.py
+++ b/server/text_generation_server/layers/rotary.py
@ -0,0 +1,421 @@
 import os
 import torch
 from torch import nn
 from text_generation_server.utils.import_utils import SYSTEM
 if SYSTEM == "cuda":
    from flash_attn.layers.rotary import RotaryEmbedding
    import rotary_emb
 elif SYSTEM == "rocm":
    from vllm._C import ops
 elif SYSTEM == "xpu":
    import intel_extension_for_pytorch as ipex
 def _create_inv_freq(dim, base, device):
    inv_freq = 1.0 / (
        base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
    )
    return inv_freq
 def _get_rope_config(config):
    if os.getenv("ROPE_SCALING", None) is not None:
        rope_scaling = {
            "type": os.environ["ROPE_SCALING"],
            "factor": float(os.environ["ROPE_FACTOR"]),
        }
        return rope_scaling
    return getattr(config, "rope_scaling", None)
 class PositionRotaryEmbedding(nn.Module):
    """Rotary position embedding backed by lazily built cos/sin tables.

    Instances are normally obtained through the ``static`` or ``load``
    factories, which inspect the rope-scaling configuration and return either
    this class or one of its scaling-specific subclasses.
    """
    def __init__(self, inv_freq, scaling_factor):
        super().__init__()
        self.inv_freq = inv_freq
        # The tables are filled on first use by `_update_cos_sin_cache`.
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        # When not None, positions are divided by this value (linear scaling).
        self.scaling_factor = scaling_factor
        self.dynamic_args = None
    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ):
        """Apply the rotary embedding to ``query`` and ``key``.

        The kernel is chosen from the module-level ``SYSTEM`` value. Nothing is
        returned: on CUDA and ROCm the tensors are updated in place.
        NOTE(review): the XPU call is presumably in place as well — confirm.

        Raises:
            ValueError: if ``SYSTEM`` is none of "cuda", "rocm" or "xpu".
        """
        # Such controlflows may add some overhead.
        if SYSTEM == "cuda":
            rotary_dim = cos.shape[-1]
            # The input slices are also passed as outputs, so the rotation
            # overwrites the first 2 * rotary_dim features of query/key.
            q1 = query[..., :rotary_dim]
            q2 = query[..., rotary_dim : 2 * rotary_dim]
            rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
            k1 = key[..., :rotary_dim]
            k2 = key[..., rotary_dim : 2 * rotary_dim]
            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
        elif SYSTEM == "rocm":
            # NOTE: On RoCm systems, we use a ROPE implementatation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
            # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
            head_size = query.shape[-1]
            # Inplace operation, updating query and key.
            ops.rotary_embedding(query, key, head_size, cos, sin, True)
        elif SYSTEM == "xpu":
            ipex.llm.functional.rotary_embedding(
                query, key, sin, cos, query.size(-1), True
            )
        else:
            raise ValueError(
                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
            )
    @classmethod
    def static(cls, config, dim, base, device):
        """Build an embedding from ``dim`` and ``base`` instead of stored weights.

        The rope-scaling settings returned by ``_get_rope_config(config)``
        select the concrete class: none/"linear" yield ``cls``, while
        "dynamic", "yarn" and "su" yield the matching subclass.

        Raises:
            NotImplementedError: for any other rope-scaling type.
        """
        inv_freq = _create_inv_freq(dim, base, device)
        scaling_factor = None
        rope_scaling = _get_rope_config(config)
        if rope_scaling is not None:
            if rope_scaling["type"] == "linear":
                # Linear scaling is implemented by `_update_cos_sin_cache`,
                # which divides the positions by `scaling_factor`. The factor
                # must therefore be forwarded, exactly as `load` does;
                # leaving it None silently disabled the scaling.
                scaling_factor = rope_scaling["factor"]
            elif rope_scaling["type"] == "dynamic":
                scaling_factor = rope_scaling["factor"]
                return DynamicPositionRotaryEmbedding(
                    dim=dim,
                    max_position_embeddings=config.max_position_embeddings,
                    base=base,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                )
            elif rope_scaling["type"] == "yarn":
                scaling_factor = rope_scaling["factor"]
                return YarnPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                    # Use the caller's base (as the "dynamic" branch does)
                    # rather than a hard-coded 10000.0.
                    base=base,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                    extrapolation_factor=1,
                    attn_factor=1,
                    beta_fast=32,
                    beta_slow=1,
                )
            elif rope_scaling["type"] == "su":
                short_factor = torch.tensor(
                    rope_scaling["short_factor"], dtype=torch.float32, device=device
                )
                short_inv_freq = 1.0 / (
                    short_factor
                    * base
                    ** (
                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
                        / dim
                    )
                )
                long_factor = torch.tensor(
                    rope_scaling["long_factor"], dtype=torch.float32, device=device
                )
                long_inv_freq = 1.0 / (
                    long_factor
                    * base
                    ** (
                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
                        / dim
                    )
                )
                original_max_position_embeddings = (
                    config.original_max_position_embeddings
                )
                max_position_embeddings = config.max_position_embeddings
                if max_position_embeddings <= original_max_position_embeddings:
                    scaling_factor = 1.0
                else:
                    scale = max_position_embeddings / original_max_position_embeddings
                    scaling_factor = math.sqrt(
                        1 + math.log(scale) / math.log(original_max_position_embeddings)
                    )
                return SuRotaryEmbedding(
                    short_inv_freq=short_inv_freq,
                    long_inv_freq=long_inv_freq,
                    scaling_factor=scaling_factor,
                    original_max_position_embeddings=original_max_position_embeddings,
                )
            else:
                raise NotImplementedError(
                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                )
        return cls(inv_freq, scaling_factor)
    @classmethod
    def load(cls, config, prefix, weights):
        """Build an embedding from the ``{prefix}.inv_freq`` tensor in ``weights``.

        Supports no scaling, "linear", "dynamic" and "yarn"; the latter two
        rebuild their frequencies with a base of 10000.0.

        Raises:
            NotImplementedError: for any other rope-scaling type.
        """
        # XXX: Always load this in float32 !
        dtype = weights.dtype
        weights.dtype = torch.float32
        inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
        weights.dtype = dtype
        scaling_factor = None
        rope_scaling = _get_rope_config(config)
        if rope_scaling is not None:
            scaling_factor = rope_scaling["factor"]
            if rope_scaling["type"] == "linear":
                pass
            elif rope_scaling["type"] == "dynamic":
                return DynamicPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=config.max_position_embeddings,
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                )
            elif rope_scaling["type"] == "yarn":
                return YarnPositionRotaryEmbedding(
                    dim=2 * inv_freq.shape[0],
                    max_position_embeddings=rope_scaling[
                        "original_max_position_embeddings"
                    ],
                    base=10000.0,
                    device=inv_freq.device,
                    scaling_factor=scaling_factor,
                    extrapolation_factor=1,
                    attn_factor=1,
                    beta_fast=32,
                    beta_slow=1,
                )
            else:
                raise NotImplementedError(
                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
                )
        return cls(inv_freq, scaling_factor)
    def _update_cos_sin_cache(self, dtype, device, seqlen):
        """Rebuild the cos/sin tables when they cannot serve the request.

        The tables are rebuilt when they do not exist yet, when ``seqlen``
        exceeds the cached length, or when the requested device or dtype
        differs from the cached tables.
        """
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        if (
            # Nothing cached yet: without this guard a first call with
            # seqlen == 0 would dereference None below.
            self._cos_cached is None
            or seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        ):
            self._seq_len_cached = seqlen
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            if self.scaling_factor is not None:
                t /= self.scaling_factor
            # Don't do einsum, it converts fp32 to fp16
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
    def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype):
        """
        Return cos and sin for the asked position ids

        The tables are first made to cover ``max_s`` positions, then indexed
        along dim 0 with ``position_ids``; both results get an extra dim at
        index 1.
        """
        if SYSTEM == "rocm":
            # For RoCm, we always use float cos/sin to avoid a cast.
            # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
            # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
            dtype = torch.float32
        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
        cos = torch.index_select(self._cos_cached, 0, position_ids)
        sin = torch.index_select(self._sin_cached, 0, position_ids)
        # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
        return cos.unsqueeze(1), sin.unsqueeze(1)
 class SuRotaryEmbedding(PositionRotaryEmbedding):
    """Rotary embedding that switches between two frequency sets.

    ``short_inv_freq`` is used while the requested length fits within
    ``original_max_position_embeddings``; ``long_inv_freq`` is used beyond it.
    """
    def __init__(
        self,
        short_inv_freq,
        long_inv_freq,
        scaling_factor,
        original_max_position_embeddings,
    ):
        # Deliberately skips PositionRotaryEmbedding.__init__ and runs the
        # next initializer in the MRO, so no `inv_freq` attribute is set here.
        super(PositionRotaryEmbedding, self).__init__()
        self.short_inv_freq = short_inv_freq
        self.long_inv_freq = long_inv_freq
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None
        self.dynamic_args = None
    def _update_cos_sin_cache(self, dtype, device, seqlen):
        """Rebuild the cos/sin tables when length, device or dtype require it."""
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        stale = (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        )
        if not stale:
            return
        self._seq_len_cached = seqlen
        beyond_original = seqlen > self.original_max_position_embeddings
        inv_freq = self.long_inv_freq if beyond_original else self.short_inv_freq
        positions = torch.arange(seqlen, device=device, dtype=inv_freq.dtype)
        if self.scaling_factor is not None:
            positions /= self.scaling_factor
        # torch.outer is used rather than einsum, which converts fp32 to fp16.
        angles = torch.outer(positions, inv_freq.to(device=positions.device))
        self._cos_cached = torch.cos(angles).to(dtype)
        self._sin_cached = torch.sin(angles).to(dtype)
 class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
    """Rotary embedding whose base is rescaled for long sequences.

    When a cache rebuild is requested for more than
    ``max_position_embeddings`` positions, ``inv_freq`` is recomputed from an
    enlarged base. Positions themselves are not divided by ``scaling_factor``.
    """
    def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
        super().__init__(_create_inv_freq(dim, base, device), scaling_factor)
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
    def _update_cos_sin_cache(self, dtype, device, seqlen):
        """Rebuild the cos/sin tables when length, device or dtype require it."""
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        stale = (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        )
        if not stale:
            return
        if seqlen > self.max_position_embeddings:
            growth = (
                self.scaling_factor * seqlen / self.max_position_embeddings
            ) - (self.scaling_factor - 1)
            exponent = self.dim / (self.dim - 2)
            newbase = self.base * growth**exponent
            self.inv_freq = _create_inv_freq(self.dim, newbase, self.inv_freq.device)
        self._seq_len_cached = seqlen
        positions = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
        # torch.outer is used rather than einsum, which converts fp32 to fp16.
        angles = torch.outer(positions, self.inv_freq.to(device=positions.device))
        self._cos_cached = torch.cos(angles).to(dtype)
        self._sin_cached = torch.sin(angles).to(dtype)
 # Inverse dim formula to find dim based on number of rotations
 import math
 def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
    """Return ``dim * ln(max_position_embeddings / (2 * pi * num_rotations)) / (2 * ln(base))``."""
    numerator = dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
    denominator = 2 * math.log(base)
    return numerator / denominator
 # Find dim range bounds based on rotations
 def find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
 ):
    """Return the ``(low, high)`` dimension bounds for two rotation counts.

    The lower bound is the floor of the correction dim for ``low_rot`` and the
    upper bound the ceiling of the one for ``high_rot``; they are clamped to
    ``>= 0`` and ``<= dim - 1`` respectively.
    """
    lower_bound = math.floor(
        find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    upper_bound = math.ceil(
        find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    # Clamp values just in case
    clamped_low = max(lower_bound, 0)
    clamped_high = min(upper_bound, dim - 1)
    return clamped_low, clamped_high
 def linear_ramp_mask(min, max, dim):
    """Return a float32 ramp of length ``dim`` rising from 0 at ``min`` to 1 at ``max``.

    Values outside ``[min, max]`` are clamped to 0 and 1. When both bounds are
    equal, the upper one is nudged by 0.001 to avoid dividing by zero.
    """
    # Prevent singularity
    upper = max + 0.001 if min == max else max
    positions = torch.arange(dim, dtype=torch.float32)
    return torch.clamp((positions - min) / (upper - min), 0, 1)
 def get_mscale(scale=1):
    """Return ``0.1 * ln(scale) + 1.0``, or ``1.0`` when ``scale <= 1``."""
    return 1.0 if scale <= 1 else 0.1 * math.log(scale) + 1.0
 class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
    """Rotary embedding that blends interpolated and extrapolated frequencies.

    When a cache rebuild is requested for more than
    ``max_position_embeddings`` positions, ``inv_freq`` is replaced by a
    per-dimension mix of the two frequency sets. The cos/sin tables are
    always multiplied by ``mscale``.
    """
    def __init__(
        self,
        dim,
        max_position_embeddings,
        base,
        device,
        scaling_factor,
        *,
        extrapolation_factor,
        attn_factor,
        beta_fast,
        beta_slow,
    ):
        super().__init__(_create_inv_freq(dim, base, device), scaling_factor)
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        # Get n-d magnitude scaling corrected for interpolation
        self.mscale = float(get_mscale(self.scaling_factor) * self.attn_factor)
    def _blended_inv_freq(self, device):
        """Return the mix of interpolated and extrapolated inverse frequencies."""
        extrapolated = _create_inv_freq(self.dim, self.base, self.inv_freq.device)
        wavelengths = 1.0 / extrapolated
        interpolated = 1.0 / (self.scaling_factor * wavelengths)
        low, high = find_correction_range(
            self.beta_fast,
            self.beta_slow,
            self.dim,
            self.base,
            self.max_position_embeddings,
        )
        ramp = linear_ramp_mask(low, high, self.dim // 2).float().to(device)
        # Get n-d rotational scaling corrected for extrapolation
        mask = (1 - ramp) * self.extrapolation_factor
        return interpolated * (1 - mask) + extrapolated * mask
    def _update_cos_sin_cache(self, dtype, device, seqlen):
        """Rebuild the cos/sin tables when length, device or dtype require it."""
        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        stale = (
            seqlen > self._seq_len_cached
            or self._cos_cached.device != device
            or self._cos_cached.dtype != dtype
        )
        if not stale:
            return
        if seqlen > self.max_position_embeddings:
            self.inv_freq = self._blended_inv_freq(device)
            # Get n-d magnitude scaling corrected for interpolation
            self.mscale = float(get_mscale(self.scaling_factor) * self.attn_factor)
        self._seq_len_cached = seqlen
        positions = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
        # torch.outer is used rather than einsum, which converts fp32 to fp16.
        angles = torch.outer(positions, self.inv_freq.to(device=positions.device))
        self._cos_cached = (torch.cos(angles) * self.mscale).to(dtype)
        self._sin_cached = (torch.sin(angles) * self.mscale).to(dtype)
--- a/server/text_generation_server/layers/speculative.py
+++ b/server/text_generation_server/layers/speculative.py
@ -0,0 +1,52 @@
 import torch
 import json
 from typing import Tuple, Optional
 from text_generation_server.layers.tensor_parallel import TensorParallelHead
 from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2
 from text_generation_server.layers.mlp import MLPSpeculatorHead
 class SpeculativeHead(torch.nn.Module):
    """Output head that optionally delegates to a speculator.

    Exactly one of ``head`` (plain LM head) and ``speculator`` is expected to
    be set; ``forward`` always returns ``(logits, speculative_logits)``.
    """
    def __init__(self, lm_head, speculator):
        super().__init__()
        self.head = lm_head
        self.speculator = speculator
    @staticmethod
    def load(config, prefix: str, weights):
        """Build the head described by ``config.speculator``.

        Without a speculator configuration a plain ``TensorParallelHead`` is
        loaded. Otherwise the speculator's ``config.json`` is read and stored
        on ``config.speculator_config``, then:

        * an ``architectures`` entry of ``MLPSpeculatorPreTrainedModel`` loads
          an ``MLPSpeculatorHead``;
        * a config without ``architectures`` is treated as Medusa
          (``MedusaHeadV1`` first, ``MedusaHeadV2`` if that fails);
        * any other architecture falls back to the plain head.
        """
        speculator = config.speculator
        if speculator:
            speculator_path = config.speculator["path"]
            speculator_config = str(speculator_path / "config.json")
            with open(speculator_config, "r") as f:
                speculator_config = json.load(f)
            config.speculator_config = speculator_config
            # Only the config lookup is guarded: a KeyError raised while
            # loading the MLP speculator must surface as is instead of being
            # mistaken for a Medusa checkpoint.
            try:
                architecture = speculator_config["architectures"][0]
            except KeyError:
                architecture = None
                is_medusa = True
            else:
                is_medusa = False
            if is_medusa:
                try:
                    speculator = MedusaHeadV1.load(config, prefix, weights)
                except Exception:
                    # Not a bare `except:` so that KeyboardInterrupt and
                    # SystemExit still propagate.
                    speculator = MedusaHeadV2(config, prefix, weights)
            elif architecture == "MLPSpeculatorPreTrainedModel":
                speculator = MLPSpeculatorHead.load(config, prefix, weights)
            else:
                speculator = None
            if speculator is None:
                # Unsupported architecture: without a plain head the first
                # forward call would trip the assertion below.
                lm_head = TensorParallelHead.load(config, prefix, weights)
            else:
                lm_head = None
        else:
            lm_head = TensorParallelHead.load(config, prefix, weights)
            speculator = None
        return SpeculativeHead(lm_head, speculator)
    def forward(
        self, input: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Return the speculator's output, or ``(logits, None)`` from the plain head."""
        if self.speculator is not None:
            return self.speculator(input)
        assert self.head is not None
        logits = self.head(input)
        return logits, None
--- a/server/text_generation_server/layers/tensor_parallel.py
+++ b/server/text_generation_server/layers/tensor_parallel.py
@ -0,0 +1,188 @@
 import torch
 from torch.nn import functional as F
 from typing import List
 from text_generation_server.layers.linear import get_linear, FastLinear
 class SuperLayer(torch.nn.Module):
    """Thin module wrapper around a ``linear`` layer."""
    def __init__(self, linear):
        super().__init__()
        self.linear = linear
    def forward(self, x):
        """Return the result of calling the wrapped layer's ``forward`` on ``x``."""
        wrapped = self.linear
        return wrapped.forward(x)
 class TensorParallelHead(SuperLayer):
    """Output head whose weight may be sharded across a process group.

    When ``should_gather`` is true, each rank computes its slice of the
    output and the slices are gathered from every rank in ``forward``.
    """
    def __init__(self, linear, process_group, should_gather: bool):
        super().__init__(linear)
        self.process_group = process_group
        self.should_gather = should_gather
    @staticmethod
    def load(config, prefix: str, weights):
        """Load ``{prefix}.weight`` and wrap it in a ``TensorParallelHead``.

        With more than one process the weight is loaded sharded along dim 0;
        if that raises ``AssertionError`` the whole tensor is loaded instead
        and no gathering is done.
        """
        weight_name = f"{prefix}.weight"
        should_gather = False
        if weights.process_group.size() > 1:
            try:
                weight = weights.get_sharded(weight_name, dim=0)
                should_gather = True
            except AssertionError:
                # If the vocab size is not divisible by number of shards
                # just load the entire thing.
                weight = weights.get_tensor(weight_name)
        else:
            weight = weights.get_tensor(weight_name)
        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
        quantize = (
            None if config.quantize in ["gptq", "awq", "eetq"] else config.quantize
        )
        linear = get_linear(weight, bias=None, quantize=quantize)
        return TensorParallelHead(
            linear,
            process_group=weights.process_group,
            should_gather=should_gather,
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the head and, if sharded, gather the outputs of all ranks.

        2-D inputs to a ``FastLinear`` layer take a path that writes the
        matmul into a preallocated buffer and gathers with
        ``all_gather_into_tensor``; everything else uses ``all_gather``
        followed by a concatenation along the last dimension.
        """
        if not self.should_gather:
            return super().forward(input)
        world_size = self.process_group.size()
        fast_path = len(input.shape) == 2 and isinstance(self.linear, FastLinear)
        if not fast_path:
            local_logits = super().forward(input)
            gathered = [
                torch.empty_like(local_logits)
                for _ in range(self.process_group.size())
            ]
            torch.distributed.all_gather(
                gathered, local_logits, group=self.process_group
            )
            return torch.cat(gathered, dim=-1)
        out_dim = self.linear.weight.shape[0]
        single_row = input.shape[0] == 1
        if single_row:
            world_out = input.new_empty(1, out_dim * world_size)
            local_out = input.new_empty(1, out_dim)
            gather_input = local_out
        else:
            # Buffers are laid out transposed: `local_out` is a transposed
            # view of `gather_input`, so the matmul below fills the buffer
            # that is gathered, and the result is transposed back on return.
            world_out = input.new_empty(out_dim * world_size, input.shape[0])
            gather_input = input.new_empty(out_dim, input.shape[0])
            local_out = gather_input.T
        torch.mm(input, self.linear.weight.T, out=local_out)
        torch.distributed.all_gather_into_tensor(
            world_out, gather_input, group=self.process_group
        )
        return world_out if single_row else world_out.T
 class TensorParallelColumnLinear(SuperLayer):
    """Linear layer built from column-sharded weights."""
    @classmethod
    def load_gate_up(cls, config, prefix: str, weights, bias: bool):
        """Load a layer whose gate and up projections are stored packed together.

        Raises:
            NotImplementedError: if ``bias`` is requested.
        """
        packed_weight = weights.get_weights_col_packed_gate_up(
            prefix, quantize=config.quantize
        )
        if bias:
            raise NotImplementedError("packed_gate_up only implemented without bias")
        return cls(get_linear(packed_weight, None, config.quantize))
    @classmethod
    def load_qkv(cls, config, prefix: str, weights, bias: bool):
        """Load a layer whose Q, K and V projections are stored packed together.

        Raises:
            NotImplementedError: if ``bias`` is requested.
        """
        packed_weight = weights.get_weights_col_packed_qkv(
            prefix, quantize=config.quantize
        )
        if bias:
            raise NotImplementedError("packed_qkv only implemented for baichuan")
        return cls(get_linear(packed_weight, None, config.quantize))
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        """Load a single-prefix layer; shorthand for ``load_multi`` with ``dim=0``."""
        return cls.load_multi(config, [prefix], weights, bias, dim=0)
    @classmethod
    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
        """Load the weights of several prefixes as one layer.

        When ``bias`` is true, each prefix's bias is loaded sharded along
        dim 0 and the pieces are concatenated along ``dim``.
        """
        weight = weights.get_multi_weights_col(
            prefixes, quantize=config.quantize, dim=dim
        )
        if bias:
            bias_shards = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
            merged_bias = torch.cat(bias_shards, dim=dim)
        else:
            merged_bias = None
        return cls(get_linear(weight, merged_bias, config.quantize))
 class TensorParallelRowLinear(SuperLayer):
    """Linear layer built from row-sharded weights, all-reduced after the matmul."""
    def __init__(self, linear, process_group):
        super().__init__(linear)
        self.process_group = process_group
    @classmethod
    def load(cls, config, prefix: str, weights, bias: bool):
        """Load the row-sharded weight for ``prefix`` and, on rank 0 only, its bias."""
        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
        # The bias is only loaded on the rank-0 process; every other rank
        # gets None.
        wants_bias = bias and weights.process_group.rank() == 0
        loaded_bias = weights.get_tensor(f"{prefix}.bias") if wants_bias else None
        linear = get_linear(weight, loaded_bias, config.quantize)
        return cls(linear, process_group=weights.process_group)
    def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
        """Apply the layer, then all-reduce the output across the process group.

        The reduction is skipped when the group has a single process or when
        ``reduce`` is false.
        """
        out = super().forward(input)
        needs_reduce = self.process_group.size() > 1 and reduce
        if needs_reduce:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
 class TensorParallelEmbedding(torch.nn.Module):
    """Embedding table sharded along dim 0 across a process group.

    Each rank holds the rows for ids in ``[min_id, max_id)`` plus one extra
    all-zero row; ids outside that range are looked up in the zero row.
    """
    def __init__(self, prefix: str, weights, reduce=True):
        super().__init__()
        weight_name = f"{prefix}.weight"
        shard = weights.get_partial_sharded(weight_name, dim=0)
        num_embeddings = weights.get_shape(weight_name)[0]
        group = weights.process_group
        world_size = group.size()
        rank = group.rank()
        # Ceil division: number of rows assigned to each rank.
        block_size = (num_embeddings + world_size - 1) // world_size
        self.min_id = rank * block_size
        self.max_id = min(num_embeddings, (rank + 1) * block_size)
        # Usually block_size, might be less in non even vocab_size.
        self.null_idx = shard.shape[0]
        self.process_group = weights.process_group
        self.reduce = reduce
        # Additional 0 entry used for masking
        padded = F.pad(shard, (0, 0, 0, 1))
        self.weight = torch.nn.Parameter(padded)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Embed ``input`` ids, all-reducing across ranks when ``reduce`` is set."""
        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
        # translate for [0, self.max_id - self.min_id[
        outside_shard = (self.min_id > input) | (input >= self.max_id)
        local_ids = torch.where(outside_shard, self.null_idx, input - self.min_id)
        out = torch.nn.functional.embedding(local_ids, self.weight)
        if self.reduce and self.process_group.size() > 1:
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
--- a/server/text_generation_server/models/init.py
+++ b/server/text_generation_server/models/init.py
@ -1,9 +1,10 @@
 import torch
 import os
 from loguru import logger
 from transformers.configuration_utils import PretrainedConfig
 from transformers.models.auto import modeling_auto
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, HfApi
 from typing import Optional
 from pathlib import Path
@ -15,6 +16,12 @@ from text_generation_server.models.model import Model
 from text_generation_server.models.causal_lm import CausalLM
 from text_generation_server.models.bloom import BLOOM
 from text_generation_server.models.starcoder import StarCoder
 from text_generation_server.models.vlm_causal_lm import VlmCausalLM
 from text_generation_server.models.custom_modeling.llava_next import (
    LlavaNextForConditionalGeneration,
 )
 from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
@ -40,8 +47,9 @@ def get_model(
    config_dict, _ = PretrainedConfig.get_config_dict(
        model_id, revision=revision, trust_remote_code=trust_remote_code
    )
    model_type = config_dict.get("model_type", None)
-    use_medusa = None
+    speculator = None
    if "medusa_num_heads" in config_dict:
        medusa_model_id = model_id
        medusa_revision = revision
@ -61,6 +69,8 @@ def get_model(
        config_dict, _ = PretrainedConfig.get_config_dict(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        # Reload model type from parent.
        model_type = config_dict.get("model_type", None)
        is_local = Path(medusa_model_id).exists()
        if not is_local:
            medusa_config = hf_hub_download(
@ -71,11 +81,70 @@ def get_model(
                revision=medusa_revision,
                filename="medusa_lm_head.safetensors",
            )
-            use_medusa = Path(medusa_config).parent
+            speculator = {
                "path": Path(medusa_config).parent,
                "model_paths": ["medusa_lm_head.safetensors"],
            }
        else:
-            use_medusa = Path(medusa_model_id)
+            speculator = {
                "path": Path(medusa_model_id),
                "model_paths": ["medusa_lm_head.safetensors"],
            }
        method = "medusa"
    elif model_type == "mlp_speculator":
        mlp_model_id = model_id
        mlp_revision = revision
        model_id = config_dict["base_model_name_or_path"]
        revision = "main"
        speculate_mlp = config_dict["n_predict"]
        if speculate is not None:
            if speculate > speculate_mlp:
                raise RuntimeError(
                    f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match"
                )
            else:
                set_speculate(speculate)
        else:
            set_speculate(speculate_mlp)
        config_dict, _ = PretrainedConfig.get_config_dict(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        # Reload model type from parent.
        model_type = config_dict.get("model_type", None)
        is_local = Path(mlp_model_id).exists()
        extension = ".safetensors"
        if not is_local:
            mlp_speculator_config = hf_hub_download(
                mlp_model_id, revision=mlp_revision, filename="config.json"
            )
            api = HfApi()
            info = api.model_info(mlp_model_id, revision=mlp_revision)
            filenames = [
                s.rfilename
                for s in info.siblings
                if s.rfilename.endswith(extension)
                and len(s.rfilename.split("/")) == 1
                and "arguments" not in s.rfilename
                and "args" not in s.rfilename
                and "training" not in s.rfilename
            ]
            for filename in filenames:
                hf_hub_download(
                    mlp_model_id,
                    revision=mlp_revision,
                    filename=filename,
                )
            speculator = {
                "path": Path(mlp_speculator_config).parent,
                "model_paths": filenames,
            }
        else:
            speculator = Path(mlp_model_id)
            filenames = [p for p in os.listdir(speculator) if p.endswith(extension)]
            speculator = {"path": speculator, "model_paths": filenames}
        method = "mlp_speculator"
    else:
        method = "n-gram"
@ -92,7 +161,18 @@ def get_model(
        return BLOOM(
            model_id,
            revision,
-            use_medusa=use_medusa,
+            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
    if model_type == "llava_next":
        return VlmCausalLM(
            model_class=LlavaNextForConditionalGeneration,
            model_id=model_id,
            revision=revision,
            quantize=None,
            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
@ -101,7 +181,7 @@ def get_model(
        return CausalLM(
            model_id,
            revision,
-            use_medusa=use_medusa,
+            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@ -35,14 +35,14 @@ class BLOOM(CausalLM):
        self,
        model_id: str,
        revision: Optional[str] = None,
-        use_medusa: Optional[str] = None,
+        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        super(BLOOM, self).__init__(
            model_id=model_id,
            revision=revision,
-            use_medusa=use_medusa,
+            speculator=speculator,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
--- a/server/text_generation_server/models/cache_manager.py
+++ b/server/text_generation_server/models/cache_manager.py
@ -2,7 +2,7 @@ import math
 import torch
 from typing import Optional, List, Tuple
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
 BLOCK_SIZE: int = 16
 # Will be set in warmup
@ -25,7 +25,7 @@ class CacheManager:
        self.repeat_slots = repeat_slots
        element_size = torch.tensor([], dtype=dtype).element_size()
-        if IS_XPU_SYSTEM:
+        if SYSTEM == "xpu":
            x = 1
        else:
            x = self.block_size // element_size
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@ -367,6 +367,7 @@ class CausalLMBatch(Batch):
        input_lengths = [b.input_length for b in batches]
        max_input_length = max(input_lengths)
        offsets = [max_input_length - b.input_length for b in batches]
        cur_padding = [b.right_padding for b in batches]
        # For prefill there is a space allocated only for first token
        # Need to add padding to the max total tokens before first decode
@ -596,13 +597,15 @@ class CausalLM(Model):
        self,
        model_id: str,
        revision: Optional[str] = None,
-        use_medusa: Optional[str] = None,
+        speculator: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
        if speculator:
            raise RuntimeError("Speculator decoding is not enabled for AutoModel")
        self.prev_bs = 0
        if use_medusa:
            raise RuntimeError("Medusa decoding is not enabled for AutoModel")
        # Create tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
--- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
@ -32,7 +32,7 @@ from transformers.modeling_outputs import (
 )
 from transformers import BloomConfig, PreTrainedModel
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
--- a/server/text_generation_server/models/custom_modeling/clip.py
+++ b/server/text_generation_server/models/custom_modeling/clip.py
@ -15,7 +15,7 @@ from transformers.modeling_outputs import (
 )
 from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
--- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
@ -26,18 +26,22 @@ from transformers.activations import ACT2FN
 from typing import Optional, List, Tuple
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.import_utils import IS_ROCM_SYSTEM, IS_CUDA_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
    get_linear,
 )
 from text_generation_server.layers.layernorm import (
    FastLayerNorm,
 )
 from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
 )
-if IS_CUDA_SYSTEM:
+if SYSTEM == "cuda":
    import dropout_layer_norm
 else:
    dropout_layer_norm = None
@ -52,7 +56,7 @@ class CohereRotary(PositionRotaryEmbedding):
        sin: torch.Tensor,
    ):
        # Such controlflows may add some overhead.
-        if IS_CUDA_SYSTEM:
+        if SYSTEM == "cuda":
            import rotary_emb
            q1 = query[..., ::2]
@ -64,8 +68,8 @@ class CohereRotary(PositionRotaryEmbedding):
            k2 = key[..., 1::2]
            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
-        elif IS_ROCM_SYSTEM:
+        elif SYSTEM == "rocm":
-            from vllm import pos_encoding_ops
+            from vllm._C import ops
            # NOTE: On ROCm systems, we use a RoPE implementation adapted from vLLM which launches a single kernel for both query/key, contrary to the flash-attn implementation used on NVIDIA systems.
            # When compiling flash-attn rotary on ROCm, it appears hipcc is unable to unroll loops, resulting in even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
@ -73,7 +77,7 @@ class CohereRotary(PositionRotaryEmbedding):
            head_size = query.shape[-1]
            # Inplace operation, updating query and key.
-            pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, False)
+            ops.rotary_embedding(query, key, head_size, cos, sin, False)
        else:
            raise ValueError(
                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
@ -90,7 +94,7 @@ class CohereLayerNorm(nn.Module):
        self.eps = eps
    def forward(self, hidden_states):
-        if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
+        if hidden_states.shape[-1] > 8192 or SYSTEM == "rocm":
            hidden_states = hidden_states.reshape(
                -1, self.weight.shape[0], self.weight.shape[1]
            )
--- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
@ -21,21 +21,26 @@ from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple, Any
 from loguru import logger
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
-if not IS_XPU_SYSTEM:
+if SYSTEM != "xpu":
    from vllm.model_executor.layers.fused_moe import fused_moe
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    FastLinear,
    FastLayerNorm,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
    get_linear,
 )
 from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
 )
 from text_generation_server.layers.layernorm import (
    FastLayerNorm,
 )
 from text_generation_server.utils.log import log_once
@ -216,7 +221,7 @@ def _load_gqa(config, prefix: str, weights):
        bits, groupsize, desc_act, quant_method = weights._get_gptq_params()
-        from text_generation_server.utils.layers import HAS_EXLLAMA
+        from text_generation_server.layers import HAS_EXLLAMA
        use_exllama = (
            bits == 4 and HAS_EXLLAMA and config.quantize == "gptq" and not desc_act
@ -236,7 +241,7 @@ def _load_gqa(config, prefix: str, weights):
            log_once(
                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
            )
-            from text_generation_server.utils.awq.conversion_utils import (
+            from text_generation_server.layers.awq.conveersion_utils import (
                fast_awq_to_gptq,
            )
--- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@ -27,13 +27,15 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
    get_linear,
 )
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.layernorm import (
    FastRMSNorm,
 )
@ -97,8 +99,13 @@ class GemmaConfig(PretrainedConfig):
 class GemmaFastRMSNorm(FastRMSNorm):
    @classmethod
    def load(cls, prefix, weights, eps=1e-6):
        dtype = weights.dtype
        weights.dtype = torch.float32
        weight = weights.get_tensor(f"{prefix}.weight") + 1
-        return cls(weight, eps)
+        weights.dtype = dtype
        new = cls(weight, eps)
        new.dtype = dtype
        return new
    # perform the multiplication in full precision and downcast after
    def forward(self, hidden_states, residual=None):
@ -109,7 +116,7 @@ class GemmaFastRMSNorm(FastRMSNorm):
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states * self.weight
-        return hidden_states.to(self.weight.dtype), residual
+        return hidden_states.to(self.dtype), residual
 def load_attention(config, prefix, weights):
@ -151,15 +158,11 @@ def _load_gqa(config, prefix: str, weights):
 class FlashGemmaAttention(torch.nn.Module):
-    def __init__(
+    def __init__(self, prefix: str, config, weights, causal: bool):
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_size = config.head_dim
        self.causal = causal
        self.rotary_emb = PositionRotaryEmbedding.static(
            config=config,
@ -236,6 +239,7 @@ class FlashGemmaAttention(torch.nn.Module):
                cu_seqlen_prefill,
                max_s,
                self.softmax_scale,
                causal=self.causal,
            )
        # Decode
        else:
@ -293,11 +297,10 @@ class GemmaMLP(nn.Module):
 class FlashGemmaLayer(nn.Module):
-    def __init__(self, layer_id, config, weights):
+    def __init__(self, prefix, config, weights, causal: bool):
        super().__init__()
        prefix = f"model.layers.{layer_id}"
        self.self_attn = FlashGemmaAttention(
-            prefix=f"{prefix}.self_attn", config=config, weights=weights
+            prefix=f"{prefix}.self_attn", config=config, weights=weights, causal=causal
        )
        self.mlp = GemmaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
@ -349,30 +352,25 @@ class FlashGemmaLayer(nn.Module):
 class FlashGemmaModel(torch.nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights, causal: bool):
        super().__init__()
        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
        embed_norm = config.hidden_size**0.5
        self.embed_tokens = TensorParallelEmbedding(
            prefix="model.embed_tokens", weights=weights
        )
        self.embed_tokens.weight *= embed_norm
        self.layers = nn.ModuleList(
            [
                FlashGemmaLayer(
-                    layer_id,
+                    prefix=f"{prefix}.layers.{layer_id}",
-                    config,
+                    config=config,
-                    weights,
+                    weights=weights,
                    causal=causal,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
        self.norm = GemmaFastRMSNorm.load(
-            prefix="model.norm", weights=weights, eps=config.rms_norm_eps
+            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
        )
        self.gradient_checkpointing = False
@ -383,7 +381,7 @@ class FlashGemmaModel(torch.nn.Module):
    def forward(
        self,
-        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
@ -392,7 +390,7 @@ class FlashGemmaModel(torch.nn.Module):
        input_lengths: torch.Tensor,
        max_s: int,
    ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        hidden_states = inputs_embeds
        # Get rotary cos and sin for this forward
        # Avoid to index in each layer
@ -421,13 +419,30 @@ class FlashGemmaModel(torch.nn.Module):
 class FlashGemmaForCausalLM(torch.nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights, causal: bool):
        super().__init__()
-        self.model = FlashGemmaModel(config, weights)
+        embed_norm = config.hidden_size**0.5
        if prefix is None:
            prefix = "model"
        else:
            prefix = f"{prefix}.model"
        self.embed_tokens = TensorParallelEmbedding(
            prefix=f"{prefix}.embed_tokens", weights=weights
        )
        self.embed_tokens.weight *= embed_norm
        self.model = FlashGemmaModel(
            prefix=prefix, config=config, weights=weights, causal=causal
        )
        self.lm_head = SpeculativeHead.load(
-            config,
+            prefix=(
-            prefix="model.embed_tokens" if config.tie_word_embeddings else "lm_head",
+                f"{prefix}.embed_tokens"
                if config.tie_word_embeddings
                else f"{prefix}.lm_head"
            ),
            config=config,
            weights=weights,
        )
@ -443,8 +458,9 @@ class FlashGemmaForCausalLM(torch.nn.Module):
        max_s: int,
        lm_head_indices: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_embeds = self.embed_tokens(input_ids)
        hidden_states = self.model(
-            input_ids,
+            input_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
--- a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py
@ -0,0 +1,454 @@
 # coding=utf-8
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
 # original forms to accommodate minor architectural differences compared
 # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
 import torch.distributed
 from torch import nn
 from transformers.activations import ACT2FN
 from typing import Optional, List, Tuple
 from text_generation_server.utils import paged_attention, flash_attn
 from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    get_linear,
 )
 def load_qkv(config, prefix: str, weights, head_size, num_heads):
    """Load the fused QKV projection stored under ``{prefix}.c_attn``.

    Dispatches on ``config.quantize``: ``"gptq"`` checkpoints go through
    ``_load_qkv_gptq`` (which takes neither ``head_size`` nor ``num_heads``);
    every other setting goes through ``_load_qkv``.
    """
    if config.quantize != "gptq":
        return _load_qkv(config, prefix, weights, head_size, num_heads)
    return _load_qkv_gptq(config, prefix, weights)
 def _load_qkv_gptq(config, prefix: str, weights):
    """Load the GPTQ variant of the fused QKV projection.

    The quantized weight is fetched through
    ``weights.get_weights_col_packed_qkv``. The bias is sharded here: the
    ``{prefix}.c_attn.bias`` vector is treated as three equally sized segments
    (presumably Q, K and V, in that order) and this rank's block is taken out
    of each one.
    """
    process_group = weights.process_group
    world_size = process_group.size()
    rank = process_group.rank()

    # Weights
    weight = weights.get_weights_col_packed_qkv(f"{prefix}.c_attn", config.quantize)

    # Bias
    bias_slice = weights._get_slice(f"{prefix}.c_attn.bias")
    total_size = bias_slice.get_shape()[0]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    single_size = total_size // 3
    assert single_size % world_size == 0
    block_size = single_size // world_size
    start = rank * block_size
    stop = start + block_size
    # Same relative block inside each of the three segments.
    shards = [
        bias_slice[start + offset : stop + offset]
        for offset in (0, single_size, 2 * single_size)
    ]
    bias = torch.cat(shards, dim=0).to(device=weights.device)

    return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize))
 def _load_qkv(config, prefix: str, weights, head_size, num_heads):
    """Load QKV from a single, transposed matrix.

    ``{prefix}.c_attn.weight`` is split along dim 1 into three equally sized
    segments; this rank's block of each segment is concatenated and the result
    is transposed. ``{prefix}.c_attn.bias`` is sharded the same way along its
    only dimension.

    Args:
        config: model config; only ``config.quantize`` is read here.
        prefix: checkpoint prefix of the attention module.
        weights: weight loader providing ``_get_slice``, ``process_group``,
            ``dtype`` and ``device``.
        head_size: size of one attention head.
        num_heads: number of heads handled by this rank.

    Returns:
        A ``TensorParallelColumnLinear`` wrapping the sharded weight and bias.

    Raises:
        AssertionError: if a packed tensor cannot be split evenly in three or
            across ranks, or if the sharded bias does not have
            ``3 * num_heads * head_size`` elements.
    """
    world_size = weights.process_group.size()
    rank = weights.process_group.rank()

    # Weights
    slice_ = weights._get_slice(f"{prefix}.c_attn.weight")
    total_size = slice_.get_shape()[1]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    single_size = total_size // 3
    assert single_size % world_size == 0
    block_size = single_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size
    tensors = [
        slice_[:, start + i * single_size : stop + i * single_size]
        for i in range(3)
    ]
    weight = torch.cat(tensors, dim=1).T
    weight = weight.to(dtype=weights.dtype)
    weight = weight.to(device=weights.device)

    # Bias
    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
    total_size = slice_.get_shape()[0]
    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
    single_size = total_size // 3
    # Validate divisibility before deriving the per-rank block size from it.
    assert single_size % world_size == 0
    block_size = single_size // world_size
    start = rank * block_size
    stop = (rank + 1) * block_size
    b = [
        slice_[start + i * single_size : stop + i * single_size]
        for i in range(3)
    ]
    bias = torch.cat(b, dim=0)
    bias = bias.to(dtype=weights.dtype)
    bias = bias.to(device=weights.device)

    # Report the shape that is actually being checked (the bias, not the weight).
    expected_bias_shape = [3 * num_heads * head_size]
    assert (
        list(bias.shape) == expected_bias_shape
    ), f"{list(bias.shape)} != {expected_bias_shape}"

    return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize))
 def load_row(config, prefix: str, weights, bias: bool):
    """Build a row-parallel linear layer from a transposed checkpoint weight.

    Non-GPTQ weights are sharded along dim 0 and transposed; GPTQ weights are
    fetched with ``weights.get_multi_weights_row``. When ``bias`` is requested
    the bias tensor is loaded on rank 0 only and every other rank gets
    ``None``.
    """
    if config.quantize != "gptq":
        weight = weights.get_sharded(f"{prefix}.weight", dim=0).T
    else:
        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)

    # The bias lives on the first rank only (presumably so that it is added a
    # single time when the partial outputs of all ranks are combined).
    bias_tensor = None
    if bias and weights.process_group.rank() == 0:
        bias_tensor = weights.get_tensor(f"{prefix}.bias")

    linear = get_linear(weight, bias_tensor, config.quantize)
    return TensorParallelRowLinear(linear, process_group=weights.process_group)
 def load_col(config, prefix: str, weights, bias: bool):
    """Build a column-parallel linear layer from a transposed checkpoint weight.

    Non-GPTQ weights are sharded along dim 1 and transposed; GPTQ weights are
    fetched with ``weights.get_multi_weights_col``. When ``bias`` is requested
    it is sharded along dim 0, otherwise the layer has no bias.
    """
    if config.quantize != "gptq":
        weight = weights.get_sharded(f"{prefix}.weight", dim=1).T
    else:
        weight = weights.get_multi_weights_col(
            [prefix], quantize=config.quantize, dim=1
        )

    bias_tensor = weights.get_sharded(f"{prefix}.bias", dim=0) if bias else None
    return TensorParallelColumnLinear(get_linear(weight, bias_tensor, config.quantize))
 class FlashGPT2Attention(torch.nn.Module):
    """GPT-2 self-attention using flash attention for prefill and paged
    attention for decode, with heads split across the process group."""

    def __init__(
        self,
        prefix: str,
        config,
        weights,
    ):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        self.softmax_scale = self.head_size**-0.5

        shard_count = weights.process_group.size()
        if self.num_heads % shard_count != 0:
            raise ValueError(
                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                f"and `num_shards`: {weights.process_group.size()}"
            )
        # From here on `num_heads` is the number of heads owned by this shard.
        self.num_heads //= shard_count

        self.query_key_value = load_qkv(
            config,
            prefix=prefix,
            weights=weights,
            head_size=self.head_size,
            num_heads=self.num_heads,
        )
        self.o_proj = load_row(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=True,
        )
        # One KV head per query head: the mapping is simply 0..num_heads-1.
        self.kv_head_mapping = torch.arange(
            0, self.num_heads, dtype=torch.int32, device=weights.device
        )

    def forward(
        self,
        hidden_states,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        input_lengths,
        max_s,
    ):
        """Run attention over ``hidden_states`` and project the result.

        ``cu_seqlen_prefill`` selects the path: when it is not ``None`` the
        call is a prefill and flash attention is used; otherwise it is a
        decode step served from the paged KV cache.
        """
        fused = self.query_key_value(hidden_states)
        query, key, value = fused.split(self.head_size * self.num_heads, dim=1)

        per_head = (-1, self.num_heads, self.head_size)
        query = query.view(*per_head)
        key = key.view(*per_head)
        value = value.view(*per_head)

        # Store the new keys/values in the cache before attending.
        paged_attention.reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)

        # Output buffer filled in place by the attention kernels.
        attn_output = torch.empty_like(query)

        if cu_seqlen_prefill is None:
            # Decode
            paged_attention.attention(
                attn_output,
                query,
                kv_cache[0],
                kv_cache[1],
                self.kv_head_mapping,
                self.softmax_scale,
                block_tables,
                input_lengths,
                max_s,
            )
        else:
            # Prefill: flash attention
            flash_attn.attention(
                query,
                key,
                value,
                attn_output,
                cu_seqlen_prefill,
                max_s,
                self.softmax_scale,
            )

        flat_output = attn_output.view(-1, self.num_heads * self.head_size)
        return self.o_proj(flat_output)
 class GPT2MLP(nn.Module):
    """GPT-2 feed-forward block: ``c_fc`` -> activation -> ``c_proj``."""

    # Activation names that denote the tanh approximation of GELU.
    # "gelu_new" (the GPT-2 default) is that approximation as well, so it must
    # not fall back to the exact (erf) GELU.
    _TANH_GELU_ACTIVATIONS = ("gelu_fast", "gelu_pytorch_tanh", "gelu_new")

    @staticmethod
    def _resolve_activation(act):
        """Return the callable implementing the activation named ``act``.

        Names that do not contain ``"gelu"`` are looked up in ``ACT2FN``.
        GELU variants are routed to ``torch.nn.functional.gelu``, using the
        tanh approximation for the names in ``_TANH_GELU_ACTIVATIONS`` and the
        exact form for every other GELU name.
        """
        if "gelu" not in act:
            return ACT2FN[act]
        approximate = "tanh" if act in GPT2MLP._TANH_GELU_ACTIVATIONS else "none"
        return lambda x: torch.nn.functional.gelu(x, approximate=approximate)

    def __init__(self, prefix, config, weights):
        """Load ``{prefix}.c_fc`` (column-parallel) and ``{prefix}.c_proj``
        (row-parallel), both with bias, and pick the activation named by
        ``config.activation_function``."""
        super().__init__()
        self.act = self._resolve_activation(config.activation_function)

        self.c_fc = load_col(
            config, prefix=f"{prefix}.c_fc", weights=weights, bias=True
        )
        self.c_proj = load_row(
            config,
            prefix=f"{prefix}.c_proj",
            weights=weights,
            bias=True,
        )

        # `n_inner` may be unset, in which case the width is 4 * hidden_size.
        intermediate_size = (
            config.n_inner if config.n_inner is not None else 4 * config.hidden_size
        )
        # Per-shard width of the intermediate projection.
        self.intermediate_size = intermediate_size // weights.process_group.size()

    def forward(self, hidden_states):
        """Apply ``c_fc``, the activation and ``c_proj`` in sequence."""
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        return self.c_proj(hidden_states)
 class FlashGPT2Layer(nn.Module):
    """One GPT-2 block: LayerNorm -> attention -> residual add, then
    LayerNorm -> MLP -> residual add."""

    def __init__(self, prefix, config, weights):
        super().__init__()
        eps = config.layer_norm_epsilon

        self.self_attn = FlashGPT2Attention(
            prefix=f"{prefix}.attn", config=config, weights=weights
        )
        self.mlp = GPT2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)

        self.input_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.ln_1", weights=weights, eps=eps
        )
        self.post_attention_layernorm = nn.LayerNorm.load(
            prefix=f"{prefix}.ln_2", weights=weights, eps=eps
        )

    def forward(
        self,
        hidden_states,
        residual,
        cu_seqlen_prefill,
        kv_cache,
        block_tables,
        slots,
        input_lengths,
        max_s,
    ):
        """Run the block and return ``(output, post_attention_states)``.

        The incoming ``residual`` argument is not read: the residual stream is
        taken from ``hidden_states`` itself.
        """
        # Self Attention
        attn_output = self.self_attn(
            self.input_layernorm(hidden_states),
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
        )
        post_attention = attn_output + hidden_states

        # Feed-forward on the normalized post-attention states.
        mlp_output = self.mlp(self.post_attention_layernorm(post_attention))

        return post_attention + mlp_output, post_attention
 class FlashGPT2Model(torch.nn.Module):
    """Stack of ``FlashGPT2Layer`` blocks followed by the final LayerNorm."""

    def __init__(self, prefix, config, weights):
        super().__init__()

        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()

        def scoped(name):
            # Tensor names are only nested under `prefix` when one is given.
            return name if not prefix else f"{prefix}.{name}"

        blocks = [
            FlashGPT2Layer(
                prefix=scoped(f"h.{layer_id}"),
                config=config,
                weights=weights,
            )
            for layer_id in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(blocks)

        self.norm = nn.LayerNorm.load(
            prefix=scoped("ln_f"),
            weights=weights,
            eps=config.layer_norm_epsilon,
        )

        self.gradient_checkpointing = False

        # Expose the attention geometry of the first layer on the model.
        first_attention = self.layers[0].self_attn
        self.head_size = first_attention.head_size
        self.num_heads = first_attention.num_heads

    def forward(
        self,
        inputs_embeds: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        input_lengths: torch.Tensor,
        max_s: int,
        true_max_s: int,
        prefill_cache_indices: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Feed ``inputs_embeds`` through every layer and the final norm.

        Layer ``i`` receives ``kv_cache[i]``. ``position_ids``, ``true_max_s``
        and ``prefill_cache_indices`` are accepted but not used here.
        """
        hidden_states, residual = inputs_embeds, None

        for layer_idx, layer in enumerate(self.layers):
            layer_cache = kv_cache[layer_idx]
            hidden_states, residual = layer(
                hidden_states,
                residual,
                cu_seqlen_prefill,
                layer_cache,
                block_tables,
                slots,
                input_lengths,
                max_s,
            )

        return self.norm(hidden_states)
 class FlashGPT2ForCausalLM(torch.nn.Module):
    """GPT-2 with token and position embeddings in front of the transformer
    and a language-model head loaded from the ``wte`` tensor."""

    def __init__(self, prefix, config, weights):
        super().__init__()

        # Tensor names are only nested under `prefix` when one is given.
        wte_prefix = "wte" if not prefix else f"{prefix}.wte"
        wpe_prefix = "wpe" if not prefix else f"{prefix}.wpe"

        self.embed_tokens = TensorParallelEmbedding(
            prefix=wte_prefix,
            weights=weights,
        )
        self.embed_positions = TensorParallelEmbedding(
            prefix=wpe_prefix,
            weights=weights,
        )

        self.model = FlashGPT2Model(prefix, config, weights)
        # The head is loaded from the same `wte` tensor as the token embedding.
        self.lm_head = SpeculativeHead.load(
            config,
            prefix=wte_prefix,
            weights=weights,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        input_lengths: torch.Tensor,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Return ``(logits, speculative_logits)`` for the given tokens.

        When ``lm_head_indices`` is given, only the hidden states at those
        indices are passed to the head.
        """
        inputs_embeds = self.embed_tokens(input_ids) + self.embed_positions(
            position_ids
        )

        hidden_states = self.model(
            inputs_embeds,
            position_ids,
            cu_seqlen_prefill,
            kv_cache,
            block_tables,
            slots,
            input_lengths,
            max_s,
            true_max_s=max_s,
            prefill_cache_indices=prefill_cache_indices,
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]

        logits, speculative_logits = self.lm_head(hidden_states)
        return logits, speculative_logits
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@ -18,43 +18,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List, Optional, Tuple
 import torch
 import torch.distributed
 from torch import nn
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
-    get_linear,
+)
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.layernorm import (
    FastRMSNorm,
 )
 if SYSTEM == "rocm":
    try:
        from vllm import _custom_C
    except Exception as e:
        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
 def load_attention(config, prefix, weights):
    bias = config.attention_bias
    if config.num_attention_heads != config.num_key_value_heads:
-        return _load_gqa(config, prefix, weights)
+        return TensorParallelColumnLinear.load_multi(
            config,
            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
            dim=0,
            weights=weights,
            bias=bias,
        )
    else:
        if config.model_type == "baichuan":
            return TensorParallelColumnLinear.load_qkv(
                config,
                prefix=f"{prefix}.W_pack",
                weights=weights,
-                bias=False,
+                bias=bias,
            )
        elif config.model_type == "phi3":
            return TensorParallelColumnLinear.load_qkv(
                config,
                prefix=f"{prefix}.qkv_proj",
                weights=weights,
-                bias=False,
+                bias=bias,
            )
        else:
            return TensorParallelColumnLinear.load_multi(
@ -62,36 +78,10 @@ def load_attention(config, prefix, weights):
                prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
                dim=0,
                weights=weights,
-                bias=False,
+                bias=bias,
            )
 def _load_gqa(config, prefix: str, weights):
    """Load the fused q/k/v projection when the number of key/value heads
    differs from the number of attention heads.

    The three projections are fetched together with
    ``weights.get_multi_weights_col`` along dim 0. For non-GPTQ/AWQ weights
    the result is cast to the loader's dtype/device and its shape is checked
    against the per-rank head counts.

    Raises:
        AssertionError: if the head counts do not divide evenly, or if the
            loaded weight does not have the expected per-rank shape.
    """
    assert config.hidden_size % config.num_attention_heads == 0
    assert config.num_attention_heads % weights.process_group.size() == 0

    weight = weights.get_multi_weights_col(
        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
        quantize=config.quantize,
        dim=0,
    )

    if config.quantize not in ["gptq", "awq"]:
        weight = weight.to(dtype=weights.dtype).to(device=weights.device)

        head_size = config.hidden_size // config.num_attention_heads
        num_heads = config.num_attention_heads // weights.process_group.size()
        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
        # Build the expected shape once, from the per-rank head counts, so the
        # failure message shows exactly what the check compares against.
        expected_shape = [
            (num_heads + 2 * num_key_value_heads) * head_size,
            config.hidden_size,
        ]
        assert (
            list(weight.shape) == expected_shape
        ), f"{list(weight.shape)} != {expected_shape}"

    return TensorParallelColumnLinear(
        get_linear(weight, bias=None, quantize=config.quantize)
    )
 class FlashLlamaAttention(torch.nn.Module):
    def __init__(
        self,
@ -200,24 +190,27 @@ class FlashLlamaAttention(torch.nn.Module):
 class LlamaMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
-        act = config.hidden_act
+        self.hidden_act = config.hidden_act
        self.act = (
-            ACT2FN[act]
+            ACT2FN[self.hidden_act]
-            if "gelu" not in act
+            if "gelu" not in self.hidden_act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
-                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                    "tanh"
                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        )
        # Fuse gate and up proj
        bias = getattr(config, "mlp_bias", False)
        if config.model_type == "phi3":
            self.gate_up_proj = TensorParallelColumnLinear.load_gate_up(
                config,
                prefix=f"{prefix}.gate_up_proj",
                weights=weights,
-                bias=False,
+                bias=bias,
            )
        else:
            self.gate_up_proj = TensorParallelColumnLinear.load_multi(
@ -225,22 +218,40 @@ class LlamaMLP(nn.Module):
                prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
                weights=weights,
                dim=0,
-                bias=False,
+                bias=bias,
            )
        self.down_proj = TensorParallelRowLinear.load(
            config,
            prefix=f"{prefix}.down_proj",
            weights=weights,
-            bias=False,
+            bias=bias,
        )
        self.intermediate_size = (
            config.intermediate_size // weights.process_group.size()
        )
        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize
    def forward(self, hidden_states):
-        gate_up_states = self.gate_up_proj(hidden_states)
+        if (
-        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
+            SYSTEM == "rocm"
-        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+            and self.hidden_act == "silu"
            and hidden_states.shape[0] == 1
            and not self.quantize
        ):
            out = torch.empty(
                hidden_states.shape[0],
                self.intermediate_size,
                dtype=hidden_states.dtype,
                device="cuda",
            )
            _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8)
            return self.down_proj(out)
        else:
            gate_up_states = self.gate_up_proj(hidden_states)
            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
            return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
 class FlashLlamaLayer(nn.Module):
@ -383,9 +394,14 @@ class FlashLlamaForCausalLM(torch.nn.Module):
            weights=weights,
        )
        self.model = FlashLlamaModel(prefix, config, weights)
        # With tied word embeddings the head is loaded from the embedding
        # weights' key instead of a dedicated `lm_head` key.
        if config.tie_word_embeddings:
            suffix = "model.embed_tokens"
        else:
            suffix = "lm_head"
        self.lm_head = SpeculativeHead.load(
            config,
-            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
+            prefix=suffix if not prefix else f"{prefix}.{suffix}",
            weights=weights,
        )
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@ -26,18 +26,28 @@ from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
    get_linear,
 )
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.layernorm import (
    FastRMSNorm,
 )
 if SYSTEM == "rocm":
    try:
        from vllm import _custom_C
    except Exception as e:
        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
 class MistralConfig(PretrainedConfig):
    model_type = "mistral"
@ -249,14 +259,16 @@ class MistralAttention(torch.nn.Module):
 class MistralMLP(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
-        act = config.hidden_act
+        self.hidden_act = config.hidden_act
        self.act = (
-            ACT2FN[act]
+            ACT2FN[self.hidden_act]
-            if "gelu" not in act
+            if "gelu" not in self.hidden_act
            else lambda x: torch.nn.functional.gelu(
                x,
                approximate=(
-                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                    "tanh"
                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
                    else "none"
                ),
            )
        )
@ -278,10 +290,28 @@ class MistralMLP(nn.Module):
            config.intermediate_size // weights.process_group.size()
        )
        # TODO: This is a hotfix to be removed & properly refactored.
        self.quantize = config.quantize
    def forward(self, hidden_states):
-        gate_up_states = self.gate_up_proj(hidden_states)
+        if (
-        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
+            SYSTEM == "rocm"
-        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+            and self.hidden_act == "silu"
            and hidden_states.shape[0] == 1
            and not self.quantize
        ):
            out = torch.empty(
                hidden_states.shape[0],
                self.intermediate_size,
                dtype=hidden_states.dtype,
                device="cuda",
            )
            _custom_C.LLMM_Silu(self.gate_up_proj.linear.weight, hidden_states, out, 8)
            return self.down_proj(out)
        else:
            gate_up_states = self.gate_up_proj(hidden_states)
            gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
            return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
 class MistralLayer(nn.Module):
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@ -24,9 +24,9 @@ import torch.distributed
 import numpy as np
 from torch import nn
-from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
+from text_generation_server.utils.import_utils import SYSTEM
-if not IS_XPU_SYSTEM:
+if SYSTEM != "xpu":
    from vllm.model_executor.layers.fused_moe import fused_moe
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
@ -34,16 +34,20 @@ from typing import Optional, List, Tuple
 from loguru import logger
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    FastLinear,
    FastRMSNorm,
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
    get_linear,
 )
 from text_generation_server.layers.layernorm import (
    FastRMSNorm,
 )
 from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
 )
 class MixtralConfig(PretrainedConfig):
--- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
@ -29,15 +29,19 @@ from typing import Optional, List, Tuple
 from text_generation_server.utils import paged_attention, flash_attn
 from text_generation_server.utils.flash_attn import attention
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    SpeculativeHead,
    FastLayerNorm,
    PositionRotaryEmbedding,
    get_linear,
 )
 from text_generation_server.layers.layernorm import (
    FastLayerNorm,
 )
 from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
 )
 def load_row(config, prefix: str, weights, bias: bool):
--- a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
@ -0,0 +1,110 @@
 # coding=utf-8
 # Copyright 2024 HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
 import torch.distributed
 from torch import nn
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear
 from text_generation_server.models.custom_modeling.vlm import (
    load_text_model,
    load_vision_model,
 )
 class PaliGemmaForConditionalGeneration(nn.Module):
    """PaliGemma: a vision tower, a linear projector and a text model.

    When pixel values are supplied, the vision tower output is projected and
    written into the token embeddings at every position whose input id equals
    ``config.image_token_index``; the text model then runs on the result.
    """

    def __init__(self, prefix, config, weights):
        super().__init__()

        def scoped(name):
            # Nest the weight key under `prefix` only when a prefix is given.
            return f"{prefix}.{name}" if prefix else name

        # The vision tower inherits the top-level quantization setting.
        vision_config = config.vision_config
        vision_config.quantize = config.quantize
        self.vision_tower = load_vision_model(
            prefix=scoped("vision_tower"),
            config=vision_config,
            weights=weights,
        )

        # The projector key is used as-is; it is not nested under `prefix`.
        self.multi_modal_projector = TensorParallelColumnLinear.load(
            config,
            prefix="multi_modal_projector.linear",
            weights=weights,
            bias=True,
        )

        self.vocab_size = config.vocab_size
        self.config = config

        # The text model inherits the speculator and quantization settings.
        text_config = config.text_config
        text_config.speculator = config.speculator
        text_config.quantize = config.quantize
        self.text_model = load_text_model(
            prefix=scoped("language_model"),
            config=text_config,
            weights=weights,
        )

        # Fall back to -1 when the config defines no padding token.
        pad_token_id = config.pad_token_id
        self.pad_token_id = -1 if pad_token_id is None else pad_token_id

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        cu_seqlen_prefill: Optional[torch.Tensor],
        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
        block_tables: torch.Tensor,
        slots: torch.Tensor,
        input_lengths: torch.Tensor,
        max_s: int,
        prefill_cache_indices: Optional[torch.Tensor] = None,
        lm_head_indices: Optional[torch.Tensor] = None,
        pixel_values: torch.FloatTensor = None,
        # Unused here
        pixel_attention_mask: Optional[torch.BoolTensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Run the text model, optionally injecting image features.

        On prefill (``cu_seqlen_prefill`` is not None) ``position_ids`` is
        incremented in place, so the caller's tensor is modified, and
        ``max_s`` is raised by one. ``prefill_cache_indices``,
        ``pixel_attention_mask`` and ``image_sizes`` are accepted but not
        used by this method.

        Returns:
            The ``(logits, speculative_logits)`` pair produced by the text
            model's ``lm_head``.
        """
        inputs_embeds = self.text_model.embed_tokens(input_ids)

        is_prefill = cu_seqlen_prefill is not None
        if is_prefill:
            # TODO This is odd but apparently pali gemma position ids start at 1.
            position_ids += 1
            max_s += 1

        if pixel_values is not None:
            vision_out = self.vision_tower(
                pixel_values.to(dtype=inputs_embeds.dtype)
            )
            image_features = self.multi_modal_projector(vision_out.last_hidden_state)
            # Select the positions that hold the image placeholder token ...
            image_token_mask = input_ids == self.config.image_token_index
            # ... and overwrite their embeddings with the flattened features.
            inputs_embeds[image_token_mask] = image_features.view(
                -1, image_features.shape[-1]
            )

        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
            kv_cache=kv_cache,
            block_tables=block_tables,
            slots=slots,
            input_lengths=input_lengths,
            max_s=max_s,
        )

        # Keep only the rows the caller asked the head to score.
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]

        logits, speculative_logits = self.text_model.lm_head(hidden_states)
        return logits, speculative_logits
--- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
@ -7,15 +7,19 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
    get_linear,
 )
 from text_generation_server.layers.layernorm import (
    FastLayerNorm,
 )
 from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
 )
 class PhiConfig(PretrainedConfig):
--- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@ -6,13 +6,15 @@ from transformers.activations import ACT2FN
 from typing import Optional, List, Tuple
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
    get_linear,
 )
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.layernorm import (
    FastRMSNorm,
 )
--- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
@ -1,22 +1,21 @@
 from typing import List, Optional, Tuple
 import torch
 import torch.distributed
 from torch import nn
 from transformers.modeling_utils import PreTrainedModel
 from transformers.configuration_utils import PretrainedConfig
-from typing import Optional, List, Tuple
+from transformers.modeling_utils import PreTrainedModel
-from text_generation_server.utils import paged_attention, flash_attn
+from text_generation_server.layers import (
-from text_generation_server.utils.flash_attn import attention
+    SpeculativeHead,
 from text_generation_server.utils.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
-    SpeculativeHead,
+    TensorParallelRowLinear,
    FastLayerNorm,
    PositionRotaryEmbedding,
    get_linear,
 )
 from text_generation_server.layers.layernorm import FastLayerNorm
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.utils import flash_attn, paged_attention
 def load_row(config, prefix: str, weights, bias: bool):
@ -48,6 +47,7 @@ class RWConfig(PretrainedConfig):
        hidden_size=64,
        num_hidden_layers=None,
        num_attention_heads=None,
        num_ln_in_prallel_attention=None,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
@ -61,6 +61,7 @@ class RWConfig(PretrainedConfig):
        new_decoder_architecture=None,
        bias=False,
        parallel_attn=False,
        rope_theta=10_000.0,
        **kwargs,
    ):
        if alibi:
@ -71,6 +72,7 @@ class RWConfig(PretrainedConfig):
        self.model_type = model_type
        self.alibi = False
        self.rotary = True
        self.rope_theta = rope_theta
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
@ -87,6 +89,7 @@ class RWConfig(PretrainedConfig):
            else kwargs.pop("n_head", 8)
        )
        self.layer_norm_epsilon = layer_norm_epsilon
        self.num_ln_in_parallel_attention = num_ln_in_prallel_attention
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
@ -128,9 +131,13 @@ class FlashRWAttention(torch.nn.Module):
        self.num_heads_kv = config.n_head_kv
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.num_heads
        self.rope_theta = config.rope_theta
        self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config, dim=self.head_size, base=10000.0, device=weights.device
+            config=config,
            dim=self.head_size,
            base=self.rope_theta,
            device=weights.device,
        )
        self.softmax_scale = self.head_size ** (-0.5)
@ -240,9 +247,13 @@ class FlashRWLargeAttention(torch.nn.Module):
        self.hidden_size = hidden_size
        self.head_size = hidden_size // num_heads
        self.num_groups = num_groups
        self.rope_theta = config.rope_theta
        self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config, dim=self.head_size, base=10000.0, device=weights.device
+            config=config,
            dim=self.head_size,
            base=self.rope_theta,
            device=weights.device,
        )
        self.softmax_scale = self.head_size ** (-0.5)
@ -253,7 +264,7 @@ class FlashRWLargeAttention(torch.nn.Module):
        if process_group.size() > self.num_groups:
            raise NotImplementedError(
-                f"Tensor Parallelism is not implemented for world_size > n groups"
+                "Tensor Parallelism is not implemented for world_size > n groups"
            )
        if self.num_groups % process_group.size() != 0:
            raise NotImplementedError(
@ -455,29 +466,61 @@ class FlashRWLayer(nn.Module):
                max_s,
            )
-            hidden_states, residual = self.post_attention_layernorm(
+            if self.post_attention_layernorm is not None:
-                hidden_states, residual
+                hidden_states, residual = self.post_attention_layernorm(
-            )
+                    hidden_states, residual
                )
            mlp_output = self.mlp(hidden_states)
            return mlp_output, residual
 class FlashRWLayerNorm(nn.Module):
    """Layer norm front-end that holds either one or two ``FastLayerNorm``s.

    With one norm, the same normalized tensor feeds both the attention and the
    MLP branch. With two norms, ``ln_attn`` normalizes the attention input and
    ``ln_mlp`` is applied to the residual returned by ``ln_attn``.
    """

    def __init__(self, config, prefix, weights):
        super().__init__()
        # NOTE(review): RWConfig in this file stores its constructor argument
        # as `num_ln_in_parallel_attention`; `num_ln_in_parallel_attn` is a
        # different attribute name and presumably has to come from the
        # checkpoint config itself -- confirm.
        self.num_ln = config.num_ln_in_parallel_attn

        if self.num_ln not in (1, 2):
            raise ValueError("Number of layer norms can either be 1 or 2.")

        eps = config.layer_norm_epsilon
        if self.num_ln == 1:
            self.input_ln = FastLayerNorm.load(
                prefix=f"{prefix}.input_layernorm", weights=weights, eps=eps
            )
            return

        self.ln_attn = FastLayerNorm.load(
            prefix=f"{prefix}.ln_attn", weights=weights, eps=eps
        )
        self.ln_mlp = FastLayerNorm.load(
            prefix=f"{prefix}.ln_mlp", weights=weights, eps=eps
        )

    def forward(
        self,
        hidden_states,
        residual,
    ):
        """Return ``(attention_input, mlp_input, residual)``."""
        if self.num_ln == 2:
            attn_normed, residual = self.ln_attn(hidden_states, residual)
            mlp_normed, _ = self.ln_mlp(residual)
            return attn_normed, mlp_normed, residual
        if self.num_ln == 1:
            normed, residual = self.input_ln(hidden_states, residual)
            # A single norm serves both branches.
            return normed, normed, residual
        # Any other count is rejected in __init__; nothing is returned here.
        return None
 class FlashRWLargeLayer(nn.Module):
    def __init__(self, layer_id, config, weights):
        super().__init__()
        prefix = f"transformer.h.{layer_id}"
-        self.ln_attn = FastLayerNorm.load(
+
-            prefix=f"{prefix}.ln_attn",
+        self.ln_layer = FlashRWLayerNorm(config, prefix, weights)
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.ln_mlp = FastLayerNorm.load(
            prefix=f"{prefix}.ln_mlp",
            weights=weights,
            eps=config.layer_norm_epsilon,
        )
        self.self_attention = FlashRWLargeAttention(
            config,
@ -503,8 +546,8 @@ class FlashRWLargeLayer(nn.Module):
        input_lengths,
        max_s,
    ):
-        ln_attn, residual = self.ln_attn(hidden_states, residual)
+        # Layer norm.
-        ln_mlp, _ = self.ln_mlp(residual)
+        ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual)
        # Self attention.
        attn_output = self.self_attention(
--- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
@ -6,14 +6,16 @@ from transformers.activations import ACT2FN
 from typing import Optional, List, Tuple
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    SpeculativeHead,
    TensorParallelEmbedding,
    FastLayerNorm,
    get_linear,
 )
 from text_generation_server.layers.layernorm import (
    FastLayerNorm,
 )
 def load_multi_mqa(
@ -80,13 +82,13 @@ def _load_multi_mqa_gptq(
            g_idx = g_idx.to(device=weights.device)
        elif quant_method == "awq":
            g_idx = None
-            from text_generation_server.utils.awq.conversion_utils import (
+            from text_generation_server.layers.awq.conversion_utils import (
                fast_awq_to_gptq,
            )
            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
-        from text_generation_server.utils.layers import HAS_EXLLAMA
+        from text_generation_server.layers.gptq import HAS_EXLLAMA
        use_exllama = HAS_EXLLAMA
        weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
--- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
@ -27,15 +27,19 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
    SpeculativeHead,
    get_linear,
-    FastRMSNorm,
+)
 from text_generation_server.layers.layernorm import (
    FastLayerNorm,
    FastRMSNorm,
 )
 from text_generation_server.layers.rotary import (
    PositionRotaryEmbedding,
 )
--- a/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@ -29,7 +29,7 @@ from text_generation_server.models.custom_modeling.vlm import (
 )
 from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
@ -683,9 +683,9 @@ class Idefics2ForConditionalGeneration(nn.Module):
    def __init__(self, prefix, config, weights):
        super().__init__()
        config.vision_config.quantize = config.quantize
-        config.vision_config.use_medusa = config.use_medusa
+        config.vision_config.speculator = config.speculator
        config.text_config.quantize = config.quantize
-        config.text_config.use_medusa = config.use_medusa
+        config.text_config.speculator = config.speculator
        vision_config = config.vision_config
        self.text_model = load_text_model(
--- a/Show More
+++ b/Show More
		`@ -0,0 +1,3 @@`
							`# Using TGI with Intel Gaudi`

							`Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index).`
		`@ -0,0 +1,3 @@`
							`# Using TGI with Inferentia`

							`Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.`