Add some modifications from 2.3.0 that were missing because of a conflict

Signed-off-by: yuanwu <yuan.wu@intel.com>
2025-06-30 21:10:16 +00:00 · 2024-09-25 07:49:49 +00:00 · 2024-09-25 07:49:49 +00:00 · 14fdc4ae5e
commit 14fdc4ae5e
parent 514a5a737d
26 changed files with 2946 additions and 1094 deletions
--- a/.devcontainer/Dockerfile.trtllm
+++ b/.devcontainer/Dockerfile.trtllm
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
--- a/.github/workflows/autodocs.yaml
+++ b/.github/workflows/autodocs.yaml
@ -0,0 +1,45 @@
+name: Automatic Documentation for Launcher  # runs `update_doc.py --check` on every pull request
+
+on:
+  pull_request:
+
+jobs:
+  update_docs:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4  # v4 matches the checkout version used in build.yaml
+
+    - name: Set up Rust
+      uses: actions-rs/toolchain@v1
+      with:
+        profile: minimal
+        toolchain: stable
+
+    - name: Install Protocol Buffers compiler
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y protobuf-compiler libprotobuf-dev
+
+    # The launcher and router binaries are installed before the check step;
+    # presumably update_doc.py invokes them - confirm against the script.
+    - name: Install Launcher
+      id: install-launcher
+      run: cargo install --path launcher/
+
+    - name: Install router
+      id: install-router
+      run: cargo install --path backends/v3/
+
+    - uses: actions/setup-node@v4  # Node is needed for the npm install in the last step
+      with:
+        node-version: 22
+
+    - name: Set up Python
+      uses: actions/setup-python@v4  # v4 matches the setup-python version used in build.yaml
+      with:
+        python-version: '3.x'
+
+    - name: Check that documentation is up-to-date
+      run: |
+        npm install -g @redocly/cli
+        python update_doc.py --check
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@ -0,0 +1,191 @@
+name: Build and push docker image to internal registry
+
+on:
+  workflow_call:
+    inputs:
+      hardware:
+        type: string
+        description: Hardware
+          # options:
+          # - cuda
+          # - rocm
+          # - intel-xpu
+          # - intel-cpu
+        required: true
+      release-tests:
+        description: "Run release integration tests"
+        required: true
+        default: false
+        type: boolean
+
+jobs:
+  # Builds the Docker image for the requested hardware, pushes it, and exposes
+  # the image reference and runner details as job outputs.
+  build-and-push:
+    outputs:
+      docker_image: ${{ steps.final.outputs.docker_image }}
+      docker_devices: ${{ steps.final.outputs.docker_devices }}
+      runs_on: ${{ steps.final.outputs.runs_on }}
+      label: ${{ steps.final.outputs.label }}
+    concurrency:
+      group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on:
+      group: aws-highmemory-32-plus-priv
+    permissions:
+      contents: write
+      packages: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      # Maps the `hardware` input to the Dockerfile, tag suffix, device list,
+      # test runner group and PLATFORM build arg, then stores them in GITHUB_ENV
+      # for the following steps.
+      - name: Construct hardware variables
+        shell: bash
+        run: |
+          case "${{ inputs.hardware }}" in
+            cuda)
+                export dockerfile="Dockerfile"
+                export label_extension=""
+                export docker_devices=""
+                export runs_on="aws-g6-12xl-plus-priv-cache"
+                export platform=""
+                ;;
+            rocm)
+                export dockerfile="Dockerfile_amd"
+                export label_extension="-rocm"
+                export docker_devices="/dev/kfd,/dev/dri"
+                # TODO Re-enable when they pass.
+                # export runs_on="amd-gpu-tgi"
+                export runs_on="ubuntu-latest"
+                export platform=""
+                ;;
+            intel-xpu)
+                export dockerfile="Dockerfile_intel"
+                export label_extension="-intel-xpu"
+                export docker_devices=""
+                export runs_on="ubuntu-latest"
+                export platform="xpu"
+                ;;
+            intel-cpu)
+                export dockerfile="Dockerfile_intel"
+                export label_extension="-intel-cpu"
+                export docker_devices=""
+                export runs_on="ubuntu-latest"
+                export platform="cpu"
+                ;;
+            *)
+                # Fail fast: without this arm an unknown value leaves every
+                # variable empty and the later steps run with an empty
+                # Dockerfile path and tag suffix.
+                echo "Unsupported hardware: ${{ inputs.hardware }}" >&2
+                exit 1
+                ;;
+          esac
+          echo $dockerfile
+          echo "Dockerfile=${dockerfile}"
+          echo $label_extension
+          echo $docker_devices
+          echo $runs_on
+          echo $platform
+          echo "DOCKERFILE=${dockerfile}" >> "$GITHUB_ENV"
+          echo "LABEL=${label_extension}" >> "$GITHUB_ENV"
+          echo "PLATFORM=${platform}" >> "$GITHUB_ENV"
+          echo "DOCKER_DEVICES=${docker_devices}" >> "$GITHUB_ENV"
+          echo "RUNS_ON=${runs_on}" >> "$GITHUB_ENV"
+          echo "REGISTRY_MIRROR=${REGISTRY_MIRROR}" >> "$GITHUB_ENV"
+      - name: Initialize Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          install: true
+          buildkitd-config: /tmp/buildkitd.toml
+      - name: Login to internal Container Registry
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.REGISTRY_USERNAME }}
+          password: ${{ secrets.REGISTRY_PASSWORD }}
+          registry: registry.internal.huggingface.tech
+      # The two logins below are skipped on pull requests; PR images are only
+      # tagged for the internal registry (see the `meta-pr` step).
+      - name: Login to GitHub Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Login to Azure Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
+          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
+          registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
+      # If pull request
+      - name: Extract metadata (tags, labels) for Docker
+        if: ${{ github.event_name == 'pull_request' }}
+        id: meta-pr
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+          tags: |
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
+      # If main, release or tag
+      - name: Extract metadata (tags, labels) for Docker
+        if: ${{ github.event_name != 'pull_request' }}
+        id: meta
+        uses: docker/metadata-action@v4.3.0
+        with:
+          flavor: |
+            latest=auto
+          images: |
+            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+            ghcr.io/huggingface/text-generation-inference
+            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
+          tags: |
+            type=semver,pattern={{version}}${{ env.LABEL }}
+            type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
+            type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
+      - name: Build and push Docker image
+        id: build-and-push
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ${{ env.DOCKERFILE }}
+          push: true
+          platforms: 'linux/amd64'
+          build-args: |
+            GIT_SHA=${{ env.GITHUB_SHA }}
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
+            PLATFORM=${{ env.PLATFORM }}
+          # Only one of the two metadata steps runs, so exactly one side of each `||` is non-empty.
+          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
+          cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }}
+          cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }}
+      # Publishes the values consumed by the `integration_tests` job through `needs.build-and-push.outputs`.
+      - name: Final
+        id: final
+        run: |
+          echo "docker_image=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
+          echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
+          echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
+          echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
+  integration_tests:  # runs the integration test suite against the image built by build-and-push
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    needs: build-and-push
+    runs-on:
+      group: ${{ needs.build-and-push.outputs.runs_on }}  # runner group chosen per hardware by the build job
+    if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'  # skipped for hardware whose runs_on is 'ubuntu-latest' (rocm and intel in the build job)
+    env:
+      PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}  # NOTE(review): both branches yield '--release', so the condition has no effect - confirm whether the fallback should be ''
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"  # quoted so YAML does not read it as the float 3.1
+      - name: Install
+        run: |
+          make install-integration-tests
+      - name: Run tests  # exports the image, devices and token for the pytest run below
+        run: |
+          export DOCKER_VOLUME=/mnt/cache
+          export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
+          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
+          echo $DOCKER_IMAGE
+          pytest -s -vv integration-tests ${PYTEST_FLAGS}
--- a/.github/workflows/build_documentation.yaml
+++ b/.github/workflows/build_documentation.yaml
@ -0,0 +1,20 @@
+name: Build documentation
+
+on:
+  push:
+    branches:  # main, doc-builder* and v*-release branches only
+      - "main"
+      - "doc-builder*"
+      - "v*-release"
+    paths:  # and only when something under docs/source changed
+      - "docs/source/**"
+
+jobs:
+  build:
+    uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
+    secrets:
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+    with:
+      commit_sha: ${{ github.sha }}
+      package: "text-generation-inference"
+      additional_args: "--not_python_module"
--- a/.github/workflows/build_pr_documentation.yaml
+++ b/.github/workflows/build_pr_documentation.yaml
@ -0,0 +1,19 @@
+name: Build PR Documentation  # this name is referenced by the workflow_run trigger of "Upload PR Documentation"
+
+on:
+  pull_request:
+    paths:
+      - "docs/source/**"  # only run when files under docs/source change
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}  # one group per workflow and PR head branch (run id when head_ref is empty)
+  cancel-in-progress: true  # a newer run in the same group cancels the one in progress
+
+jobs:
+  build:
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main  # reusable workflow, tracked at its main branch (not pinned)
+    with:
+      commit_sha: ${{ github.event.pull_request.head.sha }}  # head commit of the pull request
+      pr_number: ${{ github.event.number }}
+      package: text-generation-inference
+      additional_args: --not_python_module
--- a/.github/workflows/client-tests.yaml
+++ b/.github/workflows/client-tests.yaml
@ -0,0 +1,26 @@
+name: Python Client Tests  # installs clients/python and runs `make python-client-tests`
+
+on:
+  pull_request:
+    paths:  # only run when the client or this workflow changes
+      - ".github/workflows/client-tests.yaml"
+      - "clients/python/**"
+
+jobs:
+  run_tests:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4  # v4 matches the checkout version used in build.yaml
+      - name: Set up Python
+        uses: actions/setup-python@v4  # v4 matches the setup-python version used in build.yaml
+        with:
+          python-version: "3.9"  # quoted so the version is always passed as a string
+      - name: Install
+        run: |
+          cd clients/python && pip install .
+      - name: Run tests
+        env:
+          # Passed through the step environment instead of being interpolated
+          # into the script text; child processes still see HF_TOKEN.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          pip install pytest pytest-asyncio
+          make python-client-tests
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@ -0,0 +1,43 @@
+name: Nightly load test
+
+on:
+  schedule:
+    - cron: '0 0 * * 1-5'  # minute 0, hour 0, days-of-week 1-5 (Monday to Friday)
+
+  pull_request:
+    paths:
+      - ".github/workflows/load_test.yaml"  # PRs only trigger this when the workflow file itself changes
+    branches:
+      - 'main'
+
+jobs:
+  load-tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true  # a newer run in the same group cancels the one in progress
+    runs-on:
+      group: aws-g5-12xlarge
+    env:
+      DOCKER_VOLUME: /cache  # NOTE(review): not referenced by the step commands in this file; the container below mounts /mnt/cache - confirm
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Install k6  # unpacks the pinned v0.44.0 release into the working directory so ./k6 is available
+        run: |
+          curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
+
+      - name: Start starcoder  # starts a detached container, then polls /health on port 3000 with wget retries
+        run: |
+          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+          sleep 10
+          wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
+
+      - name: Run k6
+        run: |
+          ./k6 run load_tests/starcoder_load.js
+
+      - name: Stop starcoder
+        if: ${{ always() }}  # run the cleanup even when an earlier step failed
+        run: |
+          docker stop tgi-starcoder || true
--- a/.github/workflows/push_docker_image.yml
+++ b/.github/workflows/push_docker_image.yml
@ -1,56 +0,0 @@
-name: Build and push docker image to Github registry
-
-on:
-  workflow_dispatch:
-    inputs:
-      tag:
-        description: 'Tag for the Docker image:'
-        required: true
-
-jobs:
-  build-and-push:
-    concurrency:
-      group: ${{ github.workflow }}
-      cancel-in-progress: true
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      packages: write
-      # This is used to complete the identity challenge
-      # with sigstore/fulcio when running outside of PRs.
-      id-token: write
-      security-events: write
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Initialize Docker Buildx
-        uses: docker/setup-buildx-action@v3
-        with:
-          install: true
-          config-inline: |
-            [registry."docker.io"]
-      - name: Login to GitHub Container Registry
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Extract metadata (tags, labels) for Docker
-        id: meta
-        uses: docker/metadata-action@v4.3.0
-        with:
-          flavor: |
-            latest=true
-          images: ghcr.io/huggingface/tgi-gaudi
-          tags: |
-            type=raw,value=${{ github.event.inputs.tag }}
-      - name: Build and push Docker image
-        id: build-and-push
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          file: Dockerfile
-          push: true
-          platforms: 'linux/amd64'
-          tags: ${{ steps.meta.outputs.tags }}
--- a/.github/workflows/stale.yaml
+++ b/.github/workflows/stale.yaml
@ -0,0 +1,14 @@
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'  # every day at minute 30 of hour 1
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v8
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'  # only an issue message is configured; no stale-pr-message is set here
+          days-before-stale: 30  # keep in sync with the "30 days" in the message above
+          days-before-close: 5  # keep in sync with the "5 days" in the message above
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@ -0,0 +1,60 @@
+name: Server Tests  # Python server tests, pre-commit checks and `cargo test`
+
+on:
+  pull_request:
+    paths:  # only run when server/router/launcher sources or the toolchain change
+      - ".github/workflows/tests.yaml"
+      - "server/**"
+      - "proto/**"
+      - "router/**"
+      - "launcher/**"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  run_tests:
+    runs-on:
+      group: aws-highmemory-32-plus-priv
+    steps:
+      - uses: actions/checkout@v4  # v4 matches the checkout version used in build.yaml
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        id: python
+        with:
+          python-version: "3.11"  # quoted so the version is always passed as a string
+      - name: Install Rust
+        uses: actions-rs/toolchain@v1
+        with:
+          # Released on: 25 July, 2024
+          # https://releases.rs/docs/1.80.0/
+          toolchain: 1.80.0
+          override: true
+          components: rustfmt, clippy
+      - name: Install Protoc
+        uses: arduino/setup-protoc@v1
+      - name: Clean unused files
+        run: |
+          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
+          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
+      - name: Install
+        run: |
+          sudo apt update
+          sudo apt install python3.11-dev -y
+          make install-cpu
+      - name: Run server tests
+        env:
+          # Passed through the step environment instead of being interpolated
+          # into the script text; pytest still sees HF_TOKEN.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          pip install pytest
+          pytest -s -vv server/tests
+      - name: Pre-commit checks
+        run: |
+          pip install pre-commit
+          pre-commit install
+          pre-commit run --all-files
+      - name: Run Rust tests
+        run: |
+          cargo test
--- a/.github/workflows/upload_pr_documentation.yaml
+++ b/.github/workflows/upload_pr_documentation.yaml
@ -0,0 +1,16 @@
+name: Upload PR Documentation
+
+on:
+  workflow_run:
+    workflows: ["Build PR Documentation"]  # must match the `name:` of the PR documentation build workflow
+    types:
+      - completed  # fires when that workflow run finishes; no conclusion filter is applied here
+
+jobs:
+  build:
+    uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main  # reusable workflow, tracked at its main branch (not pinned)
+    with:
+      package_name: text-generation-inference
+    secrets:
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
--- a/Cargo.lock
+++ b/Cargo.lock
@ -589,14 +589,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"

 [[package]]
-name = "cc"
-version = "1.0.99"
+name = "cast"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
+[[package]]
+name = "castaway"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0abae9be0aaf9ea96a3b1b8b1b55c602ca751eba1b1500220cea4ecbafe7c0d5"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "cc"
+version = "1.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b62ac837cdb5cb22e10a256099b4fc502b1dfe560cb282963a974d7abd80e476"
 dependencies = [
 "jobserver",
 "libc",
- "once_cell",
+ "shlex",
+]
+
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
 ]

 [[package]]
@ -621,6 +645,34 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"

+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
+[[package]]
+name = "clap"
+version = "2.34.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
+dependencies = [
+ "bitflags 1.3.2",
+ "textwrap",
+ "unicode-width",
+]
+
 [[package]]
 name = "clap"
 version = "4.5.17"
@ -645,21 +697,40 @@ dependencies = [

 [[package]]
 name = "clap_derive"
-version = "4.5.5"
+version = "4.5.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
+checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",
 "quote",
- "syn 2.0.66",
+ "syn 2.0.77",
 ]

 [[package]]
 name = "clap_lex"
-version = "0.7.1"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
+checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
+
+[[package]]
+name = "cmake"
+version = "0.1.51"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "codespan-reporting"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
+dependencies = [
+ "termcolor",
+ "unicode-width",
+]

 [[package]]
 name = "color_quant"
@ -1027,15 +1098,10 @@ dependencies = [
 ]

 [[package]]
-name = "displaydoc"
-version = "0.2.4"
+name = "dunce"
+version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.66",
-]
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"

 [[package]]
 name = "easy-cast"
@ -1349,18 +1415,24 @@ dependencies = [

 [[package]]
 name = "gimli"
-version = "0.29.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
+checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64"
+
+[[package]]
+name = "glob"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"

 [[package]]
 name = "grpc-metadata"
 version = "0.1.0"
 dependencies = [
- "opentelemetry",
+ "opentelemetry 0.20.0",
 "tonic 0.10.2",
 "tracing",
- "tracing-opentelemetry",
+ "tracing-opentelemetry 0.21.0",
 ]

 [[package]]
@ -1559,9 +1631,9 @@ dependencies = [

 [[package]]
 name = "httparse"
-version = "1.9.3"
+version = "1.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0e7a4dd27b9476dc40cb050d3632d3bba3a70ddbff012285f7f8559a1e7e545"
+checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"

 [[package]]
 name = "httpdate"
@ -1579,7 +1651,7 @@ dependencies = [
 "futures-channel",
 "futures-core",
 "futures-util",
- "h2",
+ "h2 0.3.26",
 "http 0.2.12",
 "http-body 0.4.6",
 "httparse",
@ -1652,128 +1724,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
 dependencies = [
 "bytes",
- "hyper",
+ "hyper 0.14.30",
 "native-tls",
 "tokio",
 "tokio-native-tls",
 ]

 [[package]]
-name = "icu_collections"
-version = "1.5.0"
+name = "hyper-util"
+version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
+checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba"
 dependencies = [
- "displaydoc",
- "yoke",
- "zerofrom",
- "zerovec",
-]
-
-[[package]]
-name = "icu_locid"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
-dependencies = [
- "displaydoc",
- "litemap",
- "tinystr",
- "writeable",
- "zerovec",
-]
-
-[[package]]
-name = "icu_locid_transform"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
-dependencies = [
- "displaydoc",
- "icu_locid",
- "icu_locid_transform_data",
- "icu_provider",
- "tinystr",
- "zerovec",
-]
-
-[[package]]
-name = "icu_locid_transform_data"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
-
-[[package]]
-name = "icu_normalizer"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
-dependencies = [
- "displaydoc",
- "icu_collections",
- "icu_normalizer_data",
- "icu_properties",
- "icu_provider",
- "smallvec",
- "utf16_iter",
- "utf8_iter",
- "write16",
- "zerovec",
-]
-
-[[package]]
-name = "icu_normalizer_data"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
-
-[[package]]
-name = "icu_properties"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f8ac670d7422d7f76b32e17a5db556510825b29ec9154f235977c9caba61036"
-dependencies = [
- "displaydoc",
- "icu_collections",
- "icu_locid_transform",
- "icu_properties_data",
- "icu_provider",
- "tinystr",
- "zerovec",
-]
-
-[[package]]
-name = "icu_properties_data"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
-
-[[package]]
-name = "icu_provider"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
-dependencies = [
- "displaydoc",
- "icu_locid",
- "icu_provider_macros",
- "stable_deref_trait",
- "tinystr",
- "writeable",
- "yoke",
- "zerofrom",
- "zerovec",
-]
-
-[[package]]
-name = "icu_provider_macros"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.76",
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.1",
+ "hyper 1.4.1",
+ "pin-project-lite",
+ "socket2",
+ "tokio",
+ "tower",
+ "tower-service",
+ "tracing",
 ]

 [[package]]
@ -1784,14 +1758,12 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"

 [[package]]
 name = "idna"
-version = "1.0.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4716a3a0933a1d01c2f72450e89596eb51dd34ef3c211ccd875acdf1f8fe47ed"
+checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
 dependencies = [
- "icu_normalizer",
- "icu_properties",
- "smallvec",
- "utf8_iter",
+ "unicode-bidi",
+ "unicode-normalization",
 ]

 [[package]]
@ -1879,11 +1851,21 @@ version = "0.14.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94bd26b1b737bc11f183620072e188d1c6ede67e0e78682228d66b49ec510e17"
 dependencies = [
- "opentelemetry",
+ "opentelemetry 0.20.0",
 "opentelemetry-otlp",
 "thiserror",
 "tracing",
- "tracing-opentelemetry",
+ "tracing-opentelemetry 0.21.0",
+]
+
+[[package]]
+name = "instability"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b23a0c8dfe501baac4adf6ebbfa6eddf8f0c07f56b058cc1288017e32397846c"
+dependencies = [
+ "quote",
+ "syn 2.0.77",
 ]

 [[package]]
@ -1903,14 +1885,14 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.66",
+ "syn 2.0.77",
 ]

 [[package]]
 name = "ipnet"
-version = "2.9.0"
+version = "2.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
+checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4"

 [[package]]
 name = "is_terminal_polyfill"
@ -2099,12 +2081,6 @@ version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"

-[[package]]
-name = "litemap"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
-
 [[package]]
 name = "lock_api"
 version = "0.4.12"
@ -3134,8 +3110,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
 dependencies = [
 "bytes",
- "heck 0.4.1",
- "itertools 0.10.5",
+ "heck 0.5.0",
+ "itertools 0.12.1",
 "log",
 "multimap",
 "once_cell",
@ -3168,10 +3144,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1"
 dependencies = [
 "anyhow",
- "itertools 0.10.5",
+ "itertools 0.12.1",
 "proc-macro2",
 "quote",
- "syn 2.0.66",
+ "syn 2.0.77",
 ]

 [[package]]
@ -3427,11 +3403,11 @@ dependencies = [

 [[package]]
 name = "redox_syscall"
-version = "0.5.2"
+version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd"
+checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853"
 dependencies = [
- "bitflags 2.5.0",
+ "bitflags 2.6.0",
 ]

 [[package]]
@ -3581,23 +3557,22 @@ dependencies = [

 [[package]]
 name = "rust-embed-impl"
-version = "6.8.1"
+version = "8.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49b94b81e5b2c284684141a2fb9e2a31be90638caf040bf9afbc5a0416afe1ac"
+checksum = "6125dbc8867951125eec87294137f4e9c2c96566e61bf72c45095a7c77761478"
 dependencies = [
 "proc-macro2",
 "quote",
 "rust-embed-utils",
- "shellexpand",
- "syn 2.0.66",
+ "syn 2.0.77",
 "walkdir",
 ]

 [[package]]
 name = "rust-embed-utils"
-version = "7.8.1"
+version = "8.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d38ff6bf570dc3bb7100fce9f7b60c33fa71d80e88da3f2580df4ff2bdded74"
+checksum = "2e5347777e9aacb56039b0e1f28785929a8a3b709e87482e7442c72e7c12529d"
 dependencies = [
 "sha2",
 "walkdir",
@ -4024,10 +3999,10 @@ dependencies = [
 ]

 [[package]]
-name = "stable_deref_trait"
-version = "1.2.0"
+name = "static_assertions"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
+checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"

 [[package]]
 name = "strsim"
@ -4092,15 +4067,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"

 [[package]]
-name = "synstructure"
-version = "0.13.1"
+name = "sync_wrapper"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.66",
-]
+checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"

 [[package]]
 name = "sysinfo"
@ -4194,12 +4164,44 @@ dependencies = [
 "windows-sys 0.59.0",
 ]

+[[package]]
+name = "termcolor"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "text-generation-backends-trtllm"
+version = "2.2.1-dev0"
+dependencies = [
+ "async-stream",
+ "async-trait",
+ "clap 4.5.17",
+ "cmake",
+ "cxx",
+ "cxx-build",
+ "log",
+ "parking_lot",
+ "pkg-config",
+ "text-generation-router",
+ "thiserror",
+ "tokenizers 0.19.1",
+ "tokio",
+ "tokio-stream",
+ "tracing",
+ "tracing-opentelemetry 0.24.0",
+ "tracing-subscriber",
+]
+
 [[package]]
 name = "text-generation-benchmark"
-version = "2.0.4"
+version = "2.2.1-dev0"
 dependencies = [
 "average",
- "clap",
+ "clap 4.5.17",
 "crossterm",
 "float-ord",
 "hf-hub",
@ -4217,13 +4219,14 @@ dependencies = [

 [[package]]
 name = "text-generation-client"
-version = "2.0.4"
+version = "2.2.1-dev0"
 dependencies = [
+ "async-trait",
+ "base64 0.22.1",
 "futures",
 "grpc-metadata",
 "prost 0.12.6",
 "prost-build",
- "rand",
 "thiserror",
 "tokio",
 "tonic 0.10.2",
@ -4234,13 +4237,13 @@ dependencies = [

 [[package]]
 name = "text-generation-launcher"
-version = "2.0.4"
+version = "2.2.1-dev0"
 dependencies = [
- "clap",
+ "clap 4.5.17",
 "ctrlc",
 "float_eq",
 "hf-hub",
- "nix",
+ "nix 0.28.0",
 "once_cell",
 "reqwest",
 "serde",
@ -4253,13 +4256,14 @@ dependencies = [

 [[package]]
 name = "text-generation-router"
-version = "2.0.4"
+version = "2.2.1-dev0"
 dependencies = [
 "async-stream",
- "axum",
+ "async-trait",
+ "axum 0.7.5",
 "axum-tracing-opentelemetry",
 "base64 0.22.1",
- "clap",
+ "clap 4.5.17",
 "csv",
 "futures",
 "futures-util",
@ -4434,15 +4438,30 @@ dependencies = [
 ]

 [[package]]
-name = "tinystr"
-version = "0.7.6"
+name = "tinytemplate"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
 dependencies = [
- "displaydoc",
- "zerovec",
+ "serde",
+ "serde_json",
 ]

+[[package]]
+name = "tinyvec"
+version = "1.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
 [[package]]
 name = "tokenizers"
 version = "0.19.1"
@ -4675,13 +4694,13 @@ checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e"
 dependencies = [
 "async-stream",
 "async-trait",
- "axum",
+ "axum 0.6.20",
 "base64 0.21.7",
 "bytes",
- "h2",
- "http",
- "http-body",
- "hyper",
+ "h2 0.3.26",
+ "http 0.2.12",
+ "http-body 0.4.6",
+ "hyper 0.14.30",
 "hyper-timeout",
 "percent-encoding",
 "pin-project",
@ -4927,10 +4946,25 @@ dependencies = [
 ]

 [[package]]
-name = "unicode-ident"
-version = "1.0.12"
+name = "unicode-bidi"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
+
+[[package]]
+name = "unicode-normalization"
+version = "0.1.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
+dependencies = [
+ "tinyvec",
+]

 [[package]]
 name = "unicode-normalization-alignments"
@ -5010,9 +5044,9 @@ dependencies = [

 [[package]]
 name = "url"
-version = "2.5.1"
+version = "2.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56"
+checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
 dependencies = [
 "form_urlencoded",
 "idna",
@ -5025,18 +5059,6 @@ version = "2.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"

-[[package]]
-name = "utf16_iter"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
-
-[[package]]
-name = "utf8_iter"
-version = "1.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
-
 [[package]]
 name = "utf8parse"
 version = "0.2.2"
@ -5057,9 +5079,9 @@ dependencies = [

 [[package]]
 name = "utoipa-gen"
-version = "3.5.0"
+version = "4.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05d96dcd6fc96f3df9b3280ef480770af1b7c5d14bc55192baa9b067976d920c"
+checksum = "7bf0e16c02bc4bf5322ab65f10ab1149bdbcaa782cba66dc7057370a3f8190be"
 dependencies = [
 "proc-macro-error",
 "proc-macro2",
@ -5070,11 +5092,11 @@ dependencies = [

 [[package]]
 name = "utoipa-swagger-ui"
-version = "3.1.5"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84614caa239fb25b2bb373a52859ffd94605ceb256eeb1d63436325cf81e3653"
+checksum = "0b39868d43c011961e04b41623e050aedf2cc93652562ff7935ce0f819aaf2da"
 dependencies = [
- "axum",
+ "axum 0.7.5",
 "mime_guess",
 "regex",
 "rust-embed",
@ -5291,9 +5313,9 @@ dependencies = [

 [[package]]
 name = "webpki-roots"
-version = "0.26.2"
+version = "0.26.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c452ad30530b54a4d8e71952716a212b08efd0f3562baa66c29a618b07da7c3"
+checksum = "0bd24728e5af82c6c4ec1b66ac4844bdf8156257fccda846ec58b42cd0cdbe6a"
 dependencies = [
 "rustls-pki-types",
 ]
@ -5599,70 +5621,14 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

-[[package]]
-name = "write16"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
-
-[[package]]
-name = "writeable"
-version = "0.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
-
-[[package]]
-name = "yoke"
-version = "0.7.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5"
-dependencies = [
- "serde",
- "stable_deref_trait",
- "yoke-derive",
- "zerofrom",
-]
-
-[[package]]
-name = "yoke-derive"
-version = "0.7.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.66",
- "synstructure",
-]
-
-[[package]]
-name = "zerocopy"
-version = "0.6.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "854e949ac82d619ee9a14c66a1b674ac730422372ccb759ce0c39cabcf2bf8e6"
-dependencies = [
- "byteorder",
- "zerocopy-derive 0.6.6",
-]
-
 [[package]]
 name = "zerocopy"
 version = "0.7.35"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
 dependencies = [
- "zerocopy-derive 0.7.35",
-]
-
-[[package]]
-name = "zerocopy-derive"
-version = "0.6.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.77",
+ "byteorder",
+ "zerocopy-derive",
 ]

 [[package]]
@ -5673,28 +5639,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
 dependencies = [
 "proc-macro2",
 "quote",
- "syn 2.0.76",
-]
-
-[[package]]
-name = "zerofrom"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55"
-dependencies = [
- "zerofrom-derive",
-]
-
-[[package]]
-name = "zerofrom-derive"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.66",
- "synstructure",
+ "syn 2.0.77",
 ]

 [[package]]
@ -5703,28 +5648,6 @@ version = "1.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"

-[[package]]
-name = "zerovec"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb2cc8827d6c0994478a15c53f374f46fbd41bea663d809b14744bc42e6b109c"
-dependencies = [
- "yoke",
- "zerofrom",
- "zerovec-derive",
-]
-
-[[package]]
-name = "zerovec-derive"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97cf56601ee5052b4417d90c8755c6683473c926039908196cf35d99f893ebe7"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.66",
-]
-
 [[package]]
 name = "zip"
 version = "0.6.6"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -20,7 +20,7 @@ default-members = [
 resolver = "2"

 [workspace.package]
-version = "2.0.4"
+version = "2.3.1-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
--- a/37
+++ b/37
@ -1,8 +1,10 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.80 AS chef
 WORKDIR /usr/src

-FROM chef as planner
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
 COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
@ -25,16 +27,19 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    rm -f $PROTOC_ZIP

 COPY --from=planner /usr/src/recipe.json recipe.json
-COPY Cargo.lock Cargo.lock
-RUN cargo chef cook --release --recipe-path recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL

 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
 COPY benchmark benchmark
 COPY router router
+COPY backends backends
 COPY launcher launcher
-RUN cargo build --release
+RUN cargo build --profile release-opt

 # Text Generation Inference base image
 FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as base
@ -70,14 +75,26 @@ RUN cd server && \
    pip install . --no-cache-dir

 # Install benchmarker
-COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 # Install router
-COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
 # Install launcher
-COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# AWS Sagemaker compatible image
+FROM base AS sagemaker
+
+COPY sagemaker-entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]

 # Final image
 FROM base

-ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+# CMD ["--json-output"]
--- a/16
+++ b/16
@ -1,9 +1,8 @@
 install-server:
 	cd server && make install

-install-integration-tests:
-	cd integration-tests && pip install -r requirements.txt
-	cd clients/python && pip install .
+install-server-cpu:
+	cd server && make install-server

 install-router:
 	cargo install --path backends/v3/
@ -14,7 +13,10 @@ install-launcher:
 install-benchmark:
 	cargo install --path benchmark/

-install: install-server install-router install-launcher install-custom-kernels
+install: install-server install-router install-launcher
+
+
+install-cpu: install-server-cpu install-router install-launcher

 server-dev:
 	cd server && make run-dev
@ -46,8 +48,8 @@ python-tests: python-server-tests python-client-tests
 run-falcon-7b-instruct:
 	text-generation-launcher --model-id tiiuae/falcon-7b-instruct --port 8080

+run-falcon-7b-instruct-quantize:
+	text-generation-launcher --model-id tiiuae/falcon-7b-instruct --quantize bitsandbytes --port 8080
+
 clean:
 	rm -rf target aml
-
-debug_image_build:
-	docker build --no-cache --progress=plain -t debug_tgi .
--- a/docs/openapi.json
+++ b/docs/openapi.json
--- a/docs/openapi.json.rej
+++ b/docs/openapi.json.rej
@ -1,10 +0,0 @@
-diff a/docs/openapi.json b/docs/openapi.json	(rejected hunks)
-@@ -10,7 +10,7 @@
-       "name": "Apache 2.0",
-       "url": "https://www.apache.org/licenses/LICENSE-2.0"
-     },
-    "version": "2.2.1-dev0"
-+    "version": "2.3.1-dev0"
-   },
-   "paths": {
-     "/": {
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@ -1,5 +1,3 @@
-/// Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
-
 use clap::{Parser, ValueEnum};
 use hf_hub::{
    api::sync::{Api, ApiBuilder},
@ -726,12 +724,13 @@ fn shard_manager(

    if let Some(dtype) = dtype {
        shard_args.push("--dtype".to_string());
-        shard_args.push(dtype.to_string());
+        shard_args.push(dtype.to_string())
    }
+
    // Model optional revision
    if let Some(revision) = revision {
        shard_args.push("--revision".to_string());
-        shard_args.push(revision);
+        shard_args.push(revision)
    }

    let rope = match (rope_scaling, rope_factor) {
@ -1567,16 +1566,6 @@ fn terminate(process_name: &str, mut process: Child, timeout: Duration) -> io::R
 }

 fn main() -> Result<(), LauncherError> {
-    match Command::new("ldconfig").spawn() {
-        Ok(_) => {}
-        Err(err) => {
-            tracing::warn!(
-                "Unable to refresh ldconfig cache. Skipping (useless in most cases). Details {:?}",
-                err
-            )
-        }
-    }
-
    // Pattern match configuration
    let args: Args = Args::parse();

--- a/proto/generate.proto
+++ b/proto/generate.proto
@ -224,7 +224,7 @@ message DecodeResponse {

 message WarmupRequest {
    /// Batch to warmup on
-    repeated Batch batches = 1;
+    Batch batch = 1;
    uint32 max_input_length = 2;
    uint32 max_prefill_tokens = 3;
    uint32 max_total_tokens = 4;
--- a/router/client/src/pb/generate.v2.rs
+++ b/router/client/src/pb/generate.v2.rs
@ -1,647 +0,0 @@
-// This file is @generated by prost-build.
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct HealthRequest {}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct HealthResponse {}
-/// / Empty request
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct InfoRequest {}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct InfoResponse {
-    #[prost(bool, tag = "1")]
-    pub requires_padding: bool,
-    #[prost(string, tag = "2")]
-    pub dtype: ::prost::alloc::string::String,
-    #[prost(string, tag = "3")]
-    pub device_type: ::prost::alloc::string::String,
-    #[prost(uint32, optional, tag = "4")]
-    pub window_size: ::core::option::Option<u32>,
-    #[prost(uint32, tag = "5")]
-    pub speculate: u32,
-}
-/// / Empty request
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct ServiceDiscoveryRequest {}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct ServiceDiscoveryResponse {
-    /// / Other shards urls
-    #[prost(string, repeated, tag = "1")]
-    pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct ClearCacheRequest {
-    /// / Optional batch id
-    #[prost(uint64, optional, tag = "1")]
-    pub id: ::core::option::Option<u64>,
-}
-/// / Empty response
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct ClearCacheResponse {}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct NextTokenChooserParameters {
-    /// / exponential scaling output probability distribution
-    #[prost(float, tag = "1")]
-    pub temperature: f32,
-    /// / restricting to the k highest probability elements
-    #[prost(uint32, tag = "2")]
-    pub top_k: u32,
-    /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
-    #[prost(float, tag = "3")]
-    pub top_p: f32,
-    /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
-    #[prost(float, tag = "4")]
-    pub typical_p: f32,
-    /// / apply sampling on the logits
-    #[prost(bool, tag = "5")]
-    pub do_sample: bool,
-    /// / random seed for sampling
-    #[prost(uint64, tag = "6")]
-    pub seed: u64,
-    /// / repetition penalty
-    #[prost(float, tag = "7")]
-    pub repetition_penalty: f32,
-    /// / frequency penalty
-    #[prost(float, tag = "9")]
-    pub frequency_penalty: f32,
-    /// / token watermarking using "A Watermark for Large Language Models"
-    #[prost(bool, tag = "8")]
-    pub watermark: bool,
-    /// / grammar (applied if not empty)
-    #[prost(string, tag = "10")]
-    pub grammar: ::prost::alloc::string::String,
-    /// / grammar type
-    #[prost(enumeration = "GrammarType", tag = "11")]
-    pub grammar_type: i32,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct StoppingCriteriaParameters {
-    /// / Maximum number of generated tokens
-    #[prost(uint32, tag = "1")]
-    pub max_new_tokens: u32,
-    /// / Optional stopping sequences
-    #[prost(string, repeated, tag = "2")]
-    pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
-    /// / Ignore end of sequence token
-    /// / used for benchmarking
-    #[prost(bool, tag = "3")]
-    pub ignore_eos_token: bool,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct Request {
-    /// / Request ID
-    #[prost(uint64, tag = "1")]
-    pub id: u64,
-    /// / The generation context
-    #[prost(string, tag = "2")]
-    pub inputs: ::prost::alloc::string::String,
-    /// / Context truncation
-    #[prost(uint32, tag = "3")]
-    pub truncate: u32,
-    /// / Next Token Chooser Parameters
-    #[prost(message, optional, tag = "4")]
-    pub parameters: ::core::option::Option<NextTokenChooserParameters>,
-    /// / Stopping Criteria Parameters
-    #[prost(message, optional, tag = "5")]
-    pub stopping_parameters: ::core::option::Option<StoppingCriteriaParameters>,
-    /// / Return prefill logprobs
-    #[prost(bool, tag = "6")]
-    pub prefill_logprobs: bool,
-    /// / Return most likely n tokens
-    #[prost(uint32, tag = "7")]
-    pub top_n_tokens: u32,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct Batch {
-    /// / Batch ID
-    #[prost(uint64, tag = "1")]
-    pub id: u64,
-    /// / Individual requests
-    #[prost(message, repeated, tag = "2")]
-    pub requests: ::prost::alloc::vec::Vec<Request>,
-    /// / Batch size (==len(requests))
-    #[prost(uint32, tag = "3")]
-    pub size: u32,
-    /// / Maximum number of tokens this batch will grow to
-    #[prost(uint32, tag = "4")]
-    pub max_tokens: u32,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct CachedBatch {
-    /// / Batch ID
-    #[prost(uint64, tag = "1")]
-    pub id: u64,
-    /// / Individual requests ids
-    #[prost(uint64, repeated, tag = "2")]
-    pub request_ids: ::prost::alloc::vec::Vec<u64>,
-    /// / Batch size (==len(requests))
-    #[prost(uint32, tag = "3")]
-    pub size: u32,
-    /// / Maximum number of tokens this batch will grow to
-    #[prost(uint32, tag = "4")]
-    pub max_tokens: u32,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct GeneratedText {
-    /// / Output
-    #[prost(string, tag = "1")]
-    pub text: ::prost::alloc::string::String,
-    /// / Number of generated tokens
-    #[prost(uint32, tag = "2")]
-    pub generated_tokens: u32,
-    /// / Finish reason
-    #[prost(enumeration = "FinishReason", tag = "3")]
-    pub finish_reason: i32,
-    /// / Seed
-    #[prost(uint64, optional, tag = "4")]
-    pub seed: ::core::option::Option<u64>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct Tokens {
-    /// / Token IDs
-    #[prost(uint32, repeated, tag = "1")]
-    pub ids: ::prost::alloc::vec::Vec<u32>,
-    /// / Logprobs
-    #[prost(float, repeated, tag = "2")]
-    pub logprobs: ::prost::alloc::vec::Vec<f32>,
-    /// / tokens
-    #[prost(string, repeated, tag = "3")]
-    pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
-    /// / special
-    #[prost(bool, repeated, tag = "4")]
-    pub is_special: ::prost::alloc::vec::Vec<bool>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct Generation {
-    /// / Request ID
-    #[prost(uint64, tag = "1")]
-    pub request_id: u64,
-    /// / Prefill tokens (optional)
-    #[prost(message, optional, tag = "2")]
-    pub prefill_tokens: ::core::option::Option<Tokens>,
-    #[prost(message, optional, tag = "3")]
-    pub tokens: ::core::option::Option<Tokens>,
-    /// / Complete generated text
-    #[prost(message, optional, tag = "4")]
-    pub generated_text: ::core::option::Option<GeneratedText>,
-    /// / Top tokens
-    #[prost(message, repeated, tag = "5")]
-    pub top_tokens: ::prost::alloc::vec::Vec<Tokens>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct FilterBatchRequest {
-    /// / Batch ID
-    #[prost(uint64, tag = "1")]
-    pub batch_id: u64,
-    /// / Requests to keep
-    #[prost(uint64, repeated, tag = "2")]
-    pub request_ids: ::prost::alloc::vec::Vec<u64>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct FilterBatchResponse {
-    /// / Filtered Batch (cached)
-    #[prost(message, optional, tag = "1")]
-    pub batch: ::core::option::Option<CachedBatch>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct PrefillRequest {
-    /// / Batch
-    #[prost(message, optional, tag = "1")]
-    pub batch: ::core::option::Option<Batch>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct PrefillResponse {
-    /// / Generation
-    #[prost(message, repeated, tag = "1")]
-    pub generations: ::prost::alloc::vec::Vec<Generation>,
-    /// / Next batch (cached)
-    #[prost(message, optional, tag = "2")]
-    pub batch: ::core::option::Option<CachedBatch>,
-    /// / Forward elapsed time in nanoseconds
-    #[prost(uint64, tag = "3")]
-    pub forward_ns: u64,
-    /// / Decode elapsed time in nanoseconds
-    #[prost(uint64, tag = "4")]
-    pub decode_ns: u64,
-    /// / Total elapsed time in nanoseconds
-    #[prost(uint64, tag = "5")]
-    pub total_ns: u64,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct DecodeRequest {
-    /// / Cached batches
-    #[prost(message, repeated, tag = "1")]
-    pub batches: ::prost::alloc::vec::Vec<CachedBatch>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct DecodeResponse {
-    /// / Decodes
-    #[prost(message, repeated, tag = "1")]
-    pub generations: ::prost::alloc::vec::Vec<Generation>,
-    /// / Next batch (cached)
-    #[prost(message, optional, tag = "2")]
-    pub batch: ::core::option::Option<CachedBatch>,
-    /// / Forward elapsed time in nanoseconds
-    #[prost(uint64, tag = "3")]
-    pub forward_ns: u64,
-    /// / Decode elapsed time in nanoseconds
-    #[prost(uint64, tag = "4")]
-    pub decode_ns: u64,
-    /// / Total elapsed time in nanoseconds
-    #[prost(uint64, tag = "5")]
-    pub total_ns: u64,
-    /// / Concatenate elapsed time in nanoseconds
-    #[prost(uint64, optional, tag = "6")]
-    pub concat_ns: ::core::option::Option<u64>,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct WarmupRequest {
-    /// / Batch to warmup on
-    #[prost(message, optional, tag = "1")]
-    pub batch: ::core::option::Option<Batch>,
-    #[prost(uint32, tag = "2")]
-    pub max_input_length: u32,
-    #[prost(uint32, tag = "3")]
-    pub max_prefill_tokens: u32,
-    #[prost(uint32, tag = "4")]
-    pub max_total_tokens: u32,
-}
-#[allow(clippy::derive_partial_eq_without_eq)]
-#[derive(Clone, PartialEq, ::prost::Message)]
-pub struct WarmupResponse {
-    /// / Maximum number of tokens supported by the model
-    #[prost(uint32, optional, tag = "1")]
-    pub max_supported_total_tokens: ::core::option::Option<u32>,
-}
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
-#[repr(i32)]
-pub enum GrammarType {
-    None = 0,
-    Json = 1,
-    Regex = 2,
-}
-impl GrammarType {
-    /// String value of the enum field names used in the ProtoBuf definition.
-    ///
-    /// The values are not transformed in any way and thus are considered stable
-    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
-    pub fn as_str_name(&self) -> &'static str {
-        match self {
-            GrammarType::None => "GRAMMAR_TYPE_NONE",
-            GrammarType::Json => "GRAMMAR_TYPE_JSON",
-            GrammarType::Regex => "GRAMMAR_TYPE_REGEX",
-        }
-    }
-    /// Creates an enum from field names used in the ProtoBuf definition.
-    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
-        match value {
-            "GRAMMAR_TYPE_NONE" => Some(Self::None),
-            "GRAMMAR_TYPE_JSON" => Some(Self::Json),
-            "GRAMMAR_TYPE_REGEX" => Some(Self::Regex),
-            _ => None,
-        }
-    }
-}
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
-#[repr(i32)]
-pub enum FinishReason {
-    Length = 0,
-    EosToken = 1,
-    StopSequence = 2,
-}
-impl FinishReason {
-    /// String value of the enum field names used in the ProtoBuf definition.
-    ///
-    /// The values are not transformed in any way and thus are considered stable
-    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
-    pub fn as_str_name(&self) -> &'static str {
-        match self {
-            FinishReason::Length => "FINISH_REASON_LENGTH",
-            FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN",
-            FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE",
-        }
-    }
-    /// Creates an enum from field names used in the ProtoBuf definition.
-    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
-        match value {
-            "FINISH_REASON_LENGTH" => Some(Self::Length),
-            "FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken),
-            "FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence),
-            _ => None,
-        }
-    }
-}
-/// Generated client implementations.
-pub mod text_generation_service_client {
-    #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)]
-    use tonic::codegen::*;
-    use tonic::codegen::http::Uri;
-    #[derive(Debug, Clone)]
-    pub struct TextGenerationServiceClient<T> {
-        inner: tonic::client::Grpc<T>,
-    }
-    impl TextGenerationServiceClient<tonic::transport::Channel> {
-        /// Attempt to create a new client by connecting to a given endpoint.
-        pub async fn connect<D>(dst: D) -> Result<Self, tonic::transport::Error>
-        where
-            D: TryInto<tonic::transport::Endpoint>,
-            D::Error: Into<StdError>,
-        {
-            let conn = tonic::transport::Endpoint::new(dst)?.connect().await?;
-            Ok(Self::new(conn))
-        }
-    }
-    impl<T> TextGenerationServiceClient<T>
-    where
-        T: tonic::client::GrpcService<tonic::body::BoxBody>,
-        T::Error: Into<StdError>,
-        T::ResponseBody: Body<Data = Bytes> + Send + 'static,
-        <T::ResponseBody as Body>::Error: Into<StdError> + Send,
-    {
-        pub fn new(inner: T) -> Self {
-            let inner = tonic::client::Grpc::new(inner);
-            Self { inner }
-        }
-        pub fn with_origin(inner: T, origin: Uri) -> Self {
-            let inner = tonic::client::Grpc::with_origin(inner, origin);
-            Self { inner }
-        }
-        pub fn with_interceptor<F>(
-            inner: T,
-            interceptor: F,
-        ) -> TextGenerationServiceClient<InterceptedService<T, F>>
-        where
-            F: tonic::service::Interceptor,
-            T::ResponseBody: Default,
-            T: tonic::codegen::Service<
-                http::Request<tonic::body::BoxBody>,
-                Response = http::Response<
-                    <T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
-                >,
-            >,
-            <T as tonic::codegen::Service<
-                http::Request<tonic::body::BoxBody>,
-            >>::Error: Into<StdError> + Send + Sync,
-        {
-            TextGenerationServiceClient::new(InterceptedService::new(inner, interceptor))
-        }
-        /// Compress requests with the given encoding.
-        ///
-        /// This requires the server to support it otherwise it might respond with an
-        /// error.
-        #[must_use]
-        pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self {
-            self.inner = self.inner.send_compressed(encoding);
-            self
-        }
-        /// Enable decompressing responses.
-        #[must_use]
-        pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self {
-            self.inner = self.inner.accept_compressed(encoding);
-            self
-        }
-        /// Limits the maximum size of a decoded message.
-        ///
-        /// Default: `4MB`
-        #[must_use]
-        pub fn max_decoding_message_size(mut self, limit: usize) -> Self {
-            self.inner = self.inner.max_decoding_message_size(limit);
-            self
-        }
-        /// Limits the maximum size of an encoded message.
-        ///
-        /// Default: `usize::MAX`
-        #[must_use]
-        pub fn max_encoding_message_size(mut self, limit: usize) -> Self {
-            self.inner = self.inner.max_encoding_message_size(limit);
-            self
-        }
-        /// / Model Info
-        pub async fn info(
-            &mut self,
-            request: impl tonic::IntoRequest<super::InfoRequest>,
-        ) -> std::result::Result<tonic::Response<super::InfoResponse>, tonic::Status> {
-            self.inner
-                .ready()
-                .await
-                .map_err(|e| {
-                    tonic::Status::new(
-                        tonic::Code::Unknown,
-                        format!("Service was not ready: {}", e.into()),
-                    )
-                })?;
-            let codec = tonic::codec::ProstCodec::default();
-            let path = http::uri::PathAndQuery::from_static(
-                "/generate.v2.TextGenerationService/Info",
-            );
-            let mut req = request.into_request();
-            req.extensions_mut()
-                .insert(GrpcMethod::new("generate.v2.TextGenerationService", "Info"));
-            self.inner.unary(req, path, codec).await
-        }
-        /// / Service discovery
-        pub async fn service_discovery(
-            &mut self,
-            request: impl tonic::IntoRequest<super::ServiceDiscoveryRequest>,
-        ) -> std::result::Result<
-            tonic::Response<super::ServiceDiscoveryResponse>,
-            tonic::Status,
-        > {
-            self.inner
-                .ready()
-                .await
-                .map_err(|e| {
-                    tonic::Status::new(
-                        tonic::Code::Unknown,
-                        format!("Service was not ready: {}", e.into()),
-                    )
-                })?;
-            let codec = tonic::codec::ProstCodec::default();
-            let path = http::uri::PathAndQuery::from_static(
-                "/generate.v2.TextGenerationService/ServiceDiscovery",
-            );
-            let mut req = request.into_request();
-            req.extensions_mut()
-                .insert(
-                    GrpcMethod::new(
-                        "generate.v2.TextGenerationService",
-                        "ServiceDiscovery",
-                    ),
-                );
-            self.inner.unary(req, path, codec).await
-        }
-        /// / Empties batch cache
-        pub async fn clear_cache(
-            &mut self,
-            request: impl tonic::IntoRequest<super::ClearCacheRequest>,
-        ) -> std::result::Result<
-            tonic::Response<super::ClearCacheResponse>,
-            tonic::Status,
-        > {
-            self.inner
-                .ready()
-                .await
-                .map_err(|e| {
-                    tonic::Status::new(
-                        tonic::Code::Unknown,
-                        format!("Service was not ready: {}", e.into()),
-                    )
-                })?;
-            let codec = tonic::codec::ProstCodec::default();
-            let path = http::uri::PathAndQuery::from_static(
-                "/generate.v2.TextGenerationService/ClearCache",
-            );
-            let mut req = request.into_request();
-            req.extensions_mut()
-                .insert(
-                    GrpcMethod::new("generate.v2.TextGenerationService", "ClearCache"),
-                );
-            self.inner.unary(req, path, codec).await
-        }
-        /// / Remove requests from a cached batch
-        pub async fn filter_batch(
-            &mut self,
-            request: impl tonic::IntoRequest<super::FilterBatchRequest>,
-        ) -> std::result::Result<
-            tonic::Response<super::FilterBatchResponse>,
-            tonic::Status,
-        > {
-            self.inner
-                .ready()
-                .await
-                .map_err(|e| {
-                    tonic::Status::new(
-                        tonic::Code::Unknown,
-                        format!("Service was not ready: {}", e.into()),
-                    )
-                })?;
-            let codec = tonic::codec::ProstCodec::default();
-            let path = http::uri::PathAndQuery::from_static(
-                "/generate.v2.TextGenerationService/FilterBatch",
-            );
-            let mut req = request.into_request();
-            req.extensions_mut()
-                .insert(
-                    GrpcMethod::new("generate.v2.TextGenerationService", "FilterBatch"),
-                );
-            self.inner.unary(req, path, codec).await
-        }
-        /// / Warmup the model and compute max cache size
-        pub async fn warmup(
-            &mut self,
-            request: impl tonic::IntoRequest<super::WarmupRequest>,
-        ) -> std::result::Result<tonic::Response<super::WarmupResponse>, tonic::Status> {
-            self.inner
-                .ready()
-                .await
-                .map_err(|e| {
-                    tonic::Status::new(
-                        tonic::Code::Unknown,
-                        format!("Service was not ready: {}", e.into()),
-                    )
-                })?;
-            let codec = tonic::codec::ProstCodec::default();
-            let path = http::uri::PathAndQuery::from_static(
-                "/generate.v2.TextGenerationService/Warmup",
-            );
-            let mut req = request.into_request();
-            req.extensions_mut()
-                .insert(GrpcMethod::new("generate.v2.TextGenerationService", "Warmup"));
-            self.inner.unary(req, path, codec).await
-        }
-        /// / Prefill batch and decode first token
-        pub async fn prefill(
-            &mut self,
-            request: impl tonic::IntoRequest<super::PrefillRequest>,
-        ) -> std::result::Result<
-            tonic::Response<super::PrefillResponse>,
-            tonic::Status,
-        > {
-            self.inner
-                .ready()
-                .await
-                .map_err(|e| {
-                    tonic::Status::new(
-                        tonic::Code::Unknown,
-                        format!("Service was not ready: {}", e.into()),
-                    )
-                })?;
-            let codec = tonic::codec::ProstCodec::default();
-            let path = http::uri::PathAndQuery::from_static(
-                "/generate.v2.TextGenerationService/Prefill",
-            );
-            let mut req = request.into_request();
-            req.extensions_mut()
-                .insert(GrpcMethod::new("generate.v2.TextGenerationService", "Prefill"));
-            self.inner.unary(req, path, codec).await
-        }
-        /// / Decode token for a list of prefilled batches
-        pub async fn decode(
-            &mut self,
-            request: impl tonic::IntoRequest<super::DecodeRequest>,
-        ) -> std::result::Result<tonic::Response<super::DecodeResponse>, tonic::Status> {
-            self.inner
-                .ready()
-                .await
-                .map_err(|e| {
-                    tonic::Status::new(
-                        tonic::Code::Unknown,
-                        format!("Service was not ready: {}", e.into()),
-                    )
-                })?;
-            let codec = tonic::codec::ProstCodec::default();
-            let path = http::uri::PathAndQuery::from_static(
-                "/generate.v2.TextGenerationService/Decode",
-            );
-            let mut req = request.into_request();
-            req.extensions_mut()
-                .insert(GrpcMethod::new("generate.v2.TextGenerationService", "Decode"));
-            self.inner.unary(req, path, codec).await
-        }
-        /// / Health check
-        pub async fn health(
-            &mut self,
-            request: impl tonic::IntoRequest<super::HealthRequest>,
-        ) -> std::result::Result<tonic::Response<super::HealthResponse>, tonic::Status> {
-            self.inner
-                .ready()
-                .await
-                .map_err(|e| {
-                    tonic::Status::new(
-                        tonic::Code::Unknown,
-                        format!("Service was not ready: {}", e.into()),
-                    )
-                })?;
-            let codec = tonic::codec::ProstCodec::default();
-            let path = http::uri::PathAndQuery::from_static(
-                "/generate.v2.TextGenerationService/Health",
-            );
-            let mut req = request.into_request();
-            req.extensions_mut()
-                .insert(GrpcMethod::new("generate.v2.TextGenerationService", "Health"));
-            self.inner.unary(req, path, codec).await
-        }
-    }
-}
--- a/router/client/src/pb/mod.rs
+++ b/router/client/src/pb/mod.rs
@ -1,6 +0,0 @@
-// This file is @generated by prost-build.
-pub mod generate {
-    pub mod v2 {
-        include!("generate.v2.rs");
-    }
-}
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "2.0.4"
+version = "2.0.5-dev0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]

--- a/server/text_generation_server/layers/gptq/exllamav2.py
+++ b/server/text_generation_server/layers/gptq/exllamav2.py
@ -9,11 +9,15 @@ from loguru import logger

 from text_generation_server.layers.exl2 import Exl2Weight
 from text_generation_server.layers.gptq import GPTQWeight
+from text_generation_server.utils.log import log_master

 try:
-    from exllamav2_kernels import make_q_matrix, gemm_half_q_half
+    from exllamav2.ext import exllamav2_ext
+
+    make_q_matrix = exllamav2_ext.make_q_matrix
+    gemm_half_q_half = exllamav2_ext.gemm_half_q_half
 except ImportError:
-    logger.error("exllamav2_kernels not installed.")
+    log_master(logger.warning, "exllamav2_kernels not installed.")
    raise

 # Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
@ -69,6 +73,10 @@ def ext_make_q_matrix(
    """
    Create Q matrix
    """
+    # max_dq_size = 512*(1024**2)
+    # max_dq_rows = max_dq_size // out_features[0]
+    max_dq_rows = 0
+
    # EXL2
    if isinstance(w, Exl2Weight):
        extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0])
@ -82,10 +90,12 @@ def ext_make_q_matrix(
            w.q_scale_max,
            w.q_groups,
            extra.q_group_map,
-            none_tensor,
-            none_tensor,
-            none_tensor,
+            none_tensor,  # zeros
+            none_tensor,  # scales
+            none_tensor,  # g_idx
+            none_tensor,  # bias
            temp_dq,
+            max_dq_rows,
        )
    # GPTQ
    elif isinstance(w, GPTQWeight):
@ -105,29 +115,33 @@ def ext_make_q_matrix(
                w.qweight,
                extra.q_perm,
                extra.q_invperm,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
+                none_tensor,  # q_scale
+                none_tensor,  # q_scale_max
+                none_tensor,  # q_groups
+                none_tensor,  # q_group_map
                w.qzeros,
                w.scales,
                w.g_idx.cpu(),
+                none_tensor,  # bias
                temp_dq,
+                max_dq_rows,
            )
        # GPTQ without g_idx
        else:
            return make_q_matrix(
                w.qweight,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
-                none_tensor,
+                none_tensor,  # q_perm
+                none_tensor,  # q_invperm
+                none_tensor,  # q_scale
+                none_tensor,  # q_scale_max
+                none_tensor,  # q_groups
+                none_tensor,  # q_group_map
                w.qzeros,
                w.scales,
-                none_tensor,
+                none_tensor,  # g_idx
+                none_tensor,  # bias
                temp_dq,
+                max_dq_rows,
            )
    else:
        RuntimeError("Cannot create handle")
--- a/server/text_generation_server/layers/gptq/quant_linear.py
+++ b/server/text_generation_server/layers/gptq/quant_linear.py
@ -206,6 +206,7 @@ def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
        output = torch.empty(
            (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
        )
+
        def grid(META):
            return (
                triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])
--- a/tgi-entrypoint.sh
+++ b/tgi-entrypoint.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+# Refresh the dynamic linker cache and keep going if that fails
+ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
+# Quote the argument list so arguments containing spaces reach the launcher intact
+text-generation-launcher "$@"