[gaudi] Gemma3 sliding window support (#3280)

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Gaudi: add CI (#3160)
2025-07-01 21:40:16 +00:00 · 2025-07-01 10:06:01 +02:00 · 2025-06-24 18:51:09 +02:00 · 2025-06-23 11:15:39 +02:00 · 2025-06-19 10:52:41 +02:00 · 2025-06-19 09:32:34 +02:00
876 changed files with 170604 additions and 19161 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -2,3 +2,6 @@ aml
 target
 server/transformers
 server/flash-attention
+cmake-build-debug/
+cmake-build-release/
+Dockerfile*
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -5,14 +5,14 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: | 
+      description: |
        Please share your system info with us (`text-generation-launcher --env` if installed locally).
-        The full command line used that causes issues: 
+        The full command line used that causes issues:
        OS version:
        Rust version (if self-compiling, `cargo version`):
        Model being used (`curl 127.0.0.1:8080/info | jq`):
          If local model please explicit the kind of model and/or equivalents.
-        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`): 
+        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
        Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
        The current version being used:

@ -52,11 +52,11 @@ body:

      placeholder: |
        Steps to reproduce the behavior:
-          
+
          1.
          2.
          3.
-          
+

  - type: textarea
    id: expected-behavior
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@ -19,7 +19,7 @@ body:
      label: Motivation
      description: |
        Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
-        
+

  - type: textarea
    id: contribution
--- a/.github/workflows/autodocs.yaml
+++ b/.github/workflows/autodocs.yaml
@ -0,0 +1,45 @@
+name: Automatic Documentation for Launcher
+
+on:
+  pull_request:
+
+jobs:
+  # Installs the launcher and router from the checked-out sources, then runs
+  # `update_doc.py --check` as the final step.
+  update_docs:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Rust
+      uses: actions-rs/toolchain@v1
+      with:
+        profile: minimal
+        toolchain: stable
+
+    - name: Install Protocol Buffers compiler
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y protobuf-compiler libprotobuf-dev
+
+    - name: Install Launcher
+      id: install-launcher
+      run: cargo install --path launcher/
+
+    - name: Install router
+      id: install-router
+      run: cargo install --path backends/v3/
+
+    - uses: actions/setup-node@v4
+      with:
+        node-version: 22
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.x'
+
+    # The Redocly CLI is installed globally right before the check script runs.
+    - name: Check that documentation is up-to-date
+      run: |
+        npm install -g @redocly/cli
+        python update_doc.py --check
--- a/.github/workflows/autodocs.yml
+++ b/.github/workflows/autodocs.yml
@ -1,21 +0,0 @@
-name: Automatic Documentation for Launcher
-
-on:
-  pull_request:
-
-jobs:
-  update_docs:
-    runs-on: ubuntu-latest
-    
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v2
-    
-    - name: Install Launcher
-      id: install-launcher
-      run: cargo install --git https://github.com/${{ github.repository }} --branch ${{ github.head_ref }} text-generation-launcher
-    
-    - name: Check launcher Docs are up-to-date
-      run: |
-        echo text-generation-launcher --help
-        python update_doc.py --check
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@ -1,104 +1,183 @@
 name: Build and push docker image to internal registry

 on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'main'
-    tags:
-      - 'v*'
-  pull_request:
-    paths:
-      - ".github/workflows/build.yaml"
-      - "integration-tests/**"
-      - "server/**"
-      - "proto/**"
-      - "router/**"
-      - "launcher/**"
-      - "Cargo.lock"
-      - "rust-toolchain.toml"
-      - "Dockerfile"
-    branches:
-      - 'main'
+  workflow_call:
+    inputs:
+      hardware:
+        type: string
+        description: Hardware
+        # Values handled by the "Construct hardware variables" step:
+        # - cuda
+        # - cuda-trtllm
+        # - rocm
+        # - intel-xpu
+        # - intel-cpu
+        # - neuron
+        # - gaudi
+        required: true
+      release-tests:
+        description: "Run release integration tests"
+        required: true
+        default: false
+        type: boolean

 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-      EC2_AMI_ID: ami-03cfed9ea28f4b002
-      EC2_INSTANCE_TYPE: g5.12xlarge
-      EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
-      EC2_SECURITY_GROUP: sg-030175c435ac141d6
+  build-and-push:
    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-
-  build-and-push-image:
+      docker_image: ${{ steps.final.outputs.docker_image }}
+      docker_volume: ${{ steps.final.outputs.docker_volume }}
+      docker_devices: ${{ steps.final.outputs.docker_devices }}
+      runs_on: ${{ steps.final.outputs.runs_on }}
+      label_extension: ${{ steps.final.outputs.label_extension }}
+      extra_pytest: ${{ steps.final.outputs.extra_pytest }}
    concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    runs-on:
+      group: aws-highmemory-64-plus-priv
    permissions:
      contents: write
      packages: write
-      # This is used to complete the identity challenge
-      # with sigstore/fulcio when running outside of PRs.
      id-token: write
-      security-events: write
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v3
-      - name: Initialize Docker Buildx
-        uses: docker/setup-buildx-action@v2.0.0
-        with:
-          install: true
+        uses: actions/checkout@v4
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
-      - name: Tailscale
-        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
+      - name: Inject required variables for sccache to interact with Github Actions Cache
+        uses: actions/github-script@v7
        with:
-          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+          script: |
+            core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
+            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+      - name: Extract TensorRT-LLM version
+        run: |
+          # Values appended to $GITHUB_ENV only become visible in later steps, and
+          # `${{ env.* }}` is expanded before this script starts, so the version is
+          # kept in a shell variable for the log line below.
+          trtllm_version="$(grep -oP '([a-z,0-9]{40})' "$GITHUB_WORKSPACE/backends/trtllm/cmake/trtllm.cmake" || true)"
+          echo "TENSORRT_LLM_VERSION=${trtllm_version}" >> "$GITHUB_ENV"
+          echo "TensorRT-LLM version: ${trtllm_version}"
+      - name: Construct hardware variables
+        shell: bash
+        run: |
+          case ${{ inputs.hardware }} in
+            cuda)
+                export dockerfile="Dockerfile"
+                export label_extension=""
+                export docker_volume="/mnt/cache"
+                export docker_devices=""
+                export runs_on="aws-g6-12xl-plus-priv-cache"
+                export platform=""
+                export extra_pytest=""
+                export target=""
+                ;;
+            cuda-trtllm)
+                export dockerfile="Dockerfile_trtllm"
+                export label_extension="-trtllm"
+                export docker_volume="/mnt/cache"
+                export docker_devices=""
+                export runs_on="ubuntu-latest"
+                export platform=""
+                export extra_pytest=""
+                if [[ "${GITHUB_REF}" == refs/tags/* ]]; then
+                  export build_type="release";
+                  export target="";
+                else
+                  export build_type="dev";
+                  export target="ci-runtime";
+                fi
+                ;;
+            rocm)
+                export dockerfile="Dockerfile_amd"
+                export label_extension="-rocm"
+                export docker_devices="/dev/kfd,/dev/dri"
+                export docker_volume="/mnt"
+                # This runner was deactivated.
+                export runs_on="ubuntu-latest"
+                export platform=""
+                export extra_pytest="-k test_flash_gemma_gptq_load"
+                export target=""
+                ;;
+            intel-xpu)
+                export dockerfile="Dockerfile_intel"
+                export label_extension="-intel-xpu"
+                export docker_devices=""
+                export docker_volume="/mnt/cache"
+                export runs_on="ubuntu-latest"
+                export platform="xpu"
+                export extra_pytest=""
+                export target=""
+                ;;
+            intel-cpu)
+                export dockerfile="Dockerfile_intel"
+                export label_extension="-intel-cpu"
+                export docker_devices="none"
+                export docker_volume="/mnt/cache"
+                # export runs_on="ubuntu-latest"
+                export runs_on="aws-highmemory-32-plus-priv"
+                export platform="cpu"
+                export extra_pytest="-k test_flash_gemma_simple"
+                export target=""
+                ;;
+            neuron)
+                export dockerfile="Dockerfile.neuron"
+                export label_extension="-neuron"
+                export docker_devices="/dev/neuron0"
+                export docker_volume="/mnt/cache"
+                export runs_on="aws-inf2-8xlarge"
+                export platform="cpu"
+                export extra_pytest="--neuron"
+                export target=""
+                ;;
+            gaudi)
+                export dockerfile="Dockerfile_gaudi"
+                export label_extension="-gaudi"
+                export docker_volume="/mnt/cache"
+                export docker_devices=""
+                export runs_on="itac-bm-emr-gaudi3-dell-2gaudi"
+                export platform=""
+                export extra_pytest="--gaudi"
+                export target=""
+                ;;
+            *)
+                # Fail fast: without this arm an unknown value leaves every
+                # variable empty and the build proceeds with no Dockerfile.
+                echo "::error::Unsupported 'hardware' input (expected one of: cuda, cuda-trtllm, rocm, intel-xpu, intel-cpu, neuron, gaudi)" >&2
+                exit 1
+                ;;
+          esac
+          echo $dockerfile
+          echo "Dockerfile=${dockerfile}"
+          echo $label_extension
+          echo $docker_devices
+          echo $runs_on
+          echo $platform
+          echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
+          echo "LABEL_EXTENSION=${label_extension}" >> $GITHUB_ENV
+          echo "PLATFORM=${platform}" >> $GITHUB_ENV
+          echo "DOCKER_VOLUME=${docker_volume}" >> $GITHUB_ENV
+          echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
+          echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
+          echo "EXTRA_PYTEST=${extra_pytest}" >> $GITHUB_ENV
+          echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV
+          echo "TARGET=${target}" >> $GITHUB_ENV
+          echo "BUILD_TYPE=${build_type}" >> $GITHUB_ENV
+      - name: Initialize Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          install: true
+          buildkitd-config: /tmp/buildkitd.toml
+      - name: Login to internal Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.REGISTRY_USERNAME }}
+          password: ${{ secrets.REGISTRY_PASSWORD }}
+          registry: registry.internal.huggingface.tech
      - name: Login to GitHub Container Registry
        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Login to internal Container Registry
-        uses: docker/login-action@v2.1.0
+      - name: Login to Docker Hub Container Registry
+        uses: docker/login-action@v3
        with:
-          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
-          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
-          registry: registry.internal.huggingface.tech
+          registry: docker.io
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      - name: Login to Azure Container Registry
        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2.1.0
+        uses: docker/login-action@v3
        with:
          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
@ -107,12 +186,12 @@ jobs:
      - name: Extract metadata (tags, labels) for Docker
        if: ${{ github.event_name == 'pull_request' }}
        id: meta-pr
-        uses: docker/metadata-action@v4.3.0
+        uses: docker/metadata-action@v5
        with:
          images: |
-            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+            docker.io/huggingface/text-generation-inference-ci
          tags: |
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }}
      # If main, release or tag
      - name: Extract metadata (tags, labels) for Docker
        if: ${{ github.event_name != 'pull_request' }}
@ -120,89 +199,129 @@ jobs:
        uses: docker/metadata-action@v4.3.0
        with:
          flavor: |
-            latest=auto
+            latest=false
          images: |
            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
            ghcr.io/huggingface/text-generation-inference
            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
          tags: |
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+            type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
+            type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}
+            type=raw,value=latest${{ env.LABEL_EXTENSION }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }}
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: Dockerfile
+          file: ${{ env.DOCKERFILE }}
          push: true
          platforms: 'linux/amd64'
          build-args: |
            GIT_SHA=${{ env.GITHUB_SHA }}
-            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL_EXTENSION }}
+            PLATFORM=${{ env.PLATFORM }}
+            build_type=${{ env.BUILD_TYPE }}
+            sccache_gha_enabled=on
+            actions_results_url=${{ env.ACTIONS_RESULTS_URL }}
+            actions_runtime_token=${{ env.ACTIONS_RUNTIME_TOKEN }}
+          target: ${{ env.TARGET }}
          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
+          # `mode` is a cache export option, so it is given once and only on cache-to.
+          cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL_EXTENSION }},access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }}
+          cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL_EXTENSION }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }}
+      - name: Final
+        id: final
+        run: |

-  integration-tests:
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "docker_image=docker.io/huggingface/text-generation-inference-ci:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
+          else
+            echo "docker_image=ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
+          fi
+          echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
+          echo "docker_volume=${{ env.DOCKER_VOLUME }}" >> "$GITHUB_OUTPUT"
+          echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
+          echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
+          echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT"
+  precompile_neuron_models:
    concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
-    needs:
-      - start-runner
-      - build-and-push-image # Wait for the docker image to be built
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    needs: build-and-push
+    if: needs.build-and-push.outputs.label_extension == '-neuron'
+    runs-on:
+      group: ${{ needs.build-and-push.outputs.runs_on }}
    env:
-      DOCKER_VOLUME: /cache
+      # NOTE(review): both branches of this expression yield '--release', so the
+      # tag / main / release-tests condition currently has no effect — confirm
+      # whether the fallback was meant to be a different value.
+      PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}
    steps:
-      - uses: actions/checkout@v2
+      - name: Checkout repository
+        uses: actions/checkout@v4
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
-          python-version: 3.9
-      - name: Tailscale
-        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-        with:
-          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
-      - name: Prepare disks
+          python-version: "3.11"
+      - name: Install
        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
+          make install-integration-tests
+      - name: Export neuron models
+        run: |
+          export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
+          echo $DOCKER_IMAGE
+          docker pull $DOCKER_IMAGE
+          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
+          python integration-tests/fixtures/neuron/export_models.py
+  integration_tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    needs: [precompile_neuron_models, build-and-push]
+    if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }}
+    runs-on:
+      group: ${{ needs.build-and-push.outputs.runs_on }}
+    env:
+      # NOTE(review): both branches of this expression yield '--release', so the
+      # tag / main / release-tests condition currently has no effect — confirm
+      # whether the fallback was meant to be a different value.
+      PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
      - name: Install
        run: |
          make install-integration-tests
      - name: Run tests
        run: |
-          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-          pytest -s -vv integration-tests
+          export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
+          export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
+          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
+          export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
+          echo $DOCKER_IMAGE
+          docker pull $DOCKER_IMAGE
+          pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
+
+  backend_trtllm_cxx_tests:
+    needs: build-and-push
+    if: needs.build-and-push.outputs.label_extension == '-trtllm'
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-trtllm-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on:
+      group: aws-g6-12xl-plus-priv-cache
+    container:
+      image: ${{ needs.build-and-push.outputs.docker_image }}
+      credentials:
+        username: ${{ secrets.DOCKERHUB_USERNAME }}
+        password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      options: --gpus all --shm-size=8g

-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - build-and-push-image
-      - integration-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+      # NOTE(review): no LABEL_EXTENSION env is defined for this job in the visible
+      # workflow, and 'ci-runtime' is the value assigned to `target` (not to the
+      # label extension) in the build job, so this step looks like it is always
+      # skipped — confirm the intended condition.
+      - name: Run C++/CUDA tests
+        if: ${{ env.LABEL_EXTENSION == 'ci-runtime' }}
+        run: /usr/local/tgi/bin/tgi_trtllm_backend_tests
--- a/.github/workflows/build_documentation.yaml
+++ b/.github/workflows/build_documentation.yaml
--- a/.github/workflows/build_pr_documentation.yaml
+++ b/.github/workflows/build_pr_documentation.yaml
@ -16,4 +16,4 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: text-generation-inference
-      additional_args: --not_python_module 
+      additional_args: --not_python_module
--- a/.github/workflows/ci_build.yaml
+++ b/.github/workflows/ci_build.yaml
@ -0,0 +1,52 @@
+name: CI build
+
+on:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - 'v*'
+  pull_request:
+    # Lists both workflow files and every Dockerfile that build.yaml selects
+    # for a matrix entry, so a change to any of them triggers a build.
+    paths:
+      - ".github/workflows/build.yaml"
+      - ".github/workflows/ci_build.yaml"
+      - "integration-tests/**"
+      - "backends/**"
+      - "server/**"
+      - "proto/**"
+      - "router/**"
+      - "launcher/**"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+      - "Dockerfile"
+      - "Dockerfile_trtllm"
+      - "Dockerfile_amd"
+      - "Dockerfile_intel"
+      - "Dockerfile.neuron"
+      - "Dockerfile_gaudi"
+    branches:
+      - "main"
+  workflow_dispatch:
+    inputs:
+      release-tests:
+        description: "Run release integration tests"
+        required: true
+        default: false
+        type: boolean
+
+jobs:
+  # One invocation of the reusable build workflow per hardware target.
+  build:
+    strategy:
+      # super important if you want to see all results, even if one fails
+      # fail-fast is true by default
+      fail-fast: false
+      matrix:
+        hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu", "neuron", "gaudi"]
+    uses: ./.github/workflows/build.yaml # reusable build workflow
+    permissions:
+      contents: write
+      packages: write
+      id-token: write
+    with:
+      hardware: ${{ matrix.hardware }}
+      # https://github.com/actions/runner/issues/2206
+      release-tests: ${{ inputs.release-tests == true }}
+    secrets: inherit
--- a/.github/workflows/client-tests.yaml
+++ b/.github/workflows/client-tests.yaml
@ -22,4 +22,5 @@ jobs:
      - name: Run tests
        run: |
          pip install pytest pytest-asyncio
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          make python-client-tests
--- a/.github/workflows/delete_doc_comment.yml
+++ b/.github/workflows/delete_doc_comment.yml
@ -1,12 +0,0 @@
-name: Delete doc comment
-
-on:
-  pull_request:
-    types: [ closed ]
-
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
-    with:
-      pr_number: ${{ github.event.number }}
--- a/.github/workflows/integration_tests.yaml
+++ b/.github/workflows/integration_tests.yaml
@ -0,0 +1,41 @@
+name: Integration tests
+
+on:
+  workflow_call:
+    inputs:
+      docker_image:
+        type: string
+        description: Docker image exported as DOCKER_IMAGE for the test run
+        required: true
+      docker_devices:
+        type: string
+        description: Value exported as DOCKER_DEVICES for the test run
+        required: false
+      runs_on:
+        type: string
+        required: true
+        description: Hardware to run integration tests
+jobs:
+  # Installs the integration-test dependencies and runs the pytest suite on
+  # the runner named by `runs_on`.
+  integration_tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on: ${{ inputs.runs_on }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          # Quoted so the version is always read as a string, never a float.
+          python-version: "3.9"
+      - name: Install
+        run: |
+          make install-integration-tests
+      - name: Run tests
+        run: |
+          export DOCKER_VOLUME=/mnt/cache
+          export DOCKER_IMAGE=${{ inputs.docker_image }}
+          export DOCKER_DEVICES=${{ inputs.docker_devices }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
+          pytest -s -vv integration-tests
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@ -3,106 +3,48 @@ name: Nightly load test
 on:
  schedule:
    - cron: '0 0 * * 1-5'
+  workflow_call:
+  workflow_dispatch:

  pull_request:
    paths:
      - ".github/workflows/load_test.yaml"
-    branches:
-      - 'main'
+
+env:
+  AWS_DEFAULT_REGION: us-east-1
+  AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}

 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-      EC2_AMI_ID: ami-0ab09c07cfd194259
-      EC2_INSTANCE_TYPE: g5.12xlarge
-      EC2_SUBNET_ID: subnet-988fd9f2,subnet-6f56db13,subnet-6a039326
-      EC2_SECURITY_GROUP: sg-072f92ae3082936c6
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-
  load-tests:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    runs-on:
+      group: aws-g6-12xl-plus-priv-cache
    env:
      DOCKER_VOLUME: /cache
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
-
-      - name: Install k6
-        run: |
-          curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
-
-      - name: Start starcoder
-        run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
-          sleep 10
-          wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
-
-      - name: Run k6
-        run: |
-          ./k6 run load_tests/starcoder_load.js
-
-      - name: Stop starcoder
-        if: ${{ always() }}
-        run: |
-          docker stop tgi-starcoder || true
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - load-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
+      - name: Install Python 3.11
+        uses: actions/setup-python@v2
        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+          python-version: "3.11"
+
+      - name: Install poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python3 -
+          export PATH="$HOME/.local/bin:$PATH"
+          poetry --version
+
+      - name: Run bench test
+        run: |
+          export PATH="$HOME/.local/bin:$PATH"
+          cd load_tests
+          poetry install
+          poetry run python benchmarks.py --sha ${{ github.sha }} --results-file "s3://text-generation-inference-ci/benchmarks/ci/${{ github.sha }}.parquet"
+        shell: bash
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN_BENCHMARK }}
--- a/.github/workflows/nix_build.yaml
+++ b/.github/workflows/nix_build.yaml
@ -0,0 +1,53 @@
+name: "Nix Build Docker image"
+on:
+  pull_request:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - 'v*'
+concurrency:  # PR runs share one group per head branch (a newer run cancels the older one); pushes fall back to the unique run_id
+  group: nix-image-${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build_nix_image:
+    runs-on:
+      group: aws-highmemory-32-plus-priv
+    steps:
+    - uses: actions/checkout@v4
+    - uses: cachix/install-nix-action@v27
+      with:
+        nix_path: nixpkgs=channel:nixos-unstable
+    - uses: cachix/cachix-action@v14
+      with:
+        name: huggingface
+        # If you chose signing key for write access
+        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+      env:
+        USER: github_runner
+    - name: Build  # the build output is the ./result path that the push step resolves with readlink
+      run: nix build .#dockerImage
+    - name: Initialize Docker Buildx
+      uses: docker/setup-buildx-action@v3
+      with:
+        install: true
+        buildkitd-config: /tmp/buildkitd.toml
+    - name: Inject slug/short variables  # provides the GITHUB_SHA_SHORT variable used for the pull-request tag below
+      uses: rlespinasse/github-slug-action@v4.4.1
+    - name: Login to internal Container Registry
+      # if: github.event_name != 'pull_request'  (left disabled: pull-request builds are pushed too, see "Push to docker")
+      uses: docker/login-action@v3
+      with:
+        username: ${{ secrets.REGISTRY_USERNAME }}
+        password: ${{ secrets.REGISTRY_PASSWORD }}
+        registry: registry.internal.huggingface.tech
+    - name: Push to docker  # tag is nix-sha-<short sha> for pull requests, <ref name>-nix otherwise
+      run: |
+        if [ "${{ github.event_name }}" = "pull_request" ]; then
+          export TAG=nix-sha-${{ env.GITHUB_SHA_SHORT }}
+        else
+          export TAG=${{ github.ref_name }}-nix
+        fi
+        export IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:$TAG
+        nix-shell -p skopeo --command "skopeo --insecure-policy copy docker-archive:$(readlink -f ./result) docker://$IMAGE --dest-compress-format zstd"
--- a/.github/workflows/nix_cache.yaml
+++ b/.github/workflows/nix_cache.yaml
@ -0,0 +1,34 @@
+name: "Cache devshells"
+on:
+  pull_request:
+    paths:  # only pull requests that touch the flake, its lock file or anything under nix/ trigger this workflow
+      - "flake.nix"
+      - "flake.lock"
+      - "nix/**"
+concurrency:  # PR runs share one group per head branch; a newer run cancels the one still in progress
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  tests:
+    runs-on:
+      group: aws-highmemory-32-plus-priv
+    steps:
+      - uses: actions/checkout@v4
+      - uses: cachix/install-nix-action@v27
+        with:
+          nix_path: nixpkgs=channel:nixos-unstable
+      - uses: cachix/cachix-action@v14
+        with:
+          name: huggingface
+          # If you chose signing key for write access
+          authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"
+        env:
+          USER: github_runner
+      - name: Build impure devshell
+        run: nix build .\#devShells.x86_64-linux.impure
+      - name: Build impure devshell (CUDA dev)
+        run: nix build .\#devShells.x86_64-linux.impureWithCuda
+      # Pure shell dependencies are covered by Nix tests.
+      # - name: Build pure devshell
+      #   run: nix build .\#devShells.x86_64-linux.pure
--- a/.github/workflows/nix_tests.yaml
+++ b/.github/workflows/nix_tests.yaml
@ -0,0 +1,42 @@
+name: "Nix Tests"
+on:
+  pull_request:
+    paths:  # only pull requests touching this workflow, the listed source trees or the Rust lock/toolchain files run it
+      - ".github/workflows/nix_tests.yaml"
+      - "server/**"
+      - "proto/**"
+      - "router/**"
+      - "launcher/**"
+      - "backends/**"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+concurrency:  # PR runs share one group per head branch; a newer run cancels the one still in progress
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  tests:
+    runs-on:
+      group: aws-highmemory-32-plus-priv
+    steps:
+    - uses: actions/checkout@v4
+    - uses: cachix/install-nix-action@v27
+      with:
+        nix_path: nixpkgs=channel:nixos-unstable
+    - uses: cachix/cachix-action@v14
+      with:
+        name: huggingface
+        # If you chose signing key for write access
+        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
+      env:
+        USER: github_runner
+    - name: Build  # enters the .#test devshell once and only echoes "Ok"; the later steps reuse the same shell
+      run: nix develop .#test --command echo "Ok"
+    - name: Pre-commit tests.
+      run: nix develop .#test --command pre-commit run --all-files
+    - name: Python tests.
+      run: nix develop .#test --command python -m pytest server/tests/
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    - name: Rust tests.
+      run: nix develop .#test --command cargo test
--- a/.github/workflows/stale.yaml
+++ b/.github/workflows/stale.yaml
@ -0,0 +1,14 @@
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'  # every day at 01:30 (GitHub evaluates schedules in UTC)
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v8
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
+          days-before-stale: 30  # keep in sync with the "30 days" quoted in the message above
+          days-before-close: 5  # keep in sync with the "5 days" quoted in the message above
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@ -8,6 +8,7 @@ on:
      - "proto/**"
      - "router/**"
      - "launcher/**"
+      - "backends/**"
      - "Cargo.lock"
      - "rust-toolchain.toml"

@ -17,66 +18,51 @@ concurrency:

 jobs:
  run_tests:
-    runs-on: ubuntu-latest
-
-    env:
-      SCCACHE_GHA_ENABLED: "on"
-      RUSTC_WRAPPER: /usr/local/bin/sccache
-      SCCACHE: 0.3.3
-
+    runs-on:
+      group: aws-highmemory-32-plus-priv
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Set up Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v4
+        id: python
        with:
-          python-version: 3.9
-      - name: Install Rust
-        uses: actions-rs/toolchain@v1
+          python-version: "3.11"  # quoted so YAML keeps it a string (an unquoted 3.10 would load as 3.1)
+      - uses: dtolnay/rust-toolchain@1.85.0
        with:
-          toolchain: 1.71.0
-          override: true
          components: rustfmt, clippy
      - name: Install Protoc
        uses: arduino/setup-protoc@v1
-      - name: Install sccache
+      - name: Clean unused files
        run: |
-          curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache
-          chmod +x /usr/local/bin/sccache
-      - name: configure sccache
-        uses: actions/github-script@v6
-        with:
-          script: |
-            core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
-            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
-            core.exportVariable('SCCACHE_GHA_CACHE_TO', 'sccache-${{runner.os}}-${{github.ref_name}}');
-            core.exportVariable('SCCACHE_GHA_CACHE_FROM', 'sccache-${{runner.os}}-main,sccache-${{runner.os}}-');
-      - name: cargo registry cache
-        uses: actions/cache@v3
-        with:
-          key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-${{ github.sha }}
-          restore-keys: |
-            cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-
-            cargo-${{ runner.os }}-
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
+          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
+          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
      - name: Install
        run: |
-          make install
+          sudo apt update
+          sudo apt install python3.11-dev -y
+          pip install -U pip uv
+          uv venv
+          source ./.venv/bin/activate
+          make install-cpu
+      - name: Download locked kernels
+        run: |
+          source ./.venv/bin/activate
+          kernels download server
      - name: Run server tests
        run: |
-          pip install pytest
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          source ./.venv/bin/activate
+          uv pip install pytest
          pytest -s -vv server/tests
+        env:  # the token reaches pytest through the step environment instead of being pasted into the script text
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      - name: Run Rust fmt
+      - name: Pre-commit checks
        run: |
-          cargo fmt --check
-      - name: Run Rust clippy
-        run: |
-          cargo clippy
+          pip install pre-commit
+          pre-commit install
+          pre-commit run --all-files
      - name: Run Rust tests
        run: |
          cargo test
-      - name: sccache stats
+      - name: Run Rust tests with google feature
        run: |
-          /usr/local/bin/sccache --show-stats
+          cargo test --features google
--- a/.github/workflows/trufflehog.yaml
+++ b/.github/workflows/trufflehog.yaml
@ -0,0 +1,21 @@
+on:
+  push:  # no branch or path filter: runs on every push
+
+name: Secret Leaks
+
+permissions:
+  contents: read  # the workflow token only gets read access to repository contents
+
+jobs:
+  trufflehog:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # fetch the full history instead of checkout's default single-commit clone
+      - name: Secret Scanning
+        uses: trufflesecurity/trufflehog@853e1e8d249fd1e29d0fcc7280d29b03df3d643d  # pinned to a commit SHA rather than a tag
+        with:
+          # exclude buggy postgres detector that is causing false positives and not relevant to our codebase
+          extra_args: --results=verified,unknown --exclude-detectors=postgres
--- a/.github/workflows/upload_pr_documentation.yaml
+++ b/.github/workflows/upload_pr_documentation.yaml
@ -13,4 +13,4 @@ jobs:
      package_name: text-generation-inference
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
-      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
+      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,30 @@
 target
 router/tokenizer.json
 *__pycache__*
+
+backends/v2/src/client/pb
+backends/v3/src/client/pb
+backends/client/src/v2/pb
+backends/client/src/v3/pb
+
+# ROCm auto-generated files
+*.hip
+server/exllamav2
+server/exllama_kernels/exllama_kernels/hip/
+server/exllama_kernels/exllama_kernels/hip_func/
+*_hip.cuh
+server/exllama_kernels/exllama_kernels/hip_buffers.cuh
+server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
+
+data/
+load_tests/*.json
+server/fbgemm
+
+.direnv/
+.venv/
+
+# Gaudi auto-generated files
+hl-smi_log*.txt
+.graph_dumps
+out
+hqt_output
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,24 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks  # generic file hygiene hooks
+    rev: v4.5.0
+    hooks:
+    -   id: check-yaml
+    -   id: end-of-file-fixer
+        exclude: crate-hashes.json
+    -   id: trailing-whitespace
+        exclude: docs/source/reference/launcher.md
+-   repo: https://github.com/psf/black  # Python formatter
+    rev: 24.2.0
+    hooks:
+    -   id: black
+-   repo: https://github.com/doublify/pre-commit-rust  # Rust hooks: cargo check, rustfmt, clippy
+    rev: v1.0
+    hooks:
+    -   id: cargo-check
+    -   id: fmt
+    -   id: clippy
+-   repo: https://github.com/astral-sh/ruff-pre-commit  # Python linter
+    rev: v0.3.0
+    hooks:
+    -   id: ruff
+        args: [--fix, --exit-non-zero-on-fix]  # apply fixes automatically, but still fail the hook when anything was fixed
--- a/.redocly.lint-ignore.yaml
+++ b/.redocly.lint-ignore.yaml
@ -0,0 +1,82 @@
+# This file instructs Redocly's linter to ignore the rules contained for specific parts of your API.
+# See https://redoc.ly/docs/cli/ for more information.
+docs/openapi.json:
+  no-empty-servers:
+    - '#/openapi'
+  spec:
+    - >-
+      #/components/schemas/GenerateParameters/properties/best_of/exclusiveMinimum
+    - >-
+      #/components/schemas/GenerateParameters/properties/frequency_penalty/exclusiveMinimum
+    - '#/components/schemas/GenerateParameters/properties/grammar/nullable'
+    - >-
+      #/components/schemas/GenerateParameters/properties/repetition_penalty/exclusiveMinimum
+    - '#/components/schemas/GenerateParameters/properties/seed/exclusiveMinimum'
+    - >-
+      #/components/schemas/GenerateParameters/properties/temperature/exclusiveMinimum
+    - '#/components/schemas/GenerateParameters/properties/top_k/exclusiveMinimum'
+    - >-
+      #/components/schemas/GenerateParameters/properties/top_n_tokens/exclusiveMinimum
+    - '#/components/schemas/GenerateParameters/properties/top_p/exclusiveMinimum'
+    - >-
+      #/components/schemas/GenerateParameters/properties/typical_p/exclusiveMinimum
+    - '#/components/schemas/GenerateResponse/properties/details/nullable'
+    - '#/components/schemas/StreamResponse/properties/details/nullable'
+    - '#/components/schemas/ChatRequest/properties/response_format/nullable'
+    - '#/components/schemas/ChatRequest/properties/stream_options/nullable'
+    - '#/components/schemas/ChatRequest/properties/tool_choice/nullable'
+    - '#/components/schemas/ToolChoice/nullable'
+    - '#/components/schemas/ChatCompletionComplete/properties/logprobs/nullable'
+    - '#/components/schemas/ChatCompletionChunk/properties/usage/nullable'
+    - '#/components/schemas/ChatCompletionChoice/properties/logprobs/nullable'
+  no-invalid-media-type-examples:
+    - '#/paths/~1/post/responses/422/content/application~1json/example'
+    - '#/paths/~1/post/responses/424/content/application~1json/example'
+    - '#/paths/~1/post/responses/429/content/application~1json/example'
+    - '#/paths/~1/post/responses/500/content/application~1json/example'
+    - '#/paths/~1generate/post/responses/422/content/application~1json/example'
+    - '#/paths/~1generate/post/responses/424/content/application~1json/example'
+    - '#/paths/~1generate/post/responses/429/content/application~1json/example'
+    - '#/paths/~1generate/post/responses/500/content/application~1json/example'
+    - >-
+      #/paths/~1generate_stream/post/responses/422/content/text~1event-stream/example
+    - >-
+      #/paths/~1generate_stream/post/responses/424/content/text~1event-stream/example
+    - >-
+      #/paths/~1generate_stream/post/responses/429/content/text~1event-stream/example
+    - >-
+      #/paths/~1generate_stream/post/responses/500/content/text~1event-stream/example
+    - '#/paths/~1tokenize/post/responses/404/content/application~1json/example'
+    - >-
+      #/paths/~1v1~1chat~1completions/post/responses/422/content/application~1json/example
+    - >-
+      #/paths/~1v1~1chat~1completions/post/responses/424/content/application~1json/example
+    - >-
+      #/paths/~1v1~1chat~1completions/post/responses/429/content/application~1json/example
+    - >-
+      #/paths/~1v1~1chat~1completions/post/responses/500/content/application~1json/example
+    - >-
+      #/paths/~1v1~1completions/post/responses/422/content/application~1json/example
+    - >-
+      #/paths/~1v1~1completions/post/responses/424/content/application~1json/example
+    - >-
+      #/paths/~1v1~1completions/post/responses/429/content/application~1json/example
+    - >-
+      #/paths/~1v1~1completions/post/responses/500/content/application~1json/example
+  operation-4xx-response:
+    - '#/paths/~1health/get/responses'
+    - '#/paths/~1info/get/responses'
+    - '#/paths/~1metrics/get/responses'
+  no-unused-components:
+    - '#/components/schemas/Completion'
+  security-defined:
+    - '#/paths/~1/post'
+    - '#/paths/~1generate/post'
+    - '#/paths/~1generate_stream/post'
+    - '#/paths/~1health/get'
+    - '#/paths/~1info/get'
+    - '#/paths/~1metrics/get'
+    - '#/paths/~1tokenize/post'
+    - '#/paths/~1v1~1chat~1completions/post'
+    - '#/paths/~1v1~1completions/post'
+    - '#/paths/~1v1~1models/get'
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -0,0 +1,133 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+feedback@huggingface.co.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,120 @@
+<!---
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Contribute to text-generation-inference
+
+Everyone is welcome to contribute, and we value everybody's contribution. Code
+contributions are not the only way to help the community. Answering questions, helping
+others, and improving the documentation are also immensely valuable.
+
+It also helps us if you spread the word! Reference the library in blog posts
+about the awesome projects it made possible, shout out on Twitter every time it has
+helped you, or simply ⭐️ the repository to say thank you.
+
+However you choose to contribute, please be mindful and respect our
+[code of conduct](https://github.com/huggingface/text-generation-inference/blob/main/CODE_OF_CONDUCT.md).
+
+**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
+
+## Ways to contribute
+
+There are several ways you can contribute to text-generation-inference.
+
+* Fix outstanding issues with the existing code.
+* Submit issues related to bugs or desired new features.
+* Contribute to the examples or to the documentation.
+
+> All contributions are equally valuable to the community. 🥰
+
+## Fixing outstanding issues
+
+If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) and open
+a Pull Request!
+
+## Submitting a bug-related issue or feature request
+
+Do your best to follow these guidelines when submitting a bug-related issue or a feature
+request. It will make it easier for us to come back to you quickly and with good
+feedback.
+
+### Did you find a bug?
+
+The text-generation-inference library is robust and reliable thanks to users who report the problems they encounter.
+
+Before you report an issue, we would really appreciate it if you could **make sure the bug was not
+already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the
+library itself, and not your code.
+
+Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so
+we can quickly resolve it:
+
+* Your **OS type and version**, as well as your environment versions (versions of rust, python, and dependencies).
+* A short, self-contained, code snippet that allows us to reproduce the bug.
+* The *full* traceback if an exception is raised.
+* Attach any other additional information, like screenshots, you think may help.
+
+To get the OS and software versions automatically, you can re-run the launcher with the `--env` flag:
+
+```bash
+text-generation-launcher --env
+```
+
+This prints information about your environment before the model is launched. We recommend pasting
+that output in your issue report.
+
+### Do you want a new feature?
+
+If there is a new feature you'd like to see in text-generation-inference, please open an issue and describe:
+
+1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it
+   a feature related to something you need for a project? Is it something you worked on and think it could benefit
+   the community?
+
+   Whatever it is, we'd love to hear about it!
+
+2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better
+   we'll be able to help you.
+3. Provide a *code snippet* that demonstrates the feature's usage.
+4. If the feature is related to a paper, please include a link.
+
+If your issue is well written we're already 80% of the way there by the time you create it.
+
+We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE)
+to help you get started with your issue.
+
+## Do you want to implement a new model?
+
+New models are constantly released and if you want to implement a new model, please provide the following information:
+
+* A short description of the model and a link to the paper.
+* Link to the implementation if it is open-sourced.
+* Link to the model weights if they are available.
+
+If you are willing to contribute the model yourself, let us know so we can help you add it to text-generation-inference!
+
+## Do you want to add documentation?
+
+We're always looking for improvements to the documentation that make it clearer and more accurate. Please let us know
+how the documentation can be improved, for example by pointing out typos or content that is missing, unclear or inaccurate. We'll be
+happy to make the changes or help you make a contribution if you're interested!
+
+## I want to become a maintainer of the project. How do I get there?
+
+TGI is a project led and managed by Hugging Face as it powers our internal services. However, we are happy to have
+motivated individuals from other organizations join us as maintainers with the goal of making TGI the best inference
+service.
+
+If you are such an individual (or organization), please reach out to us and let's collaborate.
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,20 +1,54 @@
 [workspace]
 members = [
    "benchmark",
-    "router",
-    "router/client",
-    "router/grpc-metadata",
-    "launcher"
+    "backends/v2",
+    "backends/v3",
+    "backends/grpc-metadata",
+    "backends/trtllm",
+    "backends/llamacpp",
+    "launcher",
+    "router"
 ]
+default-members = [  # subset of `members`: backends/trtllm and backends/llamacpp are left out
+    "benchmark",
+    "backends/v2",
+    "backends/v3",
+    "backends/grpc-metadata",
+    # "backends/trtllm",
+    "launcher",
+    "router"
+]
+resolver = "2"

 [workspace.package]
-version = "1.1.1"
+version = "3.3.4-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"

+[workspace.dependencies]  # dependency versions declared once at workspace level for member crates to inherit
+base64 = "0.22.0"
+tokenizers = { version = "0.20.0", features = ["http"] }
+hf-hub = { version = "0.4.2", features = ["tokio"] }
+metrics = { version = "0.23.0" }
+metrics-exporter-prometheus = { version = "0.15.1", features = [] }
+minijinja = { version = "2.2.0", features = ["json"] }
+minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
+pyo3 = { version = "0.22.2", features = ["auto-initialize"] }
+
 [profile.release]
+incremental = true
+
+[profile.release-binary]  # release plus limited debug info (debug = 1) and panic = "abort"
+inherits = "release"
 debug = 1
 incremental = true
-lto = "off"
 panic = "abort"
+
+[profile.release-opt]  # fully optimized profile: fat LTO, a single codegen unit, no debug info, no incremental builds
+inherits = "release"
+debug = 0
+incremental = false
+lto = "fat"
+opt-level = 3
+codegen-units = 1
--- a/Dockerfile
+++ b/Dockerfile
@ -1,23 +1,25 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
 WORKDIR /usr/src

 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

-FROM chef as planner
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
 COPY benchmark benchmark
 COPY router router
+COPY backends backends
 COPY launcher launcher
+
 RUN cargo chef prepare --recipe-path recipe.json

 FROM chef AS builder

-ARG GIT_SHA
-ARG DOCKER_LABEL
-
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@ -25,32 +27,34 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    rm -f $PROTOC_ZIP

 COPY --from=planner /usr/src/recipe.json recipe.json
-RUN cargo chef cook --release --recipe-path recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json

+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
 COPY benchmark benchmark
 COPY router router
+COPY backends backends
 COPY launcher launcher
-RUN cargo build --release
+RUN cargo build --profile release-opt --frozen

 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM debian:bullseye-slim as pytorch-install
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
+WORKDIR /usr/src/
+
+# NOTE: When updating the PyTorch version, remember to remove `pip install nvidia-nccl-cu12==2.22.3` further down in this Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
+ARG PYTORCH_VERSION=2.7
+ARG PYTHON_VERSION=3.11

-ARG PYTORCH_VERSION=2.0.1
-ARG PYTHON_VERSION=3.9
 # Keep in sync with `server/pyproject.toml
-ARG CUDA_VERSION=11.8
-ARG MAMBA_VERSION=23.1.0-1
-ARG CUDA_CHANNEL=nvidia
-ARG INSTALL_CHANNEL=pytorch
 # Automatically set by buildx
 ARG TARGETPLATFORM

-ENV PATH /opt/conda/bin:$PATH
-
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
@ -58,104 +62,89 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
        curl \
        git && \
        rm -rf /var/lib/apt/lists/*
-
-# Install conda
-# translating Docker's TARGETPLATFORM into mamba arches
-RUN case ${TARGETPLATFORM} in \
-         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
-         *)              MAMBA_ARCH=x86_64   ;; \
-    esac && \
-    curl -fsSL -v -o ~/mambaforge.sh -O  "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
-RUN chmod +x ~/mambaforge.sh && \
-    bash ~/mambaforge.sh -b -p /opt/conda && \
-    rm ~/mambaforge.sh
-
-# Install pytorch
-# On arm64 we exit with an error code
-RUN case ${TARGETPLATFORM} in \
-         "linux/arm64")  exit 1 ;; \
-         *)              /opt/conda/bin/conda update -y conda &&  \
-                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
-    esac && \
-    /opt/conda/bin/conda clean -ya
+COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
+ENV PATH="$PATH:/root/.local/bin"
+RUN uv python install ${PYTHON_VERSION}
+RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} torchvision pip setuptools packaging
+ENV VIRTUAL_ENV=/usr/src/.venv/
+ENV PATH="$PATH:/usr/src/.venv/bin/"

 # CUDA kernels builder image
-FROM pytorch-install as kernel-builder
+FROM pytorch-install AS kernel-builder
+
+ARG MAX_JOBS=8
+ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"

 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-        ninja-build \
+        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*

-RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0"  cuda==11.8 && \
-    /opt/conda/bin/conda clean -ya
-
 # Build Flash Attention CUDA kernels
-FROM kernel-builder as flash-att-builder
+FROM kernel-builder AS flash-att-builder

 WORKDIR /usr/src

 COPY server/Makefile-flash-att Makefile

 # Build specific version of flash attention
-RUN make build-flash-attention
+RUN . .venv/bin/activate && make build-flash-attention

 # Build Flash Attention v2 CUDA kernels
-FROM kernel-builder as flash-att-v2-builder
+FROM kernel-builder AS flash-att-v2-builder

 WORKDIR /usr/src

 COPY server/Makefile-flash-att-v2 Makefile

 # Build specific version of flash attention v2
-RUN make build-flash-attention-v2
+RUN . .venv/bin/activate && make build-flash-attention-v2-cuda

 # Build Transformers exllama kernels
-FROM kernel-builder as exllama-kernels-builder
+FROM kernel-builder AS exllama-kernels-builder
 WORKDIR /usr/src
 COPY server/exllama_kernels/ .
+
+RUN . .venv/bin/activate && python setup.py build
+
+# Build Transformers exllamav2 kernels
+FROM kernel-builder AS exllamav2-kernels-builder
+WORKDIR /usr/src
+COPY server/Makefile-exllamav2/ Makefile
+
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN . .venv/bin/activate && make build-exllamav2

 # Build Transformers awq kernels
-FROM kernel-builder as awq-kernels-builder
+FROM kernel-builder AS awq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-awq Makefile
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
-
-# Build eetq kernels
-FROM kernel-builder as eetq-kernels-builder
-WORKDIR /usr/src
-COPY server/Makefile-eetq Makefile
-# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+RUN . .venv/bin/activate && make build-awq

 # Build Transformers CUDA kernels
-FROM kernel-builder as custom-kernels-builder
+FROM kernel-builder AS custom-kernels-builder
 WORKDIR /usr/src
 COPY server/custom_kernels/ .
 # Build specific version of transformers
-RUN python setup.py build
-
-# Build vllm CUDA kernels
-FROM kernel-builder as vllm-builder
+RUN . .venv/bin/activate && python setup.py build

+# Build mamba kernels
+FROM kernel-builder AS mamba-builder
 WORKDIR /usr/src
+COPY server/Makefile-selective-scan Makefile
+RUN . .venv/bin/activate && make build-all

-COPY server/Makefile-vllm Makefile
-
-# Build specific version of vllm
-RUN make build-vllm
+# Build flashinfer
+FROM kernel-builder AS flashinfer-builder
+WORKDIR /usr/src
+COPY server/Makefile-flashinfer Makefile
+RUN . .venv/bin/activate && make install-flashinfer

 # Text Generation Inference base image
-FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
-
-# Conda env
-ENV PATH=/opt/conda/bin:$PATH \
-    CONDA_PREFIX=/opt/conda
+FROM nvidia/cuda:12.4.0-base-ubuntu22.04 AS base

 # Text Generation Inference base env
-ENV HUGGINGFACE_HUB_CACHE=/data \
+ENV HF_HOME=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

@ -166,57 +155,84 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
        ca-certificates \
        make \
        curl \
+        git \
        && rm -rf /var/lib/apt/lists/*

-# Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
-
-# Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-# Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-# Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
+# RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+# ENV PATH="$PATH:/root/.local/bin"
+COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
 # Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
+# RUN pip install einops --no-cache-dir
+
+# Copy env with PyTorch installed
+COPY --from=pytorch-install /usr/src/.venv /usr/src/.venv
+ENV PYTHON_VERSION=3.11
+RUN uv python install ${PYTHON_VERSION}
+ENV VIRTUAL_ENV=/usr/src/.venv/
+ENV PATH="$PATH:/usr/src/.venv/bin/"

 # Install server
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
+ENV HF_KERNELS_CACHE=/kernels
 RUN cd server && \
-    make gen-server && \
-    pip install -r requirements.txt && \
-    pip install ".[bnb, accelerate, quantize]" --no-cache-dir
+	uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --extra torch --no-install-project --active && \
+    make gen-server-raw && \
+    kernels download .

-# Install benchmarker
-COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
-# Install router
-COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
-# Install launcher
-COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+RUN cd server && \
+    uv sync --frozen --extra gen --extra bnb --extra accelerate --extra compressed-tensors --extra quantize --extra peft --extra outlines --extra torch --active --python=${PYTHON_VERSION} && \
+    uv pip install nvidia-nccl-cu12==2.25.1 && \
+    pwd && \
+    text-generation-server --help

+# Copy build artifacts from flash attention builder
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+
+# Copy build artifacts from flash attention v2 builder
+COPY --from=flash-att-v2-builder /usr/src/.venv/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/.venv/lib/python3.11/site-packages
+
+# Copy build artifacts from custom kernels builder
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+# Copy build artifacts from mamba builder
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flashinfer-builder /usr/src/.venv/lib/python3.11/site-packages/flashinfer/ /usr/src/.venv/lib/python3.11/site-packages/flashinfer/
+
+
+# ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
+# Required to find libpython within the rust binaries
+# This is needed because exl2 tries to load flash-attn
+# And fails with our builds.
+ENV EXLLAMA_NO_FLASH_ATTN=1
+
+# Deps before the binaries
+# The binaries change on every build given we burn the SHA into them
+# The deps change less often.
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
        && rm -rf /var/lib/apt/lists/*

-# AWS Sagemaker compatbile image
-FROM base as sagemaker
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# AWS Sagemaker compatible image
+FROM base AS sagemaker

 COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
@ -226,5 +242,9 @@ ENTRYPOINT ["./entrypoint.sh"]
 # Final image
 FROM base

-ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/"
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+# CMD ["--json-output"]
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@ -0,0 +1,166 @@
+# Fetch and extract the TGI sources
+FROM alpine AS tgi
+RUN mkdir -p /tgi
+
+# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
+FROM alpine AS optimum-neuron
+RUN mkdir -p /optimum-neuron
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.0.tar.gz /optimum-neuron/sources.tar.gz
+RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1
+
+# Build cargo components (adapted from TGI original Dockerfile)
+# Note: we cannot use the cargo-chef base image as it uses python 3.11
+FROM ubuntu:22.04 AS chef
+
+RUN apt-get update -y \
+ && apt-get install -y --no-install-recommends \
+    curl ca-certificates build-essential \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.1 --profile minimal -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN cargo install cargo-chef --locked
+
+WORKDIR /usr/src
+
+FROM chef AS planner
+COPY backends/neuron/Cargo.toml Cargo.toml
+COPY Cargo.lock Cargo.lock
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY router router
+COPY backends backends
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+RUN apt-get update -y \
+ && apt-get install -y --no-install-recommends \
+    unzip python3-dev libssl-dev pkg-config \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY backends/neuron/Cargo.toml Cargo.toml
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --release --recipe-path recipe.json
+
+COPY Cargo.lock Cargo.lock
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY router router
+COPY backends backends
+COPY launcher launcher
+RUN cargo build --release
+
+# Python base image
+FROM ubuntu:22.04 AS base
+
+RUN apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+    python3-pip \
+    python3-setuptools \
+    python-is-python3 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+RUN pip3 --no-cache-dir install --upgrade pip
+
+# Python server build image
+FROM base AS pyserver
+
+RUN apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+    make \
+    python3-venv \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN install -d /pyserver
+WORKDIR /pyserver
+COPY backends/neuron/server server
+COPY proto proto
+RUN pip3 install -r server/build-requirements.txt
+RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server package
+
+# Neuron base image (used for deployment)
+FROM base AS neuron
+
+# Install system prerequisites
+RUN apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+    gnupg2 \
+    wget \
+    python3-dev \
+    libexpat1 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
+
+# Install neuronx packages
+RUN apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+    aws-neuronx-dkms=2.20.28.0 \
+    aws-neuronx-collectives=2.24.59.0-838c7fc8b \
+    aws-neuronx-runtime-lib=2.24.53.0-f239092cc \
+    aws-neuronx-tools=2.22.61.0 \
+    libxml2 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"
+
+# Manually install the CPU build of torch to avoid pulling CUDA (--no-cache-dir keeps pip's download cache out of the layer)
+RUN pip3 install --no-cache-dir \
+    torch==2.5.1 \
+    torchvision==0.20.1 \
+    --index-url https://download.pytorch.org/whl/cpu
+
+RUN pip3 install --no-cache-dir \
+    neuronx-cc==2.17.194.0 \
+    torch-neuronx==2.5.1.2.6.0 \
+    neuronx-distributed==0.11.0 \
+    libneuronxla==2.2.1630.0 \
+    --extra-index-url=https://pip.repos.neuron.amazonaws.com
+
+# Install HuggingFace packages
+RUN pip3 install --no-cache-dir \
+    hf_transfer huggingface_hub
+
+# Install optimum-neuron from the sources fetched in the optimum-neuron stage
+COPY --from=optimum-neuron /optimum-neuron optimum-neuron
+RUN pip3 install --no-cache-dir ./optimum-neuron
+
+# TGI base env
+ENV HUGGINGFACE_HUB_CACHE=/tmp \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+# Disable color logs as they are not supported by CloudWatch
+ENV LOGURU_COLORIZE=NO
+ENV LOG_COLORIZE=0
+
+# Install router
+COPY --from=builder /usr/src/target/release/text-generation-router-v2 /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+# Install python server
+COPY --from=pyserver /pyserver/build/dist dist
+RUN pip install dist/text_generation_server*.tar.gz
+
+# Final image
+FROM neuron
+
+COPY backends/neuron/tgi_entry_point.py /tgi_entry_point.py
+COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
--- a/Dockerfile.nix
+++ b/Dockerfile.nix
@ -0,0 +1,24 @@
+# Build the image and get out the docker file:
+#
+# docker build -t tgi-nix-builder -f Dockerfile.nix .
+# docker run --log-driver=none tgi-nix-builder | docker load
+
+FROM nixos/nix:2.18.8 AS builder
+RUN echo "experimental-features = nix-command flakes" >> /etc/nix/nix.conf
+RUN nix profile install nixpkgs#cachix
+RUN cachix use huggingface
+WORKDIR /root
+ADD . .
+RUN nix build .
+RUN mkdir /tmp/nix-store-closure
+RUN cp -R $(nix-store -qR result/) /tmp/nix-store-closure
+
+FROM ubuntu:24.04
+
+WORKDIR /app
+
+# Copy the Nix store closure and the build result (launcher lives in /app/bin)
+COPY --from=builder /tmp/nix-store-closure /nix/store
+COPY --from=builder /root/result /app
+RUN ldconfig
+CMD ["/bin/sh", "-c", "ldconfig && exec /app/bin/text-generation-launcher"]
--- a/314
+++ b/314
@ -0,0 +1,314 @@
+# Rust builder
+FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY backends backends
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY backends backends
+COPY launcher launcher
+RUN cargo build --profile release-opt --frozen
+
+FROM rocm/dev-ubuntu-22.04:6.3.1-complete AS base
+
+ARG HIPBLASLT_BRANCH="4d40e36"
+ARG HIPBLAS_COMMON_BRANCH="7c1566b"
+ARG LEGACY_HIPBLASLT_OPTION=
+ARG RCCL_BRANCH="648a58d"
+ARG RCCL_REPO="https://github.com/ROCm/rccl"
+ARG TRITON_BRANCH="e5be006"
+ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
+ARG PYTORCH_BRANCH="3a585126"
+ARG PYTORCH_VISION_BRANCH="v0.19.1"
+ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
+ARG FA_BRANCH="b7d29fb"
+ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
+ARG AITER_BRANCH="21d47a9"
+ARG AITER_REPO="https://github.com/ROCm/aiter.git"
+
+ENV PATH=/opt/rocm/llvm/bin:$PATH
+ENV ROCM_PATH=/opt/rocm
+ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+
+ARG PYTHON_VERSION=3.11
+
+RUN mkdir -p /app
+WORKDIR /app
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python and other dependencies
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        ccache \
+        curl \
+        git \
+        ninja-build \
+        cmake \
+        software-properties-common \
+        python3.11-dev \
+        python3.11-venv && \
+        rm -rf /var/lib/apt/lists/*
+
+COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
+ENV PATH="$PATH:/root/.local/bin"
+RUN uv python install ${PYTHON_VERSION}
+RUN uv venv --python ${PYTHON_VERSION} && uv pip install pip setuptools packaging
+ENV VIRTUAL_ENV=/app/.venv/
+ENV PATH="$PATH:/app/.venv/bin/"
+
+RUN . .venv/bin/activate && pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
+
+FROM base AS build_hipblaslt
+ARG HIPBLASLT_BRANCH
+ARG HIPBLAS_COMMON_BRANCH
+# Set to "--legacy_hipblas_direct" for ROCm<=6.2
+ARG LEGACY_HIPBLASLT_OPTION
+RUN git clone https://github.com/ROCm/hipBLAS-common.git
+RUN . .venv/bin/activate && cd hipBLAS-common \
+    && git checkout ${HIPBLAS_COMMON_BRANCH} \
+    && mkdir build \
+    && cd build \
+    && cmake .. \
+    && make package \
+    && dpkg -i ./*.deb
+RUN git clone https://github.com/ROCm/hipBLASLt
+RUN . .venv/bin/activate && cd hipBLASLt \
+    && git checkout ${HIPBLASLT_BRANCH} \
+    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && cd build/release \
+    && make package
+RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
+
+FROM base AS build_rccl
+ARG RCCL_BRANCH
+ARG RCCL_REPO
+RUN git clone ${RCCL_REPO}
+RUN . .venv/bin/activate && cd rccl \
+    && git checkout ${RCCL_BRANCH} \
+    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
+RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
+
+FROM base AS build_triton
+ARG TRITON_BRANCH
+ARG TRITON_REPO
+RUN git clone ${TRITON_REPO}
+RUN . .venv/bin/activate && cd triton \
+    && git checkout ${TRITON_BRANCH} \
+    && cd python \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
+
+FROM base AS build_amdsmi
+RUN . .venv/bin/activate && cd /opt/rocm/share/amd_smi \
+    && pip wheel . --wheel-dir=dist
+RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
+
+FROM base AS build_pytorch
+ARG PYTORCH_BRANCH
+ARG PYTORCH_VISION_BRANCH
+ARG PYTORCH_REPO
+ARG PYTORCH_VISION_REPO
+ARG FA_BRANCH
+ARG FA_REPO
+RUN git clone ${PYTORCH_REPO} pytorch
+RUN . .venv/bin/activate && cd pytorch && git checkout ${PYTORCH_BRANCH} && \
+    pip install -r requirements.txt && git submodule update --init --recursive \
+    && python3 tools/amd_build/build_amd.py \
+    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
+    && pip install dist/*.whl
+RUN git clone ${PYTORCH_VISION_REPO} vision
+RUN . .venv/bin/activate && cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
+    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && pip install dist/*.whl
+RUN git clone ${FA_REPO}
+RUN . .venv/bin/activate && cd flash-attention \
+    && git checkout ${FA_BRANCH} \
+    && git submodule update --init \
+    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
+RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
+    && cp /app/vision/dist/*.whl /app/install \
+    && cp /app/flash-attention/dist/*.whl /app/install
+
+FROM base AS final
+RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
+    dpkg -i /install/*deb \
+    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
+    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
+RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
+    dpkg -i /install/*deb \
+    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
+    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    . .venv/bin/activate && \
+    pip install /install/*.whl
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    . .venv/bin/activate && \
+    pip install /install/*.whl
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    . .venv/bin/activate && \
+    pip install /install/*.whl
+
+ARG AITER_REPO
+ARG AITER_BRANCH
+RUN git clone --recursive ${AITER_REPO}
+RUN . .venv/bin/activate && cd aiter \
+    && git checkout ${AITER_BRANCH} \
+    && git submodule update --init --recursive \
+    && pip install -r requirements.txt \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
+
+RUN rm -rf /var/lib/apt/lists/*
+
+FROM final AS kernel-builder
+# Build vllm kernels
+FROM kernel-builder AS vllm-builder
+
+COPY server/Makefile-vllm Makefile
+RUN . .venv/bin/activate && pip install setuptools_scm
+
+# Build specific version of vllm
+RUN . .venv/bin/activate && make build-vllm-rocm
+
+# Build Transformers CUDA kernels (gpt-neox and bloom)
+FROM kernel-builder AS custom-kernels-builder
+COPY server/custom_kernels/ .
+RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
+
+# Build exllama kernels
+FROM kernel-builder AS exllama-kernels-builder
+COPY server/exllama_kernels/ .
+RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
+
+# Build exllama v2 kernels
+FROM kernel-builder AS exllamav2-kernels-builder
+COPY server/exllamav2_kernels/ .
+RUN . .venv/bin/activate && python3 setup.py bdist_wheel --dist-dir=dist
+
+FROM kernel-builder AS marlin-kernels
+ENV MARLIN_KERNELS_BRANCH=v0.3.6
+ENV VLLM_TARGET_DEVICE=rocm
+RUN . .venv/bin/activate && git clone https://github.com/danieldk/marlin-kernels.git && \
+    cd marlin-kernels && \
+    git checkout ${MARLIN_KERNELS_BRANCH} && \
+    python3 setup.py bdist_wheel --dist-dir=dist
+
+FROM kernel-builder AS moe-kernels
+ENV MOE_KERNELS_BRANCH=v0.8.2
+ENV VLLM_TARGET_DEVICE=rocm
+RUN . .venv/bin/activate && git clone https://github.com/danieldk/moe-kernels.git && \
+    cd moe-kernels && \
+    git checkout ${MOE_KERNELS_BRANCH} && \
+    python3 setup.py bdist_wheel --dist-dir=dist
+
+FROM final AS base-copy
+
+# Text Generation Inference base env
+ENV HF_HOME=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+ENV VIRTUAL_ENV=/app/.venv/
+ENV PATH="$PATH:/app/.venv/bin/"
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    uv pip install grpcio-tools mypy-protobuf && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir && \
+    make gen-server-raw
+RUN cd server && \
+    pwd && \
+    text-generation-server --help
+
+RUN --mount=type=bind,from=vllm-builder,src=/app/vllm/dist,target=/install \
+    uv pip install /install/*.whl
+RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
+    uv pip install /install/*.whl
+RUN --mount=type=bind,from=custom-kernels-builder,src=/app/dist,target=/install \
+    uv pip install /install/*.whl
+RUN --mount=type=bind,from=exllama-kernels-builder,src=/app/dist,target=/install \
+    uv pip install /install/*.whl
+RUN --mount=type=bind,from=exllamav2-kernels-builder,src=/app/dist,target=/install \
+    uv pip install /install/*.whl
+RUN --mount=type=bind,from=marlin-kernels,src=/app/marlin-kernels/dist,target=/install \
+    uv pip install /install/*.whl
+RUN --mount=type=bind,from=moe-kernels,src=/app/moe-kernels/dist,target=/install \
+    uv pip install /install/*.whl
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+# AWS Sagemaker compatible image (built on base-copy, the stage that holds the TGI server and binaries)
+FROM base-copy AS sagemaker
+
+COPY sagemaker-entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]
+
+# Final image
+FROM base-copy
+
+# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
+ENV HIP_FORCE_DEV_KERNARG=1
+
+# On MI250 and MI300, flash attention performance with Triton FA is slightly better than with CK.
+# However, Triton requires tuning for each prompt length, which is prohibitive.
+ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
+ENV ROCM_USE_CUSTOM_PAGED_ATTN=1
+ENV PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP=0
+ENV VLLM_MOE_PADDING=0
+ENV ATTENTION=paged
+ENV PREFIX_CACHING=0
+ENV PREFILL_CHUNKING=0
+ENV ROCM_USE_SKINNY_GEMM=1
+
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib"
+ENV PYTHONPATH=/app/.venv/lib/python3.11/site-packages
+# CMD ["--json-output"]
--- a/129
+++ b/129
@ -0,0 +1,129 @@
+# Those arguments are required to build the image
+ARG HABANA_VERSION=1.21.0
+ARG PYTORCH_VERSION=2.6.0
+
+# Rust builder
+FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY backends backends
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+ENV PYO3_PYTHON="/root/.local/bin/python" \
+    PYTHON_SYS_EXECUTABLE="/root/.local/bin/python" \
+    PYO3_PYTHON_VERSION="3.10"
+
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && . $HOME/.local/bin/env \
+    && uv python install 3.10 --default --preview \
+    && test -f /root/.local/bin/python || (echo "Python 3.10 not found at /root/.local/bin/python" && exit 1)
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY backends backends
+COPY launcher launcher
+RUN cargo build --profile release-opt
+
+# Text Generation Inference base image
+ARG HABANA_VERSION
+ARG PYTORCH_VERSION
+
+FROM vault.habana.ai/gaudi-docker/${HABANA_VERSION}/ubuntu22.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:latest AS base
+
+ENV ATTENTION=paged
+ENV PREFIX_CACHING=0
+ENV PREFILL_CHUNKING=0
+ENV PT_HPU_LAZY_MODE=1
+ENV PT_HPU_WEIGHT_SHARING=0
+ENV VLLM_EXPONENTIAL_BUCKETING=true
+
+# Text Generation Inference base env
+ENV HF_HOME=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+# Assert that Python 3.10 is installed as the launcher is compiled with Python 3.10
+RUN python3.10 --version || (echo "Python 3.10 is not installed" && exit 1)
+
+# libssl.so.1.1 is not installed on Ubuntu 22.04 by default: install it, then delete the .deb so it does not stay in the layer
+RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
+    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb && rm -f ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
+
+WORKDIR /usr/src
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        libssl-dev \
+        ca-certificates \
+        make \
+        curl \
+        git \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install server
+COPY proto proto
+COPY backends/gaudi/server server
+COPY backends/gaudi/server/Makefile server/Makefile
+ARG HABANA_VERSION
+RUN cd server && \
+    make gen-server && \
+    pip install --no-deps -r requirements.txt && \
+    bash ./dill-0.3.8-patch.sh && \
+    pip install . --no-cache-dir
+RUN pip install git+https://github.com/sywangyi/vllm-hpu-extension.git@bmax_fix
+RUN pip install compressed-tensors==0.9.1
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# AWS Sagemaker compatible image
+FROM base AS sagemaker
+
+COPY sagemaker-entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]
+
+# Final image: Gaudi runtime environment on top of the TGI base stage
+FROM base
+
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV HABANA_VISIBLE_DEVICES=all
+ENV OMPI_MCA_btl_vader_single_copy_mechanism=NONE
+
+COPY backends/gaudi/tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+CMD ["--json-output"]
--- a/223
+++ b/223
@ -0,0 +1,223 @@
+# Global build argument naming the stage (`xpu` or `cpu`) that the `final`
+# stage is built from; defaults to `xpu`.
+ARG PLATFORM=xpu
+
+# Shared Rust toolchain image with cargo-chef; parent of the planner and
+# builder stages.
+FROM lukemathwalker/cargo-chef:latest-rust-1.85.1 AS chef
+WORKDIR /usr/src
+
+# Build-time argument with default `sparse`.
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+# Planner stage: copies the workspace manifests and crate sources, then has
+# cargo-chef write recipe.json, which the builder stage copies out.
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY backends backends
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+# Builder stage: compiles the Rust binaries with the release-opt profile.
+FROM chef AS builder
+
+# Install the Python 3.11 development package.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
+# Fetch the protoc 21.12 release archive and unpack only the compiler binary
+# and the bundled include files into /usr/local, then delete the archive.
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+# Cook the planner's recipe before the sources are copied in, so this layer
+# does not depend on the source tree copied below.
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+# Not referenced by any instruction in this stage; presumably read from the
+# environment during `cargo build` - confirm.
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY backends backends
+COPY launcher launcher
+# --frozen: build strictly from the copied Cargo.lock without network access.
+RUN cargo build --profile release-opt --frozen
+
+
+# Text Generation Inference base image for Intel
+
+FROM intel/oneapi-basekit:2025.0.1-0-devel-ubuntu22.04 AS xpu
+
+USER root
+
+ARG MAMBA_VERSION=23.1.0-1
+ARG PYTHON_VERSION='3.11.10'
+# Automatically set by buildx
+ARG TARGETPLATFORM
+ENV PATH=/opt/conda/bin:$PATH
+
+# TGI seems to require libssl.so.1.1 rather than libssl.so.3. The Ubuntu 22.04 base
+# image does not ship it, so it is installed from a .deb further down. Python
+# ${PYTHON_VERSION} comes from a Mambaforge (conda) install under /opt/conda.
+# Install mamba
+# translating Docker's TARGETPLATFORM into mamba arches
+# A single -o names the output file; the extra -O the command used to carry
+# had no URL left to apply to.
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
+         *)              MAMBA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+# arm64 is not supported by this stage: that branch aborts the build.
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  exit 1 ;; \
+         *)              /opt/conda/bin/conda update -y conda &&  \
+                         /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
+
+# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it,
+# then delete the downloaded package so it is not kept in the layer.
+# NOTE(review): the package is fetched over plain HTTP without a checksum.
+RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
+    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
+    rm -f ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
+
+# Register Intel's GPU and oneAPI signing keys and apt repositories.
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+
+RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" > /tmp/intel-for-pytorch-gpu-dev.list
+
+RUN mv /tmp/intel-for-pytorch-gpu-dev.list /etc/apt/sources.list.d
+
+# apt-get (the script-stable front end) instead of apt; the package lists are
+# dropped afterwards because nothing later in this stage uses apt.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y xpu-smi cmake ninja-build pciutils intel-ocloc libnl-genl-3-200 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Text Generation Inference base env
+ENV HF_HOME=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+
+
+
+WORKDIR /usr/src
+
+RUN pip install torch==2.7.0 torchvision==0.22.0 --index-url https://download.pytorch.org/whl/xpu
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
+RUN cd server && \
+    make gen-server && \
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ENV CCL_ZE_IPC_EXCHANGE=sockets
+ENV TORCH_LLM_ALLREDUCE=1
+ENV CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0
+ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
+
+# XPU wheels of oneCCL bindings and Intel Extension for PyTorch (cp311 builds).
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.7.0%2Bxpu-cp311-cp311-linux_x86_64.whl
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/intel_extension_for_pytorch-2.7.10%2Bxpu-cp311-cp311-linux_x86_64.whl
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# Text Generation Inference base image for Intel-cpu
+FROM ubuntu:22.04 AS cpu
+
+# Build tooling; the package lists are dropped afterwards because nothing
+# later in this stage uses apt.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    curl \
+    ca-certificates \
+    make \
+    g++-12 \
+    gcc-12 \
+    git \
+    wget \
+    cmake \
+    libnuma-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Make gcc-12 / g++-12 the default gcc, g++, cc and c++.
+RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 12
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+RUN update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30
+RUN update-alternatives --set cc /usr/bin/gcc
+
+RUN update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30
+RUN update-alternatives --set c++ /usr/bin/g++
+
+
+# NOTE(review): the xpu stage sets HF_HOME=/data where this stage sets
+# HUGGINGFACE_HUB_CACHE=/data - confirm the difference is intended.
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+ARG MAMBA_VERSION=23.1.0-1
+ARG PYTHON_VERSION='3.11.10'
+# Automatically set by buildx
+ARG TARGETPLATFORM
+ENV PATH=/opt/conda/bin:$PATH
+
+# Python ${PYTHON_VERSION} comes from a Mambaforge (conda) install under /opt/conda.
+# Install mamba
+# translating Docker's TARGETPLATFORM into mamba arches
+# A single -o names the output file; the extra -O the command used to carry
+# had no URL left to apply to.
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
+         *)              MAMBA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+# arm64 is not supported by this stage: that branch aborts the build.
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  exit 1 ;; \
+         *)              /opt/conda/bin/conda update -y conda &&  \
+                         /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
+
+# Explicit -y, matching the other conda calls: the build has no terminal to
+# answer a confirmation prompt.
+RUN conda install -y -c conda-forge gperftools mkl
+
+RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cpu
+RUN pip install triton==3.2.0 py-libnuma
+
+WORKDIR /usr/src
+
+# CPU wheels of Intel Extension for PyTorch and oneCCL bindings (cp311 builds).
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/cpu/intel_extension_for_pytorch-2.7.0%2Bcpu-cp311-cp311-linux_x86_64.whl
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/cpu/oneccl_bind_pt-2.7.0%2Bcpu-cp311-cp311-linux_x86_64.whl
+
+
+# Preload tcmalloc (from the gperftools conda package installed above) and
+# point the oneCCL / MPI / libfabric lookups at the oneccl_bindings_for_pytorch
+# package directory.
+ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so
+ENV CCL_ROOT=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch
+ENV I_MPI_ROOT=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch
+ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
+ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.11/site-packages/oneccl_bindings_for_pytorch/lib
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+ENV UV_SYSTEM_PYTHON=1
+RUN cd server && \
+    make gen-server && \
+    pip install -U pip uv && \
+    uv pip install -e ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+# Final image: built on whichever stage the global PLATFORM argument names.
+FROM ${PLATFORM} AS final
+# Default settings exported to the launcher's environment.
+ENV ATTENTION=flashdecoding-ipex
+ENV PREFIX_CACHING=1
+ENV PREFILL_CHUNKING=1
+ENV CUDA_GRAPHS=0
+# CMD supplies the default argument appended to the launcher command line.
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
--- a/88
+++ b/88
@ -0,0 +1,88 @@
+# Dependency stage: builds and installs llama.cpp, then installs the Rust
+# toolchain and cargo-chef. The planner and builder stages derive from it.
+FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps
+
+# llamacpp_version          - llama.cpp release tag to download
+# llamacpp_cuda             - passed to cmake as GGML_CUDA
+# llamacpp_native           - passed to cmake as GGML_NATIVE
+# llamacpp_cpu_arm_arch     - passed to cmake as GGML_CPU_ARM_ARCH
+# cuda_arch                 - passed to cmake as CMAKE_CUDA_ARCHITECTURES
+ARG llamacpp_version=b4827
+ARG llamacpp_cuda=OFF
+ARG llamacpp_native=ON
+ARG llamacpp_cpu_arm_arch=native
+ARG cuda_arch=75-real;80-real;86-real;89-real;90-real
+
+WORKDIR /opt/src
+
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt update && apt upgrade -y && apt install -y \
+    clang \
+    cmake \
+    curl \
+    git \
+    python3-dev \
+    libssl-dev \
+    pkg-config \
+    tar
+
+# Download the tagged source archive, unpack it into ./llama.cpp, then
+# configure with clang, build, and install under /usr (libraries in /usr/lib).
+# The common, tests, examples and server targets are switched off.
+ADD https://github.com/ggml-org/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
+RUN mkdir -p llama.cpp \
+ && tar -xzf ${llamacpp_version}.tar.gz -C llama.cpp --strip-components=1 \
+ && cd llama.cpp \
+ && cmake -B build \
+    -DCMAKE_INSTALL_PREFIX=/usr \
+    -DCMAKE_INSTALL_LIBDIR=/usr/lib \
+    -DCMAKE_C_COMPILER=clang \
+    -DCMAKE_CXX_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
+    -DGGML_CUDA=${llamacpp_cuda} \
+    -DGGML_NATIVE=${llamacpp_native} \
+    -DGGML_CPU_ARM_ARCH=${llamacpp_cpu_arm_arch} \
+    -DLLAMA_BUILD_COMMON=OFF \
+    -DLLAMA_BUILD_TESTS=OFF \
+    -DLLAMA_BUILD_EXAMPLES=OFF \
+    -DLLAMA_BUILD_SERVER=OFF \
+ && cmake --build build --parallel --config Release \
+ && cmake --install build
+
+# Rust toolchain via rustup (minimal profile, default toolchain 1.85.1), with
+# cargo put on PATH, followed by cargo-chef.
+WORKDIR /app
+COPY rust-toolchain.toml rust-toolchain.toml
+RUN curl -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain 1.85.1 --profile minimal -y
+ENV PATH="/root/.cargo/bin:$PATH"
+RUN cargo install cargo-chef --locked
+
+# Planner stage: copies the whole build context and has cargo-chef write
+# recipe.json, which the builder stage copies out.
+FROM deps AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+# Builder stage: cooks the dependency recipe for the llama.cpp router package
+# first, then copies the sources and builds that package.
+FROM deps AS builder
+COPY --from=planner /app/recipe.json recipe.json
+RUN cargo chef cook \
+    --recipe-path recipe.json \
+    --profile release \
+    --package text-generation-router-llamacpp
+COPY . .
+# --frozen: build strictly from the copied Cargo.lock without network access.
+RUN cargo build \
+    --profile release \
+    --package text-generation-router-llamacpp --frozen
+
+# Runtime image: CUDA runtime base, a Python virtualenv for the GGUF
+# conversion tooling, and the llama.cpp libraries and router binary taken
+# from the builder stage.
+FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04
+WORKDIR /app
+
+ENV DEBIAN_FRONTEND=noninteractive
+# The package lists are deleted in the same layer: nothing later in this stage
+# uses apt, so keeping them would only enlarge the shipped image.
+RUN apt update && apt upgrade -y && apt install -y \
+    python3-venv \
+    python3-pip && \
+    rm -rf /var/lib/apt/lists/*
+
+# Put the virtualenv first on PATH so pip3/python resolve inside /venv.
+RUN python3 -m venv /venv
+ENV PATH="/venv/bin:$PATH"
+
+COPY backends/llamacpp/requirements.txt requirements.txt
+COPY --from=builder /opt/src/llama.cpp/gguf-py gguf-py
+COPY --from=builder /opt/src/llama.cpp/convert_hf_to_gguf.py /bin/
+
+# gguf-py is installed in editable mode, so /app/gguf-py must stay in the image.
+RUN pip3 install --no-cache-dir \
+    -r requirements.txt \
+    -e gguf-py
+
+COPY --from=builder /usr/lib/libllama.so /usr/lib/
+COPY --from=builder /usr/lib/libggml*.so /usr/lib/
+COPY --from=builder /app/target/release/text-generation-router-llamacpp /usr/bin/
+
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+ENTRYPOINT ["text-generation-router-llamacpp"]
--- a/158
+++ b/158
@ -0,0 +1,158 @@
+# Global build arguments; stages re-declare the ones they use.
+# cuda_arch_list        - exported as CUDA_ARCH_LIST in the tgi-builder stage
+# cuda_base             - version tag of the nvidia/cuda base images
+# build_type            - cargo profile used by the tgi-builder stage
+# ompi_version          - OpenMPI version fetched by the mpi-builder stage
+# sccache_gha_enabled, actions_results_url, actions_runtime_token
+#                       - forwarded to the sccache-related ENV in tgi-builder
+ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real;100-real;120-real"
+ARG cuda_base=12.8.0
+ARG build_type=release
+ARG ompi_version=4.1.7
+ARG sccache_gha_enabled=off
+ARG actions_results_url=""
+ARG actions_runtime_token=""
+
+# CUDA dependent dependencies resolver stage
+# Common parent of the mpi-builder, trt-builder and tgi-builder stages.
+FROM nvidia/cuda:${cuda_base}-cudnn-devel-ubuntu24.04 AS cuda-builder
+
+# --no-install-recommends sits at the end of the package list but applies to
+# the whole apt-get install command.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    build-essential \
+    cmake \
+    curl \
+    gcc-14  \
+    g++-14 \
+    git \
+    git-lfs \
+    lld \
+    libssl-dev \
+    libucx-dev \
+    libasan8 \
+    libubsan1 \
+    ninja-build \
+    pkg-config \
+    pipx \
+    python3 \
+    python3-dev \
+    python3-setuptools \
+    tar \
+    wget --no-install-recommends && \
+    pipx ensurepath
+
+# Install prefixes inherited by the derived stages. TGI_INSTALL_PREFIX is used
+# by tgi-builder; TENSORRT_INSTALL_PREFIX is presumably read by
+# install_tensorrt.sh - confirm.
+ENV TGI_INSTALL_PREFIX=/usr/local/tgi
+ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
+
+# Install OpenMPI
+# Builds OpenMPI from source with CUDA and Slurm support and installs it under
+# /usr/local/mpi, which later stages copy out of this stage.
+FROM cuda-builder AS mpi-builder
+WORKDIR /opt/src/mpi
+
+ARG ompi_version
+ENV OMPI_VERSION=${ompi_version}
+ENV OMPI_TARBALL_FILENAME=openmpi-${OMPI_VERSION}.tar.bz2
+# The URL path hard-codes the v4.1 series and the checksum pins one specific
+# tarball, so overriding ompi_version means updating both of them as well.
+ADD --checksum=sha256:54a33cb7ad81ff0976f15a6cc8003c3922f0f3d8ceed14e1813ef3603f22cd34 \
+    https://download.open-mpi.org/release/open-mpi/v4.1/${OMPI_TARBALL_FILENAME} .
+
+# Final step deletes the downloaded tarball. It used to name
+# `${OMPI_TARBALL_FILENAME}/..`, a `..` path through a regular file, which rm
+# cannot resolve and would skip anyway, so nothing was removed.
+RUN tar --strip-components=1 -xf ${OMPI_TARBALL_FILENAME} &&\
+    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \
+    make -j all && \
+    make install && \
+    rm -f ${OMPI_TARBALL_FILENAME}
+
+# Install TensorRT
+# Runs the repository's install script; later stages copy /usr/local/tensorrt
+# out of this stage.
+FROM cuda-builder AS trt-builder
+COPY backends/trtllm/scripts/install_tensorrt.sh /opt/install_tensorrt.sh
+RUN chmod +x /opt/install_tensorrt.sh && \
+    /opt/install_tensorrt.sh
+
+# Build Backend
+# Compiles the TensorRT-LLM backend binary against the OpenMPI and TensorRT
+# trees copied in from the mpi-builder and trt-builder stages.
+FROM cuda-builder AS tgi-builder
+WORKDIR /usr/src/text-generation-inference
+
+# Scoped global args reuse
+ARG cuda_arch_list
+ARG build_type
+ARG sccache_gha_enabled
+ARG actions_results_url
+ARG actions_runtime_token
+
+# Install Rust
+ENV PATH="/root/.cargo/bin:$PATH"
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.85.1 --profile minimal -y && \
+    chmod -R a+w /root/.rustup && \
+    chmod -R a+w /root/.cargo && \
+    cargo install sccache --version ">=0.10.0" --locked
+
+ENV LD_LIBRARY_PATH="/usr/local/mpi/lib:$LD_LIBRARY_PATH"
+ENV PKG_CONFIG_PATH="/usr/local/mpi/lib/pkgconfig"
+ENV CMAKE_PREFIX_PATH="/usr/local/mpi:/usr/local/tensorrt"
+
+ENV USE_LLD_LINKER=ON
+ENV CUDA_ARCH_LIST=${cuda_arch_list}
+
+# sccache-specific settings, until a more generic mechanism is found.
+# NOTE(review): ACTIONS_RUNTIME_TOKEN is stored as ENV and therefore ends up in
+# this stage's image metadata; consider a build secret mount instead.
+ENV SCCACHE_GHA_ENABLED=${sccache_gha_enabled}
+ENV ACTIONS_RESULTS_URL=${actions_results_url}
+ENV ACTIONS_RUNTIME_TOKEN=${actions_runtime_token}
+
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY router router
+COPY backends backends
+COPY benchmark benchmark
+COPY launcher launcher
+COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
+COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
+
+ENV RUSTC_WRAPPER=sccache
+# TGI_INSTALL_PREFIX is inherited from the cuda-builder stage.
+ENV CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX
+# Each export is its own &&-chained command. The first two lines used to end
+# in a bare line continuation, which folded three exports into one command
+# with the word `export` passed along as extra arguments.
+RUN export CC=gcc-14 && \
+    export CXX=g++-14 && \
+    export CMAKE_C_COMPILER_LAUNCHER=sccache && \
+    export CMAKE_CXX_COMPILER_LAUNCHER=sccache && \
+    export CMAKE_CUDA_COMPILER_LAUNCHER=sccache && \
+    mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
+    cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \
+    sccache --show-stats
+
+# Runtime image: CUDA runtime base plus the MPI, TensorRT and TGI trees built
+# in the earlier stages.
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS runtime
+# The cleanup names the apt list directory explicitly. The earlier
+# `/var/lib/{apt,dpkg,cache,log}/` form relied on brace expansion, which the
+# default /bin/sh used by RUN does not perform, so it removed nothing.
+RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
+    rm -rf /var/lib/apt/lists/* && \
+    pipx ensurepath && \
+    pipx install --include-deps transformers tokenizers
+
+WORKDIR /usr/local/tgi/bin
+
+# Expose the pipx-managed transformers venv on PATH and the copied library
+# directories to the dynamic loader.
+ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
+ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+ENV TOKENIZERS_PARALLELISM=false
+ENV OMPI_MCA_plm_rsh_agent=""
+
+COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
+COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
+COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
+# NOTE(review): the source path assumes the `release` profile; it does not
+# follow the build_type argument used by the tgi-builder stage.
+COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
+
+# This is used only for the CI/CD
+# Same layout as the runtime stage, plus the ASan/UBSan runtime libraries, and
+# the binary is taken from target/debug.
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS ci-runtime
+# The cleanup names the apt list directory explicitly. The earlier
+# `/var/lib/{apt,dpkg,cache,log}/` form relied on brace expansion, which the
+# default /bin/sh used by RUN does not perform, so it removed nothing.
+RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
+    rm -rf /var/lib/apt/lists/* && \
+    pipx ensurepath && \
+    pipx install --include-deps transformers tokenizers
+
+WORKDIR /usr/local/tgi/bin
+
+ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
+ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+ENV TOKENIZERS_PARALLELISM=false
+ENV OMPI_MCA_plm_rsh_agent=""
+
+COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
+COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
+COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
+
+# Basically we copy from target/debug instead of target/release
+COPY --from=tgi-builder /usr/src/text-generation-inference/target/debug/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
+
+# This is the final image
+FROM runtime
+
+LABEL co.huggingface.vendor="Hugging Face Inc."
+LABEL org.opencontainers.image.authors="hardware@hf.co"
+# Standard OCI annotation key for the image title.
+LABEL org.opencontainers.image.title="Text-Generation-Inference TensorRT-LLM Backend"
+# Earlier non-standard key, kept for anything that already reads it.
+LABEL org.opencontainers.title="Text-Generation-Inference TensorRT-LLM Backend"
+
+# The entrypoint is relative to the WORKDIR set in the runtime stage
+# (/usr/local/tgi/bin); CMD supplies the default executor-worker argument.
+ENTRYPOINT ["./text-generation-launcher"]
+CMD ["--executor-worker", "/usr/local/tgi/bin/executorWorker"]
--- a/318
+++ b/318
@ -1,181 +1,201 @@
-Hugging Face Optimized Inference License 1.0 (HFOILv1.0)
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/

+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

-This License Agreement governs the use of the Software and its Modifications. It is a
-binding agreement between the Licensor and You.
+   1. Definitions.

-This License Agreement shall be referred to as Hugging Face Optimized Inference License
-1.0 or HFOILv1.0. We may publish revised versions of this License Agreement from time to
-time. Each version will be given a distinguished number.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.

-By downloading, accessing, modifying, distributing or otherwise using the Software, You
-consent to all of the terms and conditions below. So, if You do not agree with those,
-please do not download, access, modify, distribute, or use the Software.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.

+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.

-1. PERMISSIONS
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.

-You may use, modify and distribute the Software pursuant to the following terms and
-conditions:
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.

-Copyright License. Subject to the terms and conditions of this License Agreement and where
-and as applicable, each Contributor hereby grants You a perpetual, worldwide,
-non-exclusive, royalty-free, copyright license to reproduce, prepare, publicly display,
-publicly perform, sublicense under the terms herein, and distribute the Software and
-Modifications of the Software.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.

-Patent License. Subject to the terms and conditions of this License Agreement and where
-and as applicable, each Contributor hereby grants You a perpetual, worldwide,
-non-exclusive, royalty-free patent license to make, have made, Use, offer to sell, sell,
-import, and otherwise transfer the Software, where such license applies only to those
-patent claims licensable by such Contributor that are necessarily infringed by their
-Contribution(s) alone or by combination of their Contribution(s) with the Software to
-which such Contribution(s) was submitted. If You institute patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Software
-or a Contribution incorporated within the Software constitutes direct or contributory
-patent infringement, then any rights granted to You under this License Agreement for the
-Software shall terminate as of the date such litigation is filed.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).

-No other rights. All rights not expressly granted herein are retained.
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.

+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."

-2. RESTRICTIONS
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.

-You may not distribute the Software as a hosted or managed, and paid service, where the
-service grants users access to any substantial set of the features or functionality of the
-Software. If you wish to do so, You will need to be granted additional rights from the
-Licensor which will be subject to a separate mutually agreed agreement.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.

-You may not sublicense the Software under any other terms than those listed in this
-License.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.

+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:

-3. OBLIGATIONS
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and

-When You modify the Software, You agree to: - attach a notice stating the Modifications of
-the Software You made; and - attach a notice stating that the Modifications of the
-Software are released under this License Agreement.
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and

-When You distribute the Software or Modifications of the Software, You agree to: - give
-any recipients of the Software a copy of this License Agreement; - retain all Explanatory
-Documentation; and if sharing the Modifications of the Software, add Explanatory
-Documentation documenting the changes made to create the Modifications of the Software; -
-retain all copyright, patent, trademark and attribution notices.
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and

+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.

-4. MISCELLANEOUS
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.

-Termination. Licensor reserves the right to restrict Use of the Software in violation of
-this License Agreement, upon which Your licenses will automatically terminate.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.

-Contributions. Unless You explicitly state otherwise, any Contribution intentionally
-submitted for inclusion in the Software by You to the Licensor shall be under the terms
-and conditions of this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify the terms of any
-separate license agreement you may have executed with Licensor regarding such
-Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.

-Trademarks and related. Nothing in this License Agreement permits You (i) to make Use of
-Licensors’ trademarks, trade names, or logos, (ii) otherwise suggest endorsement by
-Licensor, or (iii) misrepresent the relationship between the parties; and any rights not
-expressly granted herein are reserved by the Licensors.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.

-Output You generate. Licensor claims no rights in the Output. You agree not to contravene
-any provision as stated in the License Agreement with your Use of the Output.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.

-Disclaimer of Warranty. Except as expressly provided otherwise herein, and to the fullest
-extent permitted by law, Licensor provides the Software (and each Contributor provides its
-Contributions) AS IS, and Licensor disclaims all warranties or guarantees of any kind,
-express or implied, whether arising under any law or from any usage in trade, or otherwise
-including but not limited to the implied warranties of merchantability, non-infringement,
-quiet enjoyment, fitness for a particular purpose, or otherwise. You are solely
-responsible for determining the appropriateness of the Software and Modifications of the
-Software for your purposes (including your use or distribution of the Software and
-Modifications of the Software), and assume any risks associated with Your exercise of
-permissions under this License Agreement.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.

-Limitation of Liability. In no event and under no legal theory, whether in tort (including
-negligence), contract, or otherwise, unless required by applicable law (such as deliberate
-and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to
-You for damages, including any direct, indirect, special, incidental, or consequential
-damages of any character arising as a result of this License Agreement or out of the Use
-or inability to Use the Software (including but not limited to damages for loss of
-goodwill, work stoppage, computer failure or malfunction, model failure or malfunction, or
-any and all other commercial damages or losses), even if such Contributor has been advised
-of the possibility of such damages.
+   END OF TERMS AND CONDITIONS

-Accepting Warranty or Additional Liability. While sharing the Software or Modifications of
-the Software thereof, You may choose to offer and charge a fee for, acceptance of support,
-warranty, indemnity, or other liability obligations and/or rights consistent with this
-License Agreement. However, in accepting such obligations, You may act only on Your own
-behalf and on Your sole responsibility, not on behalf of Licensor or any other
-Contributor, and you hereby agree to indemnify, defend, and hold Licensor and each other
-Contributor (and their successors or assigns) harmless for any liability incurred by, or
-claims asserted against, such Licensor or Contributor (and their successors or assigns) by
-reason of your accepting any such warranty or additional liability.
+   APPENDIX: How to apply the Apache License to your work.

-Severability. This License Agreement is a license of copyright and patent rights and an
-agreement in contract between You and the Licensor. If any provision of this License
-Agreement is held to be invalid, illegal or unenforceable, the remaining provisions shall
-be unaffected thereby and remain valid as if such provision had not been set forth herein.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.

+   Copyright 2022 Hugging Face

-5. DEFINITIONS
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at

-“Contribution” refers to any work of authorship, including the original version of the
-Software and any Modifications of the Software that is intentionally submitted to Licensor
-for inclusion in the Software by the copyright owner or by an individual or entity
-authorized to submit on behalf of the copyright owner. For the purposes of this
-definition, “submitted” means any form of electronic, verbal, or written communication
-sent to the Licensor or its representatives, including but not limited to communication on
-electronic mailing lists, source code control systems, and issue tracking systems that are
-managed by, or on behalf of, the Licensor for the purpose of discussing and improving the
-Software, but excluding communication that is conspicuously marked or otherwise designated
-in writing by the copyright owner as “Not a Contribution.”
+       http://www.apache.org/licenses/LICENSE-2.0

-“Contributor” refers to Licensor and any individual or entity on behalf of whom a
-Contribution has been received by Licensor and subsequently incorporated within the
-Software.
-
-“Data” refers to a collection of information extracted from the dataset used with the
-Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not
-licensed under this License Agreement.
-
-“Explanatory Documentation” refers to any documentation or related information including
-but not limited to model cards or data cards dedicated to inform the public about the
-characteristics of the Software. Explanatory documentation is not licensed under this
-License.
-
-"License Agreement" refers to these terms and conditions.
-
-“Licensor” refers to the rights owners or entity authorized by the rights owners that are
-granting the terms and conditions of this License Agreement.
-
-“Model” refers to machine-learning based assemblies (including checkpoints), consisting of
-learnt weights and parameters (including optimizer states), corresponding to a model
-architecture as embodied in Software source code. Source code is not licensed under this
-License Agreement.
-
-“Modifications of the Software” refers to all changes to the Software, including without
-limitation derivative works of the Software.
-
-“Output” refers to the results of operating the Software.
-
-“Share” refers to any transmission, reproduction, publication or other sharing of the
-Software or Modifications of the Software to a third party, including providing the
-Softwaire as a hosted service made available by electronic or other remote means,
-including - but not limited to - API-based or web access.
-
-“Software” refers to the software and Model (or parts of either) that Licensor makes
-available under this License Agreement.
-
-“Third Parties” refers to individuals or legal entities that are not under common control
-with Licensor or You.
-
-“Use” refers to anything You or your representatives do with the Software, including but
-not limited to generating any Output, fine tuning, updating, running, training, evaluating
-and/or reparametrizing the Model.
-
-"You" (or "Your")  refers to an individual or Legal Entity exercising permissions granted
-by this License Agreement and/or making Use of the Software for whichever purpose and in
-any field of Use.
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/26
+++ b/26
@ -1,23 +1,22 @@
 install-server:
 	cd server && make install

-install-custom-kernels:
-	if [ "$$BUILD_EXTENSIONS" = "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need to set the BUILD_EXTENSIONS environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi
-
-install-integration-tests:
-	cd integration-tests && pip install -r requirements.txt
-	cd clients/python && pip install .
+install-server-cpu:
+	cd server && make install-server

 install-router:
-	cd router && cargo install --path .
+	cargo install --path backends/v3/

 install-launcher:
-	cd launcher && cargo install --path .
+	cargo install --path launcher/

 install-benchmark:
-	cd benchmark && cargo install --path .
+	cargo install --path benchmark/

-install: install-server install-router install-launcher install-custom-kernels
+install: install-server install-router install-launcher
+
+
+install-cpu: install-server-cpu install-router install-launcher

 server-dev:
 	cd server && make run-dev
@ -28,6 +27,10 @@ router-dev:
 rust-tests: install-router install-launcher
 	cargo test

+install-integration-tests:
+	cd integration-tests && pip install -r requirements.txt
+	cd clients/python && pip install .
+
 integration-tests: install-integration-tests
 	pytest -s -vv -m "not private" integration-tests

@ -50,3 +53,6 @@ run-falcon-7b-instruct-quantize:

 clean:
 	rm -rf target aml
+
+preview_doc:
+	doc-builder preview text-generation-inference docs/source --not_python_module
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 <div align="center">
-  
+
 <a href="https://www.youtube.com/watch?v=jlMAX2Oaht0">
-  <img width=560 width=315 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
+  <img width=560 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
 </a>

 # Text Generation Inference
@ -13,26 +13,28 @@
  <img alt="Swagger API documentation" src="https://img.shields.io/badge/API-Swagger-informational">
 </a>

-A Rust, Python and gRPC server for text generation inference. Used in production at [HuggingFace](https://huggingface.co)
-to power Hugging Chat, the Inference API and Inference Endpoint.
+A Rust, Python and gRPC server for text generation inference. Used in production at [Hugging Face](https://huggingface.co)
+to power Hugging Chat, the Inference API and Inference Endpoints.

 </div>

 ## Table of contents

- [Get Started](#get-started)
-  - [API Documentation](#api-documentation)
-  - [Using a private or gated model](#using-a-private-or-gated-model)
-  - [A note on Shared Memory](#a-note-on-shared-memory-shm)
-  - [Distributed Tracing](#distributed-tracing)
-  - [Local Install](#local-install)
-  - [CUDA Kernels](#cuda-kernels)
- [Optimized architectures](#optimized-architectures)
- [Run Falcon](#run-falcon)
-  - [Run](#run)
-  - [Quantization](#quantization)
- [Develop](#develop)
- [Testing](#testing)
+  - [Get Started](#get-started)
+    - [Docker](#docker)
+    - [API documentation](#api-documentation)
+    - [Using a private or gated model](#using-a-private-or-gated-model)
+    - [A note on Shared Memory (shm)](#a-note-on-shared-memory-shm)
+    - [Distributed Tracing](#distributed-tracing)
+    - [Architecture](#architecture)
+    - [Local install](#local-install)
+    - [Local install (Nix)](#local-install-nix)
+  - [Optimized architectures](#optimized-architectures)
+  - [Run locally](#run-locally)
+    - [Run](#run)
+    - [Quantization](#quantization)
+  - [Develop](#develop)
+  - [Testing](#testing)

 Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and [more](https://huggingface.co/docs/text-generation-inference/supported_models). TGI implements many features, such as:

@ -41,16 +43,34 @@ Text Generation Inference (TGI) is a toolkit for deploying and serving Large Lan
 - Tensor Parallelism for faster inference on multiple GPUs
 - Token streaming using Server-Sent Events (SSE)
 - Continuous batching of incoming requests for increased total throughput
+- [Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) compatible with the OpenAI Chat Completion API
 - Optimized transformers code for inference using [Flash Attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323)
+- Quantization with:
+  - [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
+  - [GPT-Q](https://arxiv.org/abs/2210.17323)
+  - [EETQ](https://github.com/NetEase-FuXi/EETQ)
+  - [AWQ](https://github.com/casper-hansen/AutoAWQ)
+  - [Marlin](https://github.com/IST-DASLab/marlin)
+  - [fp8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/)
 - [Safetensors](https://github.com/huggingface/safetensors) weight loading
 - Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
 - Logits warper (temperature scaling, top-p, top-k, repetition penalty, more details see [transformers.LogitsProcessor](https://huggingface.co/docs/transformers/internal/generation_utils#transformers.LogitsProcessor))
 - Stop sequences
 - Log probabilities
+- [Speculation](https://huggingface.co/docs/text-generation-inference/conceptual/speculation) for ~2x lower latency
+- [Guidance/JSON](https://huggingface.co/docs/text-generation-inference/conceptual/guidance). Specify output format to speed up inference and make sure the output is valid according to some specs.
 - Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output
 - Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance

+### Hardware support
+
+- [Nvidia](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference)
+- [AMD](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference) (-rocm)
+- [Inferentia](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference)
+- [Intel GPU](https://github.com/huggingface/text-generation-inference/pull/1475)
+- [Gaudi](https://github.com/huggingface/tgi-gaudi)
+- [Google TPU](https://huggingface.co/docs/optimum-tpu/howto/serving)
+

 ## Get Started

@ -59,22 +79,49 @@ Text Generation Inference (TGI) is a toolkit for deploying and serving Large Lan
 For a detailed starting guide, please see the [Quick Tour](https://huggingface.co/docs/text-generation-inference/quicktour). The easiest way of getting started is using the official Docker container:

 ```shell
-model=tiiuae/falcon-7b-instruct
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+model=HuggingFaceH4/zephyr-7b-beta
+# share a volume with the Docker container to avoid downloading weights every run
+volume=$PWD/data

-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.1 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model
 ```

 And then you can make requests like

 ```bash
-curl 127.0.0.1:8080/generate \
+curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
 ```

-**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
+You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain OpenAI Chat Completion API compatible responses.
+
+```bash
+curl localhost:8080/v1/chat/completions \
+    -X POST \
+    -d '{
+  "model": "tgi",
+  "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "What is deep learning?"
+    }
+  ],
+  "stream": true,
+  "max_tokens": 20
+}' \
+    -H 'Content-Type: application/json'
+```
+
+**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
+
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/installation_amd#using-tgi-with-amd-gpus). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:3.3.4-rocm --model-id $model` instead of the command above.

 To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
@ -88,29 +135,30 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat

 ### Using a private or gated model

-You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
+You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
 `text-generation-inference`. This allows you to gain access to protected resources.

 For example, if you want to serve the gated Llama V2 model variants:

 1. Go to https://huggingface.co/settings/tokens
-2. Copy your cli READ token
-3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
+2. Copy your CLI READ token
+3. Export `HF_TOKEN=<your CLI READ token>`

 or with Docker:

 ```shell
-model=meta-llama/Llama-2-7b-chat-hf
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>

-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.1 --model-id $model
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:3.3.4 --model-id $model
 ```

 ### A note on Shared Memory (shm)

 [`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
-`PyTorch` to do distributed training/inference. `text-generation-inference` make
+`PyTorch` to do distributed training/inference. `text-generation-inference` makes
 use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.

 In order to share data between the different devices of a `NCCL` group, `NCCL` might fall back to using the host memory if
@ -136,24 +184,39 @@ this will impact performance.
 ### Distributed Tracing

 `text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
-by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
+by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
+overridden with the `--otlp-service-name` argument.

 ### Architecture

-![image](https://github.com/huggingface/text-generation-inference/assets/3841370/38ba1531-ea0d-4851-b31a-a6d4ddc944b0)
+![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)
+
+Detailed blogpost by Adyen on TGI inner workings: [LLM inference at scale with TGI (Martin Iglesias Goyanes - Adyen, 2024)](https://www.adyen.com/knowledge-hub/llm-inference-at-scale-with-tgi)

 ### Local install

 You can also opt to install `text-generation-inference` locally.

-First [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
-Python 3.9, e.g. using `conda`:
+First clone the repository and change directory into it:
+
+```shell
+git clone https://github.com/huggingface/text-generation-inference
+cd text-generation-inference
+```
+
+Then [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
+Python 3.9, e.g. using `conda` or `python venv`:

 ```shell
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

-conda create -n text-generation-inference python=3.9
+#using conda
+conda create -n text-generation-inference python=3.11
 conda activate text-generation-inference
+
+#using python venv
+python3 -m venv .venv
+source .venv/bin/activate
 ```

 You may also need to install Protoc.
@ -178,7 +241,7 @@ Then run:

 ```shell
 BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork with CUDA kernels
-make run-falcon-7b-instruct
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
 ```

 **Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
@ -187,16 +250,48 @@ make run-falcon-7b-instruct
 sudo apt-get install libssl-dev gcc -y
 ```

-### CUDA Kernels
+### Local install (Nix)

-The custom CUDA kernels are only tested on NVIDIA A100s. If you have any installation or runtime issues, you can remove
-the kernels by using the `DISABLE_CUSTOM_KERNELS=True` environment variable.
+Another option is to install `text-generation-inference` locally using [Nix](https://nixos.org). Currently,
+we only support Nix on x86_64 Linux with CUDA GPUs. When using Nix, all dependencies can
+be pulled from a binary cache, removing the need to build them locally.

-Be aware that the official Docker image has them enabled by default.
+First follow the instructions to [install Cachix and enable the Hugging Face cache](https://app.cachix.org/cache/huggingface).
+Setting up the cache is important, otherwise Nix will build many of the dependencies
+locally, which can take hours.
+
+After that you can run TGI with `nix run`:
+
+```shell
+cd text-generation-inference
+nix run --extra-experimental-features nix-command --extra-experimental-features flakes . -- --model-id meta-llama/Llama-3.1-8B-Instruct
+```
+
+**Note:** when you are using Nix on a non-NixOS system, you have to [make some symlinks](https://danieldk.eu/Nix-CUDA-on-non-NixOS-systems#make-runopengl-driverlib-and-symlink-the-driver-library)
+to make the CUDA driver libraries visible to Nix packages.
+
+For TGI development, you can use the `impure` dev shell:
+
+```shell
+nix develop .#impure
+
+# Only needed the first time the devshell is started or after updating the protobuf.
+(
+cd server
+mkdir text_generation_server/pb || true
+python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
+       --grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
+find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
+touch text_generation_server/pb/__init__.py
+)
+```
+
+All development dependencies (cargo, Python, Torch, etc.) are available in this
+dev shell.

 ## Optimized architectures

-TGI works out of the box to serve optimized models in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).
+TGI works out of the box with optimized implementations of all modern models. The supported models can be found in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).

 Other architectures are supported on a best-effort basis using:

@ -208,24 +303,26 @@ or



-## Run Falcon
+## Run locally

 ### Run

 ```shell
-make run-falcon-7b-instruct
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
 ```

 ### Quantization

-You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:
+You can also run pre-quantized weights (AWQ, GPTQ, Marlin) or quantize weights on the fly with bitsandbytes, EETQ, or fp8 to reduce the VRAM requirement:

 ```shell
-make run-falcon-7b-instruct-quantize
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize bitsandbytes
 ```

 4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.

+Read more about quantization in the [Quantization documentation](https://huggingface.co/docs/text-generation-inference/en/conceptual/quantization).
+
 ## Develop

 ```shell
--- a/assets/architecture.jpg
+++ b/assets/architecture.jpg
--- a/assets/architecture.png
+++ b/assets/architecture.png
--- a/assets/tgi_grafana.json
+++ b/assets/tgi_grafana.json
--- a/assets/v3_benchmarks.png
+++ b/assets/v3_benchmarks.png
--- a/backends/client/Cargo.toml
+++ b/backends/client/Cargo.toml
@ -6,6 +6,8 @@ authors.workspace = true
 homepage.workspace = true

 [dependencies]
+async-trait = "^0.1"
+base64 = { workspace = true }
 futures = "^0.3"
 grpc-metadata = { path = "../grpc-metadata" }
 prost = "^0.12"
--- a/backends/client/build.rs
+++ b/backends/client/build.rs
@ -0,0 +1,35 @@
+use std::fs;
+
+/// Generate the tonic gRPC client code for one protocol version.
+///
+/// * `out_dir` - directory that receives the generated sources and the `mod.rs`
+///   include file.
+/// * `proto` - the `.proto` file to compile; imports are resolved against
+///   `../../proto`.
+///
+/// # Panics
+///
+/// Panics when protobuf compilation fails. The message always carries the
+/// underlying error, so the hints about a missing or outdated `protoc` never hide
+/// the real diagnostic.
+fn compile_client(out_dir: &str, proto: &str) {
+    // Errors from directory creation are deliberately ignored here, exactly as
+    // the original build script did.
+    fs::create_dir_all(out_dir).unwrap_or(());
+
+    let mut config = prost_build::Config::new();
+    config.protoc_arg("--experimental_allow_proto3_optional");
+
+    tonic_build::configure()
+        .build_client(true)
+        .build_server(false)
+        .out_dir(out_dir)
+        .include_file("mod.rs")
+        .compile_with_config(config, &[proto], &["../../proto"])
+        .unwrap_or_else(|e| match e.kind() {
+            std::io::ErrorKind::NotFound => {
+                panic!("`protoc` not found, install libprotoc: {e}")
+            }
+            std::io::ErrorKind::Other => {
+                panic!("`protoc` version unsupported, upgrade protoc: https://github.com/protocolbuffers/protobuf/releases ({e})")
+            }
+            _ => panic!("protobuf compilation failed: {e}"),
+        });
+}
+
+/// Build-script entry point: regenerates the v2 and v3 gRPC clients whenever
+/// anything under `../../proto/` changes.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    println!("cargo:rerun-if-changed=../../proto/");
+
+    // Both protocol versions go through the same pipeline so they share the same
+    // error reporting.
+    compile_client("src/v2/pb", "../../proto/generate.proto");
+    compile_client("src/v3/pb", "../../proto/v3/generate.proto");
+
+    Ok(())
+}
--- a/backends/client/src/lib.rs
+++ b/backends/client/src/lib.rs
@ -0,0 +1,91 @@
+//! Text Generation gRPC client library
+
+pub mod v2;
+pub mod v3;
+
+use async_trait::async_trait;
+use base64::{engine::general_purpose::STANDARD, Engine};
+use thiserror::Error;
+use tonic::transport;
+use tonic::Status;
+
+pub use v3::{Chunk, Image, Input, InputChunk};
+
+/// Health probes for a generate server, from a cheap device check to a full
+/// forward pass. Both probes return `Ok(())` on success and a `ClientError`
+/// otherwise.
+#[async_trait]
+pub trait Health {
+    /// Check if a generate server is healthy by asking it to allocate a tensor on device
+    async fn device_health(&self) -> Result<()>;
+
+    /// Check if a generate server is healthy by doing a forward pass.
+    /// EXPENSIVE
+    async fn model_health(&self) -> Result<()>;
+}
+
+/// Plain data describing a shard; every field is public and there is no behavior
+/// attached to it in this file.
+// NOTE(review): the producer of this struct is not visible here. Presumably it is
+// filled from the shard's info response — confirm against the v2/v3 clients.
+#[derive(Debug)]
+pub struct ShardInfo {
+    pub requires_padding: bool,
+    pub dtype: String,
+    pub device_type: String,
+    // `None` presumably means no window is configured — TODO confirm.
+    pub window_size: Option<u32>,
+    pub speculate: u32,
+}
+
+/// Errors surfaced by the gRPC client; this is the error type of the crate-wide
+/// `Result` alias.
+#[derive(Error, Debug, Clone)]
+pub enum ClientError {
+    /// Transport-level failure; built from `tonic::transport::Error` and carries
+    /// its string form.
+    #[error("Could not connect to Text Generation server: {0}")]
+    Connection(String),
+    /// Error status returned by the server; built from `tonic::Status` and
+    /// carries the status message.
+    #[error("Server error: {0}")]
+    Generation(String),
+    /// No results were available from the shards.
+    #[error("Sharded results are empty")]
+    EmptyResults,
+}
+
+impl From<Status> for ClientError {
+    /// Wrap a gRPC status message in `ClientError::Generation`, logging it at
+    /// error level on the way.
+    fn from(status: Status) -> Self {
+        let message = status.message().to_string();
+        let error = ClientError::Generation(message);
+        tracing::error!("{error}");
+        error
+    }
+}
+
+impl From<transport::Error> for ClientError {
+    /// Wrap a transport error's string form in `ClientError::Connection`,
+    /// logging it at error level on the way.
+    fn from(transport_error: transport::Error) -> Self {
+        let description = transport_error.to_string();
+        let error = ClientError::Connection(description);
+        tracing::error!("{error}");
+        error
+    }
+}
+
+// Convenience conversion: wrap a `Chunk` into the `InputChunk` message.
+impl From<Chunk> for InputChunk {
+    fn from(inner: Chunk) -> Self {
+        Self { chunk: Some(inner) }
+    }
+}
+
+/// Convert input chunks to a stringly-typed input, for backwards
+/// compatibility with backends that haven't implemented chunked inputs.
+pub trait ChunksToString {
+    /// Render all chunks as a single `String`.
+    fn chunks_to_string(&self) -> String;
+}
+
+impl ChunksToString for Vec<InputChunk> {
+    /// Concatenate every chunk in order: text chunks are appended verbatim and
+    /// image chunks are rendered as a Markdown image whose target is a base64
+    /// `data:` URI built from the image's mimetype and bytes.
+    fn chunks_to_string(&self) -> String {
+        let mut rendered = String::new();
+        for input_chunk in self {
+            match input_chunk.chunk.as_ref() {
+                Some(Chunk::Text(text)) => rendered.push_str(text),
+                Some(Chunk::Image(Image { data, mimetype })) => {
+                    let payload = STANDARD.encode(data);
+                    let markdown = format!("![](data:{};base64,{})", mimetype, payload);
+                    rendered.push_str(&markdown);
+                }
+                // We don't create empty chunks, so this should be unreachable.
+                None => unreachable!("Chunks should never be empty"),
+            }
+        }
+        rendered
+    }
+}
+
+static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
+
+pub type Result<T> = std::result::Result<T, ClientError>;
--- a/backends/client/src/v2/client.rs
+++ b/backends/client/src/v2/client.rs
@ -1,9 +1,13 @@
 /// Single shard Client
-use crate::pb::generate::v1::text_generation_service_client::TextGenerationServiceClient;
-use crate::pb::generate::v1::*;
-use crate::Result;
+use crate::v2::pb;
+use crate::{ClientError, Result};
+
+use crate::WARMUP_IMAGE_BASE64;
 use grpc_metadata::InjectTelemetryContext;
+use pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
+use pb::generate::v2::*;
 use std::cmp::min;
+use std::time::Duration;
 use tonic::transport::{Channel, Uri};
 use tracing::instrument;

@ -41,7 +45,9 @@ impl Client {
    #[instrument(skip(self))]
    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
-        let response = self.stub.service_discovery(request).await?;
+        let response = self.stub.service_discovery(request).await.map_err(|_| {
+            ClientError::Connection("Server does not support v2 interface".to_string())
+        })?;
        let urls = response
            .into_inner()
            .urls
@ -104,16 +110,28 @@ impl Client {
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32,
+        max_batch_size: Option<usize>,
    ) -> Result<Option<u32>> {
        let mut n_tokens = 0;
        let mut requests = Vec::new();
        // Create requests
        while n_tokens < max_prefill_tokens {
            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
+
+            let mut inputs = String::new();
+            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
+            if n_tokens == 0 {
+                // 1 request is enough to test vision heads.
+                // Sending images on other queries messes up easily with truncation.
+                inputs.push_str(&format!(
+                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
+                ));
+            }
+
            requests.push(Request {
                id: 0,
+                inputs,
                // We truncate the input on the server side to be sure that it has the correct size
-                inputs: "_test ".to_string().repeat(max_input_length as usize),
                truncate,
                // Set sampling parameters to also take these ops into account in the max memory
                parameters: Some(NextTokenChooserParameters {
@ -124,7 +142,10 @@ impl Client {
                    do_sample: false,
                    seed: 0,
                    repetition_penalty: 1.2,
+                    frequency_penalty: 0.1,
                    watermark: true,
+                    grammar: String::new(),
+                    grammar_type: GrammarType::None as i32,
                }),
                stopping_parameters: Some(StoppingCriteriaParameters {
                    max_new_tokens: max_total_tokens - truncate,
@ -135,6 +156,11 @@ impl Client {
                top_n_tokens: 20,
            });
            n_tokens += max_input_length;
+
+            // Check max_batch_size
+            if Some(requests.len()) == max_batch_size {
+                break;
+            }
        }

        let batch = Batch {
@ -144,7 +170,13 @@ impl Client {
            max_tokens: 0,
        };

-        let request = tonic::Request::new(WarmupRequest { batch: Some(batch) }).inject_context();
+        let request = tonic::Request::new(WarmupRequest {
+            batch: Some(batch),
+            max_input_length,
+            max_prefill_tokens,
+            max_total_tokens,
+        })
+        .inject_context();
        let response = self.stub.warmup(request).await?.into_inner();
        Ok(response.max_supported_total_tokens)
    }
@ -157,10 +189,14 @@ impl Client {
    pub async fn prefill(
        &mut self,
        batch: Batch,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
        let response = self.stub.prefill(request).await?.into_inner();
-        Ok((response.generations, response.batch))
+        Ok((
+            response.generations,
+            response.batch,
+            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
+        ))
    }

    /// Generate one token for each request in the given cached batches
@ -171,9 +207,52 @@ impl Client {
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
        let response = self.stub.decode(request).await?.into_inner();
-        Ok((response.generations, response.batch))
+        Ok((
+            response.generations,
+            response.batch,
+            DecodeTimings::new(
+                response.concat_ns,
+                response.forward_ns,
+                response.decode_ns,
+                response.total_ns,
+            ),
+        ))
+    }
+}
+
+/// Timings of one prefill call, converted from the nanosecond counters of the response.
+pub struct PrefillTimings {
+    /// Built from `forward_ns`.
+    pub forward: Duration,
+    /// Built from `decode_ns`.
+    pub decode: Duration,
+    /// Built from `total_ns`.
+    pub total: Duration,
+}
+
+impl PrefillTimings {
+    /// Converts the raw nanosecond counters into [`Duration`]s.
+    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        let [forward, decode, total] = [forward_ns, decode_ns, total_ns].map(Duration::from_nanos);
+        Self {
+            forward,
+            decode,
+            total,
+        }
+    }
+}
+
+/// Timings of one decode call, converted from the nanosecond counters of the response.
+pub struct DecodeTimings {
+    /// Built from `concat_ns`; `None` when the response carries no such counter.
+    pub concat: Option<Duration>,
+    /// Built from `forward_ns`.
+    pub forward: Duration,
+    /// Built from `decode_ns`.
+    pub decode: Duration,
+    /// Built from `total_ns`.
+    pub total: Duration,
+}
+
+impl DecodeTimings {
+    /// Converts the raw nanosecond counters into [`Duration`]s.
+    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        let concat = concat_ns.map(Duration::from_nanos);
+        let [forward, decode, total] = [forward_ns, decode_ns, total_ns].map(Duration::from_nanos);
+        Self {
+            concat,
+            forward,
+            decode,
+            total,
+        }
     }
 }
--- a/backends/client/src/v2/mod.rs
+++ b/backends/client/src/v2/mod.rs
@ -0,0 +1,13 @@
+#[allow(clippy::derive_partial_eq_without_eq)]
+mod pb;
+
+mod client;
+mod sharded_client;
+
+pub use client::Client;
+pub use pb::generate::v2::HealthResponse;
+pub use pb::generate::v2::{
+    Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType, InfoResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters, Tokens,
+};
+pub use sharded_client::ShardedClient;
--- a/backends/client/src/v2/sharded_client.rs
+++ b/backends/client/src/v2/sharded_client.rs
@ -1,9 +1,17 @@
 /// Multi shard Client
-use crate::{Batch, CachedBatch, Client, Generation, HealthResponse, ShardInfo};
+use crate::{v2, Health, ShardInfo};
 use crate::{ClientError, Result};
+
+use crate::v2::InfoResponse;
+use async_trait::async_trait;
 use futures::future::join_all;
 use tonic::transport::Uri;
 use tracing::instrument;
+use v2::client::{DecodeTimings, PrefillTimings};
+use v2::{
+    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};

 #[derive(Debug, Clone)]
 /// Text Generation Inference gRPC multi client
@ -46,7 +54,7 @@ impl ShardedClient {
            .iter_mut()
            .map(|client| client.info())
            .collect();
-        join_all(futures).await.pop().unwrap()
+        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
    }

    /// GRPC health check
@ -96,12 +104,18 @@ impl ShardedClient {
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32,
+        max_batch_size: Option<usize>,
    ) -> Result<Option<u32>> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| {
-                Box::pin(client.warmup(max_input_length, max_prefill_tokens, max_total_tokens))
+                Box::pin(client.warmup(
+                    max_input_length,
+                    max_prefill_tokens,
+                    max_total_tokens,
+                    max_batch_size,
+                ))
            })
            .collect();
        // Take the minimum value
@ -116,49 +130,122 @@ impl ShardedClient {
    ///
    /// Returns Generation for each request in batch
    /// and the next cached batch
-    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
+    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
    pub async fn prefill(
        &mut self,
        batch: Batch,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.prefill(batch.clone())))
            .collect();
-        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>)>> =
+        #[allow(clippy::type_complexity)]
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
            join_all(futures).await.into_iter().collect();
-        merge_generations(results?)
+        let mut results = results?;
+
+        let (mut generations, next_batch, mut timings) =
+            results.pop().ok_or(ClientError::EmptyResults)?;
+
+        // Merge generations from different model shards
+        for (mut shard_generations, _, shard_timings) in results.into_iter() {
+            generations.append(&mut shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
    }

    /// Generate one token for each request in the given cached batches
    ///
    /// Returns Generation for each request in batches
    /// and the next cached batch
-    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
+    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
    pub async fn decode(
        &mut self,
        batches: Vec<CachedBatch>,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
        let futures: Vec<_> = self
            .clients
            .iter_mut()
            .map(|client| Box::pin(client.decode(batches.clone())))
            .collect();
-        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>)>> =
+        #[allow(clippy::type_complexity)]
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
            join_all(futures).await.into_iter().collect();
-        merge_generations(results?)
+        let mut results = results?;
+
+        let (mut generations, next_batch, mut timings) =
+            results.pop().ok_or(ClientError::EmptyResults)?;
+
+        // Merge generations from different model shards
+        for (mut shard_generations, _, shard_timings) in results.into_iter() {
+            generations.append(&mut shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
    }
 }

-/// Merge generations from the different model shards
-fn merge_generations(
-    mut results: Vec<(Vec<Generation>, Option<CachedBatch>)>,
-) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
-    let (mut generations, next_batch) = results.pop().ok_or(ClientError::EmptyResults)?;
-
-    for (mut shard_generations, _) in results.into_iter() {
-        generations.append(&mut shard_generations);
+/// Builds the version-independent [`ShardInfo`] from a v2 `InfoResponse`.
+impl From<InfoResponse> for ShardInfo {
+    fn from(value: InfoResponse) -> Self {
+        // Move the fields out by name; they are copied over one-to-one.
+        let InfoResponse {
+            requires_padding,
+            dtype,
+            device_type,
+            window_size,
+            speculate,
+            ..
+        } = value;
+        Self {
+            requires_padding,
+            dtype,
+            device_type,
+            window_size,
+            speculate,
+        }
+    }
+}
+
+#[async_trait]
+impl Health for ShardedClient {
+    async fn device_health(&self) -> Result<()> {
+        self.clone().health().await?;
+        Ok(())
+    }
+
+    async fn model_health(&self) -> Result<()> {
+        // Dummy batch of 1 token and 1 generated token
+        let liveness_request = Request {
+            id: u64::MAX,
+            inputs: "liveness".to_string(),
+            truncate: 10,
+            prefill_logprobs: false,
+            parameters: Some(NextTokenChooserParameters {
+                temperature: 1.0,
+                top_k: 0,
+                top_p: 1.0,
+                typical_p: 1.0,
+                do_sample: false,
+                seed: 0,
+                repetition_penalty: 1.0,
+                frequency_penalty: 0.0,
+                watermark: false,
+                grammar: String::new(),
+                grammar_type: GrammarType::None as i32,
+            }),
+            stopping_parameters: Some(StoppingCriteriaParameters {
+                max_new_tokens: 1,
+                stop_sequences: vec![],
+                ignore_eos_token: false,
+            }),
+            top_n_tokens: 0,
+        };
+        let batch = Batch {
+            id: u64::MAX,
+            requests: vec![liveness_request],
+            size: 1,
+            max_tokens: 2,
+        };
+        self.clone().prefill(batch).await?;
+        Ok(())
    }
-    Ok((generations, next_batch))
 }
--- a/backends/client/src/v3/client.rs
+++ b/backends/client/src/v3/client.rs
@ -0,0 +1,304 @@
+use crate::v3::{pb, Chunk};
+use crate::{ClientError, Result, WARMUP_IMAGE_BASE64};
+/// Single shard Client
+use base64::engine::general_purpose::STANDARD;
+use base64::Engine;
+use grpc_metadata::InjectTelemetryContext;
+use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
+use pb::generate::v3::*;
+use std::cmp::min;
+use std::time::Duration;
+use tonic::transport::{Channel, Uri};
+use tracing::instrument;
+
+/// Text Generation Inference gRPC client
+#[derive(Debug, Clone)]
+pub struct Client {
+    // gRPC service stub over a tonic `Channel`; every method of `Client` issues its
+    // RPC through it.
+    stub: TextGenerationServiceClient<Channel>,
+}
+
+impl Client {
+    /// Opens a channel to `uri` and returns a client using it.
+    ///
+    /// Fails when the channel cannot be connected.
+    pub async fn connect(uri: Uri) -> Result<Self> {
+        let endpoint = Channel::builder(uri);
+        let stub = TextGenerationServiceClient::new(endpoint.connect().await?);
+        Ok(Self { stub })
+    }
+
+    /// Returns a client that talks to the unix socket at `path`.
+    pub async fn connect_uds(path: String) -> Result<Self> {
+        // The URI is only a placeholder for the builder: the connector below ignores the
+        // `Uri` it receives and always dials `path`.
+        let endpoint = Channel::from_shared("http://[::]:50051".to_string()).unwrap();
+        let connector =
+            tower::service_fn(move |_: Uri| tokio::net::UnixStream::connect(path.clone()));
+        let channel = endpoint.connect_with_connector(connector).await?;
+
+        let stub = TextGenerationServiceClient::new(channel);
+        Ok(Self { stub })
+    }
+
+    /// Asks the shard for the uris / unix sockets of all shards.
+    ///
+    /// Any failure of the RPC is reported as a `ClientError::Connection` saying that the
+    /// server does not support the v3 interface. A leading `unix://` is removed from the
+    /// returned entries.
+    #[instrument(skip(self))]
+    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
+        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
+        let response = match self.stub.service_discovery(request).await {
+            Ok(response) => response.into_inner(),
+            Err(_) => {
+                return Err(ClientError::Connection(
+                    "Server does not support v3 interface".to_string(),
+                ))
+            }
+        };
+
+        let mut urls = Vec::with_capacity(response.urls.len());
+        for url in response.urls {
+            // Remove unix socket prefix
+            let stripped = url.strip_prefix("unix://").map(str::to_string);
+            urls.push(stripped.unwrap_or(url));
+        }
+        Ok(urls)
+    }
+
+    /// Sends an `InfoRequest` and returns the shard's `InfoResponse`.
+    #[instrument(skip(self))]
+    pub async fn info(&mut self) -> Result<InfoResponse> {
+        let request = tonic::Request::new(InfoRequest {}).inject_context();
+        Ok(self.stub.info(request).await?.into_inner())
+    }
+
+    /// Sends a `HealthRequest` and returns the shard's `HealthResponse`.
+    #[instrument(skip(self))]
+    pub async fn health(&mut self) -> Result<HealthResponse> {
+        let request = tonic::Request::new(HealthRequest {}).inject_context();
+        Ok(self.stub.health(request).await?.into_inner())
+    }
+
+    /// Clear the past generations cache
+    ///
+    /// `batch_id` is forwarded as the `id` of the `ClearCacheRequest`; the response
+    /// payload is discarded.
+    #[instrument(skip(self))]
+    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
+        let payload = ClearCacheRequest { id: batch_id };
+        let request = tonic::Request::new(payload).inject_context();
+        let _response = self.stub.clear_cache(request).await?;
+        Ok(())
+    }
+
+    /// Filter a cached batch
+    ///
+    /// Sends `batch_id` and `request_ids` in a `FilterBatchRequest` and returns the
+    /// `batch` field of the response.
+    #[instrument(skip(self))]
+    pub async fn filter_batch(
+        &mut self,
+        batch_id: u64,
+        request_ids: Vec<u64>,
+    ) -> Result<Option<CachedBatch>> {
+        let payload = FilterBatchRequest {
+            batch_id,
+            request_ids,
+        };
+        let request = tonic::Request::new(payload).inject_context();
+        let response = self.stub.filter_batch(request).await?;
+        Ok(response.into_inner().batch)
+    }
+
+    /// Warmup on a max size batch
+    ///
+    /// Builds dummy requests made of repeated `"_test "` text until `max_prefill_tokens`
+    /// is covered or `max_batch_size` requests exist, then sends them to the shard. Only
+    /// the first request carries the warmup image.
+    ///
+    /// Returns the `(max_supported_total_tokens, max_input_tokens, max_total_tokens)`
+    /// triple found in the warmup response.
+    #[instrument(skip_all)]
+    pub async fn warmup(
+        &mut self,
+        max_input_tokens: Option<u32>,
+        max_prefill_tokens: u32,
+        max_total_tokens: Option<u32>,
+        max_batch_size: Option<usize>,
+    ) -> Result<(Option<u32>, u32, u32)> {
+        let mut n_tokens = 0;
+        let mut requests = Vec::new();
+        // Create requests
+        while n_tokens < max_prefill_tokens {
+            let remaining = max_prefill_tokens - n_tokens;
+            let truncate = max_input_tokens.map_or(remaining, |limit| min(limit, remaining));
+            if truncate == 0 {
+                // Only reachable with `max_input_tokens == Some(0)`: `n_tokens` would never
+                // advance, so stop instead of queueing requests forever.
+                break;
+            }
+
+            // Built once, shared by the chunked and the stringly-typed inputs.
+            let text = "_test ".repeat(truncate as usize);
+
+            let mut input_chunks = Vec::new();
+            input_chunks.push(Chunk::Text(text.clone()).into());
+            if n_tokens == 0 {
+                input_chunks.push(
+                    Chunk::Image(Image {
+                        // Safe unwrap, because we control the data.
+                        data: STANDARD.decode(WARMUP_IMAGE_BASE64).unwrap(),
+                        mimetype: "image/jpeg;base64".to_string(),
+                    })
+                    .into(),
+                );
+            }
+
+            // Send stringly-typed inputs for compatibility for backends that haven't
+            // been updated to support chunks.
+            let mut inputs = text;
+            if n_tokens == 0 {
+                // 1 request is enough to test vision heads.
+                // Sending images on other queries messes up easily with truncation.
+                inputs.push_str(&format!(
+                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
+                ));
+            }
+
+            // Saturate so that `max_total_tokens < truncate` cannot underflow the `u32`.
+            let max_new_tokens =
+                max_total_tokens.map_or(1, |total| total.saturating_sub(truncate));
+
+            requests.push(Request {
+                id: 0,
+                inputs,
+                input_chunks: Some(Input {
+                    chunks: input_chunks,
+                }),
+                // We truncate the input on the server side to be sure that it has the correct size
+                truncate,
+                // Most requests will have that
+                add_special_tokens: true,
+                // Blocks and slots will be set on the server side if we use paged attention
+                blocks: vec![],
+                slots: vec![],
+                cache_len: 0,
+                chunk_len: None,
+                // Set sampling parameters to also take these ops into account in the max memory
+                parameters: Some(NextTokenChooserParameters {
+                    temperature: 0.9,
+                    top_k: 10,
+                    top_p: 0.9,
+                    typical_p: 0.9,
+                    do_sample: false,
+                    seed: 0,
+                    repetition_penalty: 1.2,
+                    frequency_penalty: 0.1,
+                    watermark: true,
+                    grammar: String::new(),
+                    grammar_type: GrammarType::None as i32,
+                }),
+                stopping_parameters: Some(StoppingCriteriaParameters {
+                    max_new_tokens,
+                    stop_sequences: vec![],
+                    ignore_eos_token: true,
+                }),
+                prefill_logprobs: true,
+                top_n_tokens: 20,
+                adapter_id: None,
+            });
+            n_tokens += truncate;
+
+            // Check max_batch_size
+            if Some(requests.len()) == max_batch_size {
+                break;
+            }
+        }
+
+        let batch = Batch {
+            id: 0,
+            size: requests.len() as u32,
+            requests,
+            max_tokens: max_input_tokens.unwrap_or(0),
+            max_blocks: 0,
+        };
+
+        let request = tonic::Request::new(WarmupRequest {
+            batch: Some(batch),
+            max_input_tokens,
+            max_prefill_tokens,
+            max_total_tokens,
+        })
+        .inject_context();
+        let response = self.stub.warmup(request).await?.into_inner();
+        Ok((
+            response.max_supported_total_tokens,
+            response.max_input_tokens,
+            response.max_total_tokens,
+        ))
+    }
+
+    /// Runs a prefill on `batch`, forwarding `cached_batch` in the same request.
+    ///
+    /// Returns the generations, the next cached batch and the timings taken from the
+    /// response.
+    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
+    pub async fn prefill(
+        &mut self,
+        batch: Batch,
+        cached_batch: Option<CachedBatch>,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
+        let payload = PrefillRequest {
+            batch: Some(batch),
+            cached_batch,
+        };
+        let request = tonic::Request::new(payload).inject_context();
+        let response = self.stub.prefill(request).await?.into_inner();
+
+        let timings =
+            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns);
+        Ok((response.generations, response.batch, timings))
+    }
+
+    /// Runs a decode step on the given cached batches.
+    ///
+    /// Returns the generations, the next cached batch and the timings taken from the
+    /// response.
+    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
+    pub async fn decode(
+        &mut self,
+        batches: Vec<CachedBatch>,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
+        let payload = DecodeRequest { batches };
+        let request = tonic::Request::new(payload).inject_context();
+        let response = self.stub.decode(request).await?.into_inner();
+
+        let timings = DecodeTimings::new(
+            response.concat_ns,
+            response.forward_ns,
+            response.decode_ns,
+            response.total_ns,
+        );
+        Ok((response.generations, response.batch, timings))
+    }
+}
+
+/// Timings of one prefill call, converted from the nanosecond counters of the response.
+pub struct PrefillTimings {
+    /// Built from `forward_ns`.
+    pub forward: Duration,
+    /// Built from `decode_ns`.
+    pub decode: Duration,
+    /// Built from `total_ns`.
+    pub total: Duration,
+}
+
+impl PrefillTimings {
+    /// Converts the raw nanosecond counters into [`Duration`]s.
+    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        let [forward, decode, total] = [forward_ns, decode_ns, total_ns].map(Duration::from_nanos);
+        Self {
+            forward,
+            decode,
+            total,
+        }
+    }
+}
+
+/// Timings of one decode call, converted from the nanosecond counters of the response.
+pub struct DecodeTimings {
+    /// Built from `concat_ns`; `None` when the response carries no such counter.
+    pub concat: Option<Duration>,
+    /// Built from `forward_ns`.
+    pub forward: Duration,
+    /// Built from `decode_ns`.
+    pub decode: Duration,
+    /// Built from `total_ns`.
+    pub total: Duration,
+}
+
+impl DecodeTimings {
+    /// Converts the raw nanosecond counters into [`Duration`]s.
+    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        let concat = concat_ns.map(Duration::from_nanos);
+        let [forward, decode, total] = [forward_ns, decode_ns, total_ns].map(Duration::from_nanos);
+        Self {
+            concat,
+            forward,
+            decode,
+            total,
+        }
+    }
+}
--- a/backends/client/src/v3/mod.rs
+++ b/backends/client/src/v3/mod.rs
@ -0,0 +1,13 @@
+#[allow(clippy::derive_partial_eq_without_eq)]
+mod pb;
+
+mod client;
+mod sharded_client;
+
+pub use client::Client;
+pub use pb::generate::v3::{
+    input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
+    HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
+    StoppingCriteriaParameters, Tokens,
+};
+pub use sharded_client::ShardedClient;
--- a/backends/client/src/v3/sharded_client.rs
+++ b/backends/client/src/v3/sharded_client.rs
@ -0,0 +1,271 @@
+/// Multi shard Client
+use crate::{v3, Health, ShardInfo};
+use crate::{ClientError, Result};
+
+use crate::v3::{Chunk, InfoResponse, Input};
+use async_trait::async_trait;
+use futures::future::join_all;
+use tonic::transport::Uri;
+use tracing::instrument;
+use v3::client::{DecodeTimings, PrefillTimings};
+use v3::{
+    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
+
+#[derive(Debug, Clone)]
+/// Text Generation Inference gRPC multi client
+pub struct ShardedClient {
+    // One single-shard client per entry; the methods of `ShardedClient` iterate over
+    // all of them.
+    clients: Vec<Client>,
+}
+
+impl ShardedClient {
+    /// Wraps already connected per-shard clients.
+    fn new(clients: Vec<Client>) -> Self {
+        ShardedClient { clients }
+    }
+
+    /// Builds a `ShardedClient` out of a master client.
+    ///
+    /// The master client is asked, through the `service_discovery` gRPC method, for the
+    /// uris/unix sockets of all shards; each returned entry is then connected to with
+    /// `Client::connect_uds`.
+    async fn from_master_client(mut master_client: Client) -> Result<Self> {
+        // Get all uris/unix sockets from the master client
+        let uris = master_client.service_discovery().await?;
+
+        // All connections are awaited together; the first failure, in shard order, wins.
+        let attempts = join_all(uris.into_iter().map(Client::connect_uds)).await;
+        let mut clients = Vec::with_capacity(attempts.len());
+        for attempt in attempts {
+            clients.push(attempt?);
+        }
+        Ok(Self::new(clients))
+    }
+
+    /// Connects a master client to `uri`, then builds the sharded client from it.
+    pub async fn connect(uri: Uri) -> Result<Self> {
+        Self::from_master_client(Client::connect(uri).await?).await
+    }
+
+    /// Connects a master client to the unix socket at `path`, then builds the sharded
+    /// client from it.
+    pub async fn connect_uds(path: String) -> Result<Self> {
+        Self::from_master_client(Client::connect_uds(path).await?).await
+    }
+
+    /// Get the model info
+    ///
+    /// Queries every shard and returns the answer of the last one, converted to
+    /// [`ShardInfo`]. Returns `ClientError::EmptyResults` instead of panicking when
+    /// there is no shard.
+    #[instrument(skip(self))]
+    pub async fn info(&mut self) -> Result<ShardInfo> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.info())
+            .collect();
+        let last = join_all(futures)
+            .await
+            .pop()
+            .ok_or(ClientError::EmptyResults)?;
+        last.map(ShardInfo::from)
+    }
+
+    /// GRPC health check
+    ///
+    /// Queries every shard and returns the answer of the last one. Returns
+    /// `ClientError::EmptyResults` instead of panicking when there is no shard.
+    #[instrument(skip(self))]
+    pub async fn health(&mut self) -> Result<HealthResponse> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.health())
+            .collect();
+        join_all(futures)
+            .await
+            .pop()
+            .ok_or(ClientError::EmptyResults)?
+    }
+
+    /// Clears the past generations cache on every shard.
+    ///
+    /// All shards are awaited; the first error, in shard order, is returned.
+    #[instrument(skip(self))]
+    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.clear_cache(batch_id))
+            .collect();
+        for outcome in join_all(futures).await {
+            outcome?;
+        }
+        Ok(())
+    }
+
+    /// Filter a cached batch
+    ///
+    /// Sends the same filter request to every shard and returns the answer of the last
+    /// one. Returns `ClientError::EmptyResults` instead of panicking when there is no
+    /// shard.
+    #[instrument(skip(self))]
+    pub async fn filter_batch(
+        &mut self,
+        batch_id: u64,
+        request_ids: Vec<u64>,
+    ) -> Result<Option<CachedBatch>> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
+            .collect();
+        // all shards return the same message
+        join_all(futures)
+            .await
+            .pop()
+            .ok_or(ClientError::EmptyResults)?
+    }
+
+    /// Warmup on a max size batch
+    ///
+    /// Runs the warmup on every shard and returns the smallest of the
+    /// `(max_supported_total_tokens, max_input_tokens, max_total_tokens)` triples they
+    /// report. Fails with the first shard error, or with `ClientError::EmptyResults`
+    /// when there is no shard.
+    #[instrument(skip(self))]
+    pub async fn warmup(
+        &mut self,
+        max_input_length: Option<u32>,
+        max_prefill_tokens: u32,
+        max_total_tokens: Option<u32>,
+        max_batch_size: Option<usize>,
+    ) -> Result<(Option<u32>, u32, u32)> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| {
+                Box::pin(client.warmup(
+                    max_input_length,
+                    max_prefill_tokens,
+                    max_total_tokens,
+                    max_batch_size,
+                ))
+            })
+            .collect();
+        let results = join_all(futures)
+            .await
+            .into_iter()
+            .collect::<Result<Vec<(Option<u32>, u32, u32)>>>()?;
+
+        // Take the minimum value
+        // Different shards hold different parts of vocab, might yield
+        // different available block size.
+        // Tuples compare lexicographically, so the first element decides unless it ties.
+        results
+            .into_iter()
+            .min()
+            .ok_or(ClientError::EmptyResults)
+    }
+
+    /// Runs a prefill of `batch` on every shard.
+    ///
+    /// Returns the merged generations, the next cached batch of the last shard and the
+    /// timings of the shard with the largest `total`.
+    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
+    pub async fn prefill(
+        &mut self,
+        batch: Batch,
+        cached_batch: Option<CachedBatch>,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone())))
+            .collect();
+        let mut per_shard = join_all(futures)
+            .await
+            .into_iter()
+            .collect::<Result<Vec<_>>>()?;
+
+        // The last shard provides the starting point; the others are folded into it.
+        let (mut generations, next_batch, mut timings) = match per_shard.pop() {
+            Some(last) => last,
+            None => return Err(ClientError::EmptyResults),
+        };
+
+        // Merge generations from different model shards
+        for (shard_generations, _, shard_timings) in per_shard {
+            generations.extend(shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
+    }
+
+    /// Runs a decode step for `batches` on every shard.
+    ///
+    /// Returns the merged generations, the next cached batch of the last shard and the
+    /// timings of the shard with the largest `total`.
+    #[instrument(skip_all, fields(size = batches.iter().map(|batch| {batch.size}).sum::<u32>()))]
+    pub async fn decode(
+        &mut self,
+        batches: Vec<CachedBatch>,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.decode(batches.clone())))
+            .collect();
+        let mut per_shard = join_all(futures)
+            .await
+            .into_iter()
+            .collect::<Result<Vec<_>>>()?;
+
+        // The last shard provides the starting point; the others are folded into it.
+        let (mut generations, next_batch, mut timings) = match per_shard.pop() {
+            Some(last) => last,
+            None => return Err(ClientError::EmptyResults),
+        };
+
+        // Merge generations from different model shards
+        for (shard_generations, _, shard_timings) in per_shard {
+            generations.extend(shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
+    }
+}
+
+/// Builds the version-independent [`ShardInfo`] from a v3 `InfoResponse`.
+impl From<InfoResponse> for ShardInfo {
+    fn from(value: InfoResponse) -> Self {
+        // Move the fields out by name; they are copied over one-to-one.
+        let InfoResponse {
+            requires_padding,
+            dtype,
+            device_type,
+            window_size,
+            speculate,
+            ..
+        } = value;
+        Self {
+            requires_padding,
+            dtype,
+            device_type,
+            window_size,
+            speculate,
+        }
+    }
+}
+
+#[async_trait]
+impl Health for ShardedClient {
+    /// Runs the gRPC health check and discards its payload.
+    async fn device_health(&self) -> Result<()> {
+        // `health` takes `&mut self`, so it is called on a clone.
+        let mut client = self.clone();
+        client.health().await.map(|_| ())
+    }
+
+    /// Runs a prefill of a one-request batch and discards its result.
+    async fn model_health(&self) -> Result<()> {
+        let parameters = NextTokenChooserParameters {
+            temperature: 1.0,
+            top_k: 0,
+            top_p: 1.0,
+            typical_p: 1.0,
+            do_sample: false,
+            seed: 0,
+            repetition_penalty: 1.0,
+            frequency_penalty: 0.0,
+            watermark: false,
+            grammar: String::new(),
+            grammar_type: GrammarType::None as i32,
+        };
+        let stopping_parameters = StoppingCriteriaParameters {
+            max_new_tokens: 1,
+            stop_sequences: vec![],
+            ignore_eos_token: false,
+        };
+
+        // Dummy batch of 1 token and 1 generated token
+        let liveness_request = Request {
+            id: u64::MAX,
+            inputs: "liveness".to_string(),
+            input_chunks: Some(Input {
+                chunks: vec![Chunk::Text("liveness".into()).into()],
+            }),
+            truncate: 10,
+            add_special_tokens: true,
+            prefill_logprobs: false,
+            parameters: Some(parameters),
+            stopping_parameters: Some(stopping_parameters),
+            top_n_tokens: 0,
+            // Block 0 is reserved for health checks
+            blocks: vec![0],
+            slots: (0..16).collect(),
+            cache_len: 0,
+            chunk_len: None,
+            adapter_id: None,
+        };
+        let batch = Batch {
+            id: u64::MAX,
+            requests: vec![liveness_request],
+            size: 1,
+            max_tokens: 2,
+            max_blocks: 1,
+        };
+
+        // `prefill` takes `&mut self`, so it is called on a clone.
+        let mut client = self.clone();
+        client.prefill(batch, None).await.map(|_| ())
+    }
+}
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@ -0,0 +1,67 @@
+# Absolute path of this Makefile, its directory, and the repository root (two levels up).
+mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
+mkfile_dir := $(dir $(mkfile_path))
+root_dir := ${mkfile_dir}/../..
+
+# Versions used for the Docker build args, the base dev image and the DeepSpeed fork tag.
+HABANA_VERSION := 1.21.0
+PYTORCH_VERSION := 2.6.0
+
+# Every target in this file is a command, not a file to be produced.
+.PHONY: image run-local-dev-container install-dependencies install-server install-router install-launcher local-dev-install \
+        run-integration-tests run-integration-tests-with-all-models capture-expected-outputs-for-integration-tests
+
+# Build the tgi-gaudi image from Dockerfile_gaudi at the repository root.
+image:
+	docker build --ulimit nofile=4096 -t tgi-gaudi -f ${root_dir}/Dockerfile_gaudi ${root_dir} --build-arg HABANA_VERSION=$(HABANA_VERSION) --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION)
+
+# Start an interactive Habana PyTorch container for local development.
+# $(PWD) (the directory make was invoked from) is mounted as the working tree and the
+# user's Hugging Face cache is mounted at /data; the token is read from that same cache.
+run-local-dev-container:
+	docker run -it \
+		--runtime=habana \
+		--ipc=host \
+		--cap-add=sys_nice \
+		--net=host \
+		-e HABANA_VISIBLE_DEVICES=all \
+		-e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+		-e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
+		-e HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+		-e LOG_LEVEL=debug \
+		-e PORT=8080 \
+		-v ${HOME}/.cache/huggingface:/data \
+		-v $(PWD):/text-generation-inference \
+		-w /text-generation-inference \
+		vault.habana.ai/gaudi-docker/$(HABANA_VERSION)/ubuntu22.04/habanalabs/pytorch-installer-$(PYTORCH_VERSION):latest
+
+# Install the Python dependencies and the Rust toolchain (via rustup).
+install-dependencies:
+	pip install git+https://github.com/HabanaAI/DeepSpeed.git@$(HABANA_VERSION)
+	pip install outlines~=0.0.34
+	curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+
+# PROTO_PATH is relative to backends/gaudi/server and points at the repository-level protos.
+install-server:
+	$(MAKE) -C ${root_dir}/backends/gaudi/server install PROTO_PATH=../../../proto/v3
+
+install-router:
+	$(MAKE) -C ${root_dir} install-router
+
+install-launcher:
+	$(MAKE) -C ${root_dir} install-launcher
+
+# Source the cargo env file first so the freshly installed Rust toolchain is on PATH
+# for the sub-makes; $(MAKE) keeps flags and the jobserver consistent with the parent.
+local-dev-install: install-dependencies
+	bash -c 'source "$$HOME/.cargo/env" && \
+		$(MAKE) install-server && \
+		$(MAKE) install-router && \
+		$(MAKE) install-launcher'
+
+# In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
+run-integration-tests:
+	DOCKER_VOLUME=${root_dir}/data \
+	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
+
+run-integration-tests-with-all-models:
+	DOCKER_VOLUME=${root_dir}/data \
+	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi --gaudi-all-models
+
+# This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
+capture-expected-outputs-for-integration-tests:
+	pip install -U pip uv
+	DOCKER_VOLUME=${root_dir}/data \
+	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+	uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests/capture_expected_outputs.py
--- a/backends/gaudi/README.md
+++ b/backends/gaudi/README.md
@ -0,0 +1,152 @@
+# Text-generation-inference - Gaudi backend
+
+## Description
+
+This is the TGI backend for Intel Gaudi. This backend is composed of the tgi server optimized for Gaudi hardware.
+
+## Build your own image
+
+The simplest way to build TGI with the Gaudi backend is to use the provided `Makefile`:
+
+Option 1: From the project root directory:
+```bash
+make -C backends/gaudi image
+```
+
+Option 2: From the Gaudi backend directory:
+```bash
+cd backends/gaudi
+make image
+```
+
+You can now run the server with the following command:
+
+Option 1: Sharded:
+```bash
+model=meta-llama/Llama-3.1-8B-Instruct
+hf_token=$(cat ${HOME}/.cache/huggingface/token)
+volume=${HOME}/.cache/huggingface
+
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+  -p 8080:80 -v $volume:/data \
+  -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
+  tgi-gaudi --model-id $model \
+  --sharded true --num-shard 8 \
+  --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 8 --max-batch-prefill-tokens 2048
+```
+
+Option 2: Non-sharded:
+```bash
+model=meta-llama/Llama-3.1-8B-Instruct
+hf_token=$(cat ${HOME}/.cache/huggingface/token)
+volume=${HOME}/.cache/huggingface
+
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+  -p 8080:80 -v $volume:/data \
+  -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
+  tgi-gaudi --model-id $model \
+  --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 4 --max-batch-prefill-tokens 2048
+```
+
+## Contributing
+
+### Local Development
+
+This is useful if you want to run the server locally for better debugging.
+```bash
+make -C backends/gaudi run-local-dev-container
+```
+
+Then run the following command inside the container to install tgi for gaudi:
+```bash
+make -C backends/gaudi local-dev-install
+```
+
+Add rust to path:
+```bash
+. "$HOME/.cargo/env"
+```
+
+Option 1: Run the server (sharded model):
+```bash
+LOG_LEVEL=debug text-generation-launcher \
+    --model-id meta-llama/Llama-3.1-8B-Instruct \
+    --sharded true \
+    --num-shard 8 \
+    --max-input-tokens 512 \
+    --max-total-tokens 1024 \
+    --max-batch-size 8 \
+    --max-batch-prefill-tokens 2048
+```
+
+Option 2: Run the server (non-sharded model):
+```bash
+LOG_LEVEL=debug text-generation-launcher \
+    --model-id meta-llama/Llama-3.1-8B-Instruct \
+    --max-input-tokens 512 \
+    --max-total-tokens 1024 \
+    --max-batch-size 4 \
+    --max-batch-prefill-tokens 2048
+```
+
+You can then test the server with the following curl command from another terminal (can be outside the container):
+```bash
+curl 127.0.0.1:8080/generate \
+     -X POST \
+     -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+     -H 'Content-Type: application/json'
+```
+
+### Integration tests
+
+Install the dependencies:
+```bash
+pip install -r integration-tests/requirements.txt
+```
+
+To run the integration tests, you need to first build the image:
+```bash
+make -C backends/gaudi image
+```
+
+Then run the following command to run the integration tests (CI tests):
+```bash
+make -C backends/gaudi run-integration-tests
+```
+
+To run the integration tests with all models, you can run the following command:
+```bash
+make -C backends/gaudi run-integration-tests-with-all-models
+```
+
+To capture the expected outputs for the integration tests, you can run the following command:
+```bash
+make -C backends/gaudi capture-expected-outputs-for-integration-tests
+```
+
+#### How the integration tests work
+The integration tests work as follows:
+
+1. Start a tgi server in a container, similar to the command:
+```bash
+docker run --runtime=habana --ipc=host --cap-add=sys_nice \
+  -p 8080:80 -v $volume:/data \
+  -e LOG_LEVEL=debug -e HF_TOKEN=$hf_token \
+  tgi-gaudi --model-id $model \
+  --max-input-tokens 512 --max-total-tokens 1024 --max-batch-size 4 --max-batch-prefill-tokens 2048
+```
+
+2. Do a /generate request to the server, similar to the command:
+```bash
+curl 127.0.0.1:8080/generate \
+     -X POST \
+     -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
+     -H 'Content-Type: application/json'
+```
+
+3. Check the output of the server against the expected output:
+```python
+assert curl_output == expected_output
+```
+
+This is then repeated for a set of models and configurations.
--- a/backends/gaudi/examples/docker_commands/docker_commands.md
+++ b/backends/gaudi/examples/docker_commands/docker_commands.md
@ -0,0 +1,283 @@
+# Examples of Docker Commands for Gaudi Backend
+
+This page gives a list of examples of docker run commands for some of the most popular models.
+
+> **Note:** The parameters are chosen for Gaudi2 hardware to maximize performance on this given hardware, please adjust the parameters based on your hardware. For example, if you are using Gaudi3, you may want to increase the batch size.
+
+## Default Precision (BF16)
+
+### Llama3.1-8B on 1 card (BF16)
+
+```bash
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e PREFILL_BATCH_BUCKET_SIZE=2 \
+   -e BATCH_BUCKET_SIZE=32 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
+```
+
+### Llama3.1-70B on 8 cards (BF16)
+
+```bash
+model=meta-llama/Meta-Llama-3.1-70B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### Llama2-7B on 1 Card (BF16)
+
+```bash
+model=meta-llama/Llama-2-7b-chat-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e PREFILL_BATCH_BUCKET_SIZE=2 \
+   -e BATCH_BUCKET_SIZE=32 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
+```
+
+### Llama2-70B on 8 cards (BF16)
+
+```bash
+model=meta-llama/Llama-2-70b-chat-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### Llava-v1.6-Mistral-7B on 1 card (BF16)
+
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+    -e PREFILL_BATCH_BUCKET_SIZE=1 \
+    -e BATCH_BUCKET_SIZE=1 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192 --max-batch-size 4
+```
+
+## FP8 Precision
+
+Please refer to the [FP8 Precision](https://huggingface.co/docs/text-generation-inference/backends/gaudi_new#how-to-use-different-precision-formats) section for more details. You need to measure the statistics of the model first before running the model in FP8 precision.
+
+### Llama3.1-8B on 1 Card (FP8)
+
+```bash
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e PREFILL_BATCH_BUCKET_SIZE=2 \
+   -e BATCH_BUCKET_SIZE=32 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
+```
+
+### Llama3.1-70B on 8 cards (FP8)
+
+```bash
+model=meta-llama/Meta-Llama-3.1-70B-Instruct
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### Llama2-7B on 1 Card (FP8)
+
+```bash
+model=meta-llama/Llama-2-7b-chat-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e PREFILL_BATCH_BUCKET_SIZE=2 \
+   -e BATCH_BUCKET_SIZE=32 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=256 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 2048 --max-batch-size 32 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 64
+```
+
+### Llama2-70B on 8 Cards (FP8)
+
+```bash
+model=meta-llama/Llama-2-70b-chat-hf
+hf_token=YOUR_ACCESS_TOKEN
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+   -e HF_TOKEN=$hf_token \
+   -e MAX_TOTAL_TOKENS=2048 \
+   -e BATCH_BUCKET_SIZE=256 \
+   -e PREFILL_BATCH_BUCKET_SIZE=4 \
+   -e PAD_SEQUENCE_TO_MULTIPLE_OF=64 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 1024 --max-total-tokens 2048 \
+   --max-batch-prefill-tokens 4096 --max-batch-size 256 \
+   --max-waiting-tokens 7 --waiting-served-ratio 1.2 --max-concurrent-requests 512
+```
+
+### Llava-v1.6-Mistral-7B on 1 Card (FP8)
+
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+    -e PREFILL_BATCH_BUCKET_SIZE=1 \
+    -e BATCH_BUCKET_SIZE=1 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192 --max-batch-size 4
+```
+
+### Llava-v1.6-Mistral-7B on 8 Cards (FP8)
+
+```bash
+model=llava-hf/llava-v1.6-mistral-7b-hf
+volume=$PWD/data   # share a volume with the Docker container to avoid downloading weights every run
+
+docker run -p 8080:80 \
+   --runtime=habana \
+   --cap-add=sys_nice \
+   --ipc=host \
+   -v $volume:/data \
+   -v $PWD/quantization_config:/usr/src/quantization_config \
+   -v $PWD/hqt_output:/usr/src/hqt_output \
+   -e QUANT_CONFIG=./quantization_config/maxabs_quant.json \
+    -e PREFILL_BATCH_BUCKET_SIZE=1 \
+    -e BATCH_BUCKET_SIZE=1 \
+   ghcr.io/huggingface/text-generation-inference:3.1.1-gaudi \
+   --model-id $model \
+   --sharded true --num-shard 8 \
+   --max-input-tokens 4096 --max-batch-prefill-tokens 16384 \
+   --max-total-tokens 8192 --max-batch-size 4
+```
--- a/backends/gaudi/server/.gitignore
+++ b/backends/gaudi/server/.gitignore
@ -0,0 +1,164 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+text_generation_server/__pycache__/
+text_generation_server/pb/__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+transformers
+safetensors
+flash-attention/
+flash-attention-v2/
+vllm/
+llm-awq/
+eetq/
+mamba/
--- a/backends/gaudi/server/Makefile
+++ b/backends/gaudi/server/Makefile
@ -0,0 +1,38 @@
+include Makefile-flash-att
+include Makefile-flash-att-v2
+include Makefile-vllm
+include Makefile-awq
+include Makefile-eetq
+include Makefile-selective-scan
+
+# Directory containing generate.proto, relative to this directory
+# (backends/gaudi/server). The default points at the repository-level protos,
+# the same path backends/gaudi/Makefile passes explicitly; override on the
+# command line if the protos live elsewhere.
+PROTO_PATH ?= ../../../proto/v3
+
+# All targets below are commands rather than files.
+.PHONY: unit-tests gen-server install run-dev install-poetry update-lock export-requirements
+
+# Run the unit tests, excluding those marked "private".
+unit-tests:
+	pytest -s -vv -m "not private" tests
+
+# Generate the Python gRPC/protobuf stubs into text_generation_server/pb and
+# rewrite their pb2 imports to package-relative form.
+gen-server:
+	# Compile protos
+	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
+	mkdir -p text_generation_server/pb
+	python -m grpc_tools.protoc -I$(PROTO_PATH) --python_out=text_generation_server/pb \
+		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb $(PROTO_PATH)/generate.proto
+	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
+	touch text_generation_server/pb/__init__.py
+
+# Install the server in editable mode after generating the stubs.
+# requirements.txt is installed with --no-deps, so it must list every package needed.
+install: gen-server
+	pip install pip --upgrade
+	pip install --no-deps -r requirements.txt
+	pip install -e "."
+
+# Launch a 2-process sharded development server on bigscience/bloom-560m.
+run-dev:
+	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
+
+install-poetry:
+	curl -sSL https://install.python-poetry.org | python3 -
+
+# Regenerate poetry.lock from scratch; -f so a missing lock file is not an error.
+update-lock:
+	rm -f poetry.lock
+	poetry lock --no-update
+
+# Export the locked dependencies to requirements.txt (consumed by `install`).
+export-requirements:
+	poetry export -o requirements.txt --without-hashes
--- a/backends/gaudi/server/Makefile-awq
+++ b/backends/gaudi/server/Makefile-awq
@ -0,0 +1,15 @@
+# Fork that adds only the correct stream to this kernel in order
+# to make cuda graphs work.
+# The fork is pinned to this commit, which build-awq checks out.
+awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4
+
+# Remove any existing checkout and clone the fork afresh into llm-awq/.
+awq:
+	rm -rf llm-awq
+	git clone https://github.com/huggingface/llm-awq
+
+# Check out the pinned commit and build the kernels under awq/kernels.
+build-awq: awq
+	cd llm-awq/ && git fetch && git checkout $(awq_commit)
+	cd llm-awq/awq/kernels && python setup.py build
+
+# Uninstall any previously installed awq_inference_engine (ignoring failure),
+# then install the freshly built kernels.
+install-awq: build-awq
+	pip uninstall awq_inference_engine -y || true
+	cd llm-awq/awq/kernels && python setup.py install
--- a/backends/gaudi/server/Makefile-eetq
+++ b/backends/gaudi/server/Makefile-eetq
@ -0,0 +1,13 @@
+# EETQ commit checked out by build-eetq.
+eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0
+
+# The clone lands in a directory named eetq, i.e. the same name as this target,
+# so make only runs this recipe while that directory does not exist.
+eetq:
+    # Clone eetq
+	pip install packaging
+	git clone https://github.com/NetEase-FuXi/EETQ.git eetq
+
+# Check out the pinned commit, sync its submodules, and build.
+build-eetq: eetq
+	cd eetq && git fetch && git checkout $(eetq_commit) && git submodule update --init --recursive
+	cd eetq && python setup.py build
+
+# Install the build produced by build-eetq.
+install-eetq: build-eetq
+	cd eetq && python setup.py install
--- a/backends/gaudi/server/Makefile-fbgemm
+++ b/backends/gaudi/server/Makefile-fbgemm
@ -0,0 +1,15 @@
+# Git ref (a release tag) of FBGEMM checked out by build-fbgemm.
+fbgemm_commit := v0.8.0
+
+# Clone FBGEMM into fbgemm/ if it is not there yet, check out the pinned ref,
+# sync submodules, install fbgemm_gpu's requirements and build the "genai"
+# package variant for CUDA architectures 8.0 and 9.0a.
+build-fbgemm:
+	@if [ ! -d "fbgemm" ]; then \
+		git clone https://github.com/pytorch/FBGEMM.git fbgemm; \
+	fi
+	cd fbgemm && git fetch && git checkout $(fbgemm_commit)  && \
+	git submodule update --init --recursive && \
+	cd fbgemm_gpu && \
+	pip install -r requirements.txt && \
+	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai build
+
+# Install fbgemm_gpu using the same architecture settings as the build step.
+install-fbgemm: build-fbgemm
+	cd fbgemm/fbgemm_gpu &&  \
+	CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py --package_variant genai install
--- a/backends/gaudi/server/Makefile-flash-att
+++ b/backends/gaudi/server/Makefile-flash-att
@ -0,0 +1,12 @@
+# flash-attention commit checked out by the build and install targets.
+flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec
+
+# Clone flash-attention if the directory is missing (installing packaging and
+# ninja first), check out the pinned commit, then build the main package
+# followed by the csrc/layer_norm and csrc/rotary extensions.
+build-flash-attention:
+	if [ ! -d 'flash-attention' ]; then \
+		pip install -U packaging ninja  --no-cache-dir && \
+		git clone https://github.com/HazyResearch/flash-attention.git; \
+	fi
+	cd flash-attention && git fetch && git checkout $(flash_att_commit) && \
+	MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build
+
+# Install the main package and the layer_norm and rotary extensions, in that order.
+install-flash-attention: build-flash-attention
+	cd flash-attention && git checkout $(flash_att_commit) && MAX_JOBS=8 python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
--- a/backends/gaudi/server/Makefile-flash-att-v2
+++ b/backends/gaudi/server/Makefile-flash-att-v2
@ -0,0 +1,21 @@
+# Despite its name, the CUDA value is a release tag that is passed to pip as the
+# flash-attn version; the ROCm value is a git commit checked out from a fork.
+flash_att_v2_commit_cuda := v2.6.1
+flash_att_v2_commit_rocm := 2092111b9f975b3347c652ff7fabd431130256c4
+
+# CUDA: nothing is compiled from a local checkout; flash-attn is installed from pip.
+build-flash-attention-v2-cuda:
+	pip install -U packaging wheel
+	pip install flash-attn==$(flash_att_v2_commit_cuda)
+
+# The pip install above already did the work; this target only reports it.
+install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
+	echo "Flash v2 installed"
+
+# ROCm: clone, check out and build only when flash-attention-v2/ does not exist yet;
+# an existing directory is left untouched.
+build-flash-attention-v2-rocm:
+	if [ ! -d 'flash-attention-v2' ]; then \
+		pip install -U packaging ninja  --no-cache-dir && \
+		git clone https://github.com/mht-sharma/flash-attention.git flash-attention-v2 && \
+		cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) && \
+		git submodule update --init --recursive && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \
+	fi
+
+# Install the ROCm build for the same GPU architectures used when building.
+install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
+	cd flash-attention-v2 &&  \
+	GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install
--- a/backends/gaudi/server/Makefile-selective-scan
+++ b/backends/gaudi/server/Makefile-selective-scan
@ -0,0 +1,28 @@
+# mamba commit checked out by build-selective-scan.
+selective_scan_commit := 2a3704fd47ba817b415627b06fd796b971fdc137
+
+# The clone creates a directory with the same name as this target, so make
+# only runs the recipe while causal-conv1d/ does not exist.
+causal-conv1d:
+	rm -rf causal-conv1d
+	git clone https://github.com/Dao-AILab/causal-conv1d.git
+
+# Check out the pinned tag and force a build from source.
+build-causal-conv1d: causal-conv1d
+	cd causal-conv1d/ && git checkout v1.1.1 # known latest working version tag
+	cd causal-conv1d/ && CAUSAL_CONV1D_FORCE_BUILD=TRUE python setup.py build
+
+# Replace any installed causal-conv1d with the local build.
+install-causal-conv1d: build-causal-conv1d
+	pip uninstall causal-conv1d -y || true
+	cd causal-conv1d/ && pip install .
+
+# selective-scan depends on causal-conv1d
+# No file named selective-scan is created, so this recipe runs every time:
+# the mamba/ checkout is deleted and cloned again.
+selective-scan:
+	rm -rf mamba
+	git clone https://github.com/state-spaces/mamba.git mamba
+
+# Check out the pinned mamba commit and build it.
+build-selective-scan: selective-scan
+	cd mamba/ && git fetch && git checkout $(selective_scan_commit)
+	cd mamba && python setup.py build
+
+# Install causal-conv1d first, then replace any installed selective-scan-cuda
+# with the local mamba build.
+install-selective-scan: install-causal-conv1d build-selective-scan
+	pip uninstall selective-scan-cuda -y || true
+	cd mamba && pip install .
+
+# Convenience target: build both components without installing them.
+build-all: build-causal-conv1d build-selective-scan
--- a/backends/gaudi/server/Makefile-vllm
+++ b/backends/gaudi/server/Makefile-vllm
@ -0,0 +1,23 @@
+# Pinned commits of the two vLLM forks used below (CUDA and ROCm respectively).
+# Both variants clone into the same vllm/ directory and skip the clone when it
+# already exists, so an existing checkout of one fork is reused for the other.
+commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
+commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
+# CUDA: clone the fork if vllm/ is missing, check out commit_cuda and build.
+build-vllm-cuda:
+	if [ ! -d 'vllm' ]; then \
+		pip install -U ninja packaging --no-cache-dir && \
+		git clone https://github.com/Narsil/vllm.git vllm; \
+	fi
+	cd vllm  && git fetch origin && git checkout $(commit_cuda) && python setup.py build
+
+# CUDA: editable install of the pinned commit.
+install-vllm-cuda: build-vllm-cuda
+	cd vllm  && git fetch origin && git checkout $(commit_cuda) && pip install -e .
+
+# ROCm: clone the fork if vllm/ is missing, check out commit_rocm and build
+# for the gfx90a and gfx942 architectures.
+build-vllm-rocm:
+	if [ ! -d 'vllm' ]; then \
+		pip install -U ninja packaging --no-cache-dir && \
+		git clone https://github.com/mht-sharma/vllm.git vllm; \
+	fi
+	cd vllm && git fetch && git checkout $(commit_rocm) &&  \
+	PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
+
+# ROCm: editable install of the pinned commit for the same architectures.
+install-vllm-rocm: build-vllm-rocm
+	cd vllm && git fetch && git checkout $(commit_rocm) && \
+	PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .
--- a/backends/gaudi/server/README.md
+++ b/backends/gaudi/server/README.md
@ -0,0 +1,15 @@
+# Text Generation Inference Python gRPC Server
+
+A Python gRPC server for Text Generation Inference
+
+## Install
+
+```shell
+make install
+```
+
+## Run
+
+```shell
+make run-dev
+```
--- a/backends/gaudi/server/dill-0.3.7-patch.sh
+++ b/backends/gaudi/server/dill-0.3.7-patch.sh
@ -0,0 +1,91 @@
+#!/bin/bash
+git clone -b dill-0.3.7 https://github.com/uqfoundation/dill.git
+pushd dill
+cat <<EOF > dill-0.3.7.patch
+diff --git a/dill/_dill.py b/dill/_dill.py
+index d0cf543..f6eb662 100644
+--- a/dill/_dill.py
+++ b/dill/_dill.py
+@@ -69,7 +69,15 @@ TypeType = type # 'new-style' classes #XXX: unregistered
+ XRangeType = range
+ from types import MappingProxyType as DictProxyType, new_class
+ from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, PickleError, PicklingError, UnpicklingError
+-import __main__ as _main_module
+class _LazyMainModule(object):
+    _module = None
+    @property
+    def module(self):
+        if self._module is None:
+            import __main__ as _m_module
+            self._module = _m_module
+        return self._module
+_main_module = _LazyMainModule()
+ import marshal
+ import gc
+ # import zlib
+@@ -353,7 +361,7 @@ class Pickler(StockPickler):
+         _fmode = kwds.pop('fmode', None)
+         _recurse = kwds.pop('recurse', None)
+         StockPickler.__init__(self, file, *args, **kwds)
+-        self._main = _main_module
+        self._main = _main_module.module
+         self._diff_cache = {}
+         self._byref = settings['byref'] if _byref is None else _byref
+         self._strictio = False #_strictio
+@@ -435,12 +443,12 @@ class Unpickler(StockUnpickler):
+         settings = Pickler.settings
+         _ignore = kwds.pop('ignore', None)
+         StockUnpickler.__init__(self, *args, **kwds)
+-        self._main = _main_module
+        self._main = _main_module.module
+         self._ignore = settings['ignore'] if _ignore is None else _ignore
+
+     def load(self): #NOTE: if settings change, need to update attributes
+         obj = StockUnpickler.load(self)
+-        if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
+        if type(obj).__module__ == getattr(self._main, '__name__', '__main__'):
+             if not self._ignore:
+                 # point obj class to main
+                 try: obj.__class__ = getattr(self._main, type(obj).__name__)
+@@ -1194,11 +1202,11 @@ def save_module_dict(pickler, obj):
+         logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj
+         pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8'))
+         logger.trace(pickler, "# D1")
+-    elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__):
+    elif (not is_dill(pickler, child=False)) and (obj == _main_module.module.__dict__):
+         logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj
+         pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8'))  #XXX: works in general?
+         logger.trace(pickler, "# D3")
+-    elif '__name__' in obj and obj != _main_module.__dict__ \\
+    elif '__name__' in obj and obj != _main_module.module.__dict__ \\
+             and type(obj['__name__']) is str \\
+             and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None):
+         logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj
+diff --git a/dill/session.py b/dill/session.py
+index 74234ab..1be8d89 100644
+--- a/dill/session.py
+++ b/dill/session.py
+@@ -233,7 +233,7 @@ def dump_module(
+     protocol = settings['protocol']
+     main = module
+     if main is None:
+-        main = _main_module
+        main = _main_module.module
+     elif isinstance(main, str):
+         main = _import_module(main)
+     if not isinstance(main, ModuleType):
+@@ -501,7 +501,7 @@ def load_module(
+             pass
+     assert loaded is main
+     _restore_modules(unpickler, main)
+-    if main is _main_module or main is module:
+    if main is _main_module.module or main is module:
+         return None
+     else:
+         return main
+
+EOF
+git apply dill-0.3.7.patch
+python -m pip install .
+popd
+rm -fr dill
--- a/backends/gaudi/server/dill-0.3.8-patch.sh
+++ b/backends/gaudi/server/dill-0.3.8-patch.sh
@ -0,0 +1,91 @@
+#!/bin/bash
+# Builds and installs dill 0.3.8 from a fresh clone with a local patch applied
+# (the patch text is embedded in the heredoc below).
+#
+# Abort on the first failing command: without this, a failed clone or a patch
+# that does not apply would still run `pip install .` and the script would
+# exit 0 because its last command is `rm -fr`.
+set -e
+git clone -b 0.3.8 https://github.com/uqfoundation/dill.git
+pushd dill
+cat <<EOF > dill-0.3.8.patch
+diff --git a/dill/_dill.py b/dill/_dill.py
+index d42432f..1d251e6 100644
+--- a/dill/_dill.py
+++ b/dill/_dill.py
+@@ -69,7 +69,15 @@ TypeType = type # 'new-style' classes #XXX: unregistered
+ XRangeType = range
+ from types import MappingProxyType as DictProxyType, new_class
+ from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, PickleError, PicklingError, UnpicklingError
+-import __main__ as _main_module
+class _LazyMainModule(object):
+    _module = None
+    @property
+    def module(self):
+        if self._module is None:
+            import __main__ as _m_module
+            self._module = _m_module
+        return self._module
+_main_module = _LazyMainModule()
+ import marshal
+ import gc
+ # import zlib
+@@ -355,7 +363,7 @@ class Pickler(StockPickler):
+         _fmode = kwds.pop('fmode', None)
+         _recurse = kwds.pop('recurse', None)
+         StockPickler.__init__(self, file, *args, **kwds)
+-        self._main = _main_module
+        self._main = _main_module.module
+         self._diff_cache = {}
+         self._byref = settings['byref'] if _byref is None else _byref
+         self._strictio = False #_strictio
+@@ -437,12 +445,12 @@ class Unpickler(StockUnpickler):
+         settings = Pickler.settings
+         _ignore = kwds.pop('ignore', None)
+         StockUnpickler.__init__(self, *args, **kwds)
+-        self._main = _main_module
+        self._main = _main_module.module
+         self._ignore = settings['ignore'] if _ignore is None else _ignore
+
+     def load(self): #NOTE: if settings change, need to update attributes
+         obj = StockUnpickler.load(self)
+-        if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
+        if type(obj).__module__ == getattr(self._main, '__name__', '__main__'):
+             if not self._ignore:
+                 # point obj class to main
+                 try: obj.__class__ = getattr(self._main, type(obj).__name__)
+@@ -1199,11 +1207,11 @@ def save_module_dict(pickler, obj):
+         logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj
+         pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8'))
+         logger.trace(pickler, "# D1")
+-    elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__):
+    elif (not is_dill(pickler, child=False)) and (obj == _main_module.module.__dict__):
+         logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj
+         pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8'))  #XXX: works in general?
+         logger.trace(pickler, "# D3")
+-    elif '__name__' in obj and obj != _main_module.__dict__ \\
+    elif '__name__' in obj and obj != _main_module.module.__dict__ \\
+             and type(obj['__name__']) is str \\
+             and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None):
+         logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj
+diff --git a/dill/session.py b/dill/session.py
+index e91068a..a921b43 100644
+--- a/dill/session.py
+++ b/dill/session.py
+@@ -233,7 +233,7 @@ def dump_module(
+     protocol = settings['protocol']
+     main = module
+     if main is None:
+-        main = _main_module
+        main = _main_module.module
+     elif isinstance(main, str):
+         main = _import_module(main)
+     if not isinstance(main, ModuleType):
+@@ -501,7 +501,7 @@ def load_module(
+             pass
+     assert loaded is main
+     _restore_modules(unpickler, main)
+-    if main is _main_module or main is module:
+    if main is _main_module.module or main is module:
+         return None
+     else:
+         return main
+
+EOF
+# Apply the patch file written by the heredoc above to the cloned checkout.
+git apply dill-0.3.8.patch
+# Install the (patched) checkout with the `python` found on PATH.
+python -m pip install .
+# Leave the checkout and delete it; -f keeps rm from failing if it is missing.
+popd
+rm -fr dill
--- a/backends/gaudi/server/poetry.lock
+++ b/backends/gaudi/server/poetry.lock
--- a/backends/gaudi/server/pyproject.toml
+++ b/backends/gaudi/server/pyproject.toml
@ -0,0 +1,44 @@
+# Poetry project definition for the text-generation-server Python package.
+[tool.poetry]
+name = "text-generation-server"
+version = "2.0.4"
+description = "Text Generation Inference Python gRPC Server"
+authors = ["Olivier Dehaene <olivier@huggingface.co>"]
+
+# Console entry point: the `text-generation-server` command resolves to the
+# `app` object in the `text_generation_server.cli` module.
+[tool.poetry.scripts]
+text-generation-server = 'text_generation_server.cli:app'
+
+# Runtime dependencies.
+[tool.poetry.dependencies]
+python = ">=3.9,<3.13"
+protobuf = "^5.0"
+grpcio = "^1.71.1"
+grpcio-status = "*"
+grpcio-reflection = "*"
+grpc-interceptor = "^0.15.0"
+typer = "^0.15.0"
+loguru = "^0.7.3"
+opentelemetry-api = "^1.32.0"
+opentelemetry-exporter-otlp = "^1.32.0"
+opentelemetry-instrumentation-grpc = "^0.53b0"
+hf-transfer = "^0.1.9"
+sentencepiece = "^0.2.0"
+peft = "^0.15"
+transformers = "^4.52.4"
+numpy = "^1.26"
+accelerate = "^1.7.0"
+# NOTE(review): declared optional, but this file has no [tool.poetry.extras]
+# table that would expose it — confirm how it is meant to be installed.
+outlines= { version = "^0.0.36", optional = true }
+prometheus-client = "^0.21.1"
+py-cpuinfo = "^9.0.0"
+
+# Development-only dependencies (the `dev` dependency group).
+[tool.poetry.group.dev.dependencies]
+grpcio-tools = "*"
+pytest = "^8.3.5"
+
+# Registers the `private` pytest marker so that tests needing an admin HF token
+# can be deselected with `-m "not private"`.
+[tool.pytest.ini_options]
+markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+# Poetry plugins this project requires.
+[tool.poetry.requires-plugins]
+poetry-plugin-export = ">=1.8"
--- a/backends/gaudi/server/requirements.txt
+++ b/backends/gaudi/server/requirements.txt
@ -0,0 +1,86 @@
+accelerate==1.7.0 ; python_version >= "3.9" and python_version < "3.13"
+annotated-types==0.7.0 ; python_version >= "3.9" and python_version < "3.13"
+attrs==25.3.0 ; python_version >= "3.9" and python_version < "3.13"
+certifi==2025.1.31 ; python_version >= "3.9" and python_version < "3.13"
+charset-normalizer==3.4.1 ; python_version >= "3.9" and python_version < "3.13"
+click==8.1.8 ; python_version >= "3.9" and python_version < "3.13"
+cloudpickle==3.1.1 ; python_version >= "3.9" and python_version < "3.13"
+colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Windows" or python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
+deprecated==1.2.18 ; python_version >= "3.9" and python_version < "3.13"
+diffusers==0.31.0 ; python_version >= "3.9" and python_version < "3.13"
+diskcache==5.6.3 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.18.0 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2025.3.2 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.70.0 ; python_version >= "3.9" and python_version < "3.13"
+grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.71.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.71.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.72.0rc1 ; python_version >= "3.9" and python_version < "3.13"
+hf-transfer==0.1.9 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.30.2 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.10 ; python_version >= "3.9" and python_version < "3.13"
+importlib-metadata==8.6.1 ; python_version >= "3.9" and python_version < "3.13"
+interegular==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
+jinja2==3.1.6 ; python_version >= "3.9" and python_version < "3.13"
+joblib==1.4.2 ; python_version >= "3.9" and python_version < "3.13"
+jsonschema-specifications==2024.10.1 ; python_version >= "3.9" and python_version < "3.13"
+jsonschema==4.23.0 ; python_version >= "3.9" and python_version < "3.13"
+lark==1.2.2 ; python_version >= "3.9" and python_version < "3.13"
+llvmlite==0.43.0 ; python_version >= "3.9" and python_version < "3.13"
+loguru==0.7.3 ; python_version >= "3.9" and python_version < "3.13"
+markdown-it-py==3.0.0 ; python_version >= "3.9" and python_version < "3.13"
+markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "3.13"
+mdurl==0.1.2 ; python_version >= "3.9" and python_version < "3.13"
+mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
+nest-asyncio==1.6.0 ; python_version >= "3.9" and python_version < "3.13"
+networkx==3.2.1 ; python_version >= "3.9" and python_version < "3.13"
+numba==0.60.0 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-api==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-common==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-grpc==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-http==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation-grpc==0.53b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation==0.53b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-proto==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-sdk==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-semantic-conventions==0.53b0 ; python_version >= "3.9" and python_version < "3.13"
+optimum==1.24.0 ; python_version >= "3.9" and python_version < "3.13"
+outlines==0.0.36 ; python_version >= "3.9" and python_version < "3.13"
+packaging==24.2 ; python_version >= "3.9" and python_version < "3.13"
+peft==0.15.1 ; python_version >= "3.9" and python_version < "3.13"
+pillow==11.2.1 ; python_version >= "3.9" and python_version < "3.13"
+prometheus-client==0.21.1 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==5.29.4 ; python_version >= "3.9" and python_version < "3.13"
+psutil==7.0.0 ; python_version >= "3.9" and python_version < "3.13"
+py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
+pydantic-core==2.33.1 ; python_version >= "3.9" and python_version < "3.13"
+pydantic==2.11.3 ; python_version >= "3.9" and python_version < "3.13"
+pygments==2.19.1 ; python_version >= "3.9" and python_version < "3.13"
+pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13"
+referencing==0.36.2 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.11.6 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13"
+rich==14.0.0 ; python_version >= "3.9" and python_version < "3.13"
+rpds-py==0.24.0 ; python_version >= "3.9" and python_version < "3.13"
+safetensors==0.5.3 ; python_version >= "3.9" and python_version < "3.13"
+scikit-learn==1.6.1 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
+sentence-transformers==3.3.1 ; python_version >= "3.9" and python_version < "3.13"
+sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==78.1.0 ; python_version >= "3.12" and python_version < "3.13"
+shellingham==1.5.4 ; python_version >= "3.9" and python_version < "3.13"
+sympy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
+threadpoolctl==3.6.0 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.21.1 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.67.1 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.52.4 ; python_version >= "3.9" and python_version < "3.13"
+triton==3.2.0 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64"
+typer==0.15.2 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.13.2 ; python_version >= "3.9" and python_version < "3.13"
+typing-inspection==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.4.0 ; python_version >= "3.9" and python_version < "3.13"
+win32-setctime==1.2.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
+wrapt==1.17.2 ; python_version >= "3.9" and python_version < "3.13"
+zipp==3.21.0 ; python_version >= "3.9" and python_version < "3.13"
--- a/backends/gaudi/server/text_generation_server/init.py
+++ b/backends/gaudi/server/text_generation_server/init.py
--- a/backends/gaudi/server/text_generation_server/adapters/init.py
+++ b/backends/gaudi/server/text_generation_server/adapters/init.py
@ -0,0 +1,13 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/adapters/__init__.py
+# License:  Apache License Version 2.0, January 2004
+
+from text_generation_server.adapters.weights import (
+    AdapterBatchData,
+    AdapterBatchMetadata,
+)
+
+__all__ = [
+    "AdapterBatchData",
+    "AdapterBatchMetadata",
+]
--- a/backends/gaudi/server/text_generation_server/adapters/config.py
+++ b/backends/gaudi/server/text_generation_server/adapters/config.py
@ -0,0 +1,30 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/adapters/config.py
+# License:  Apache License Version 2.0, January 2004
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Dict, Set, Tuple
+
+import torch
+
+from text_generation_server.adapters.weights import AdapterWeights
+
+
+@dataclass
+class ModuleMap:
+    # Pairs a module name with the adapter tensors recorded for that module.
+    # Name of the module these weights belong to.
+    module_name: str
+    # Each value is a (tensor, str) pair, per the annotation.
+    # NOTE(review): the meaning of the key and of the str is not shown in this
+    # file; presumably a weight kind and its full weight name — confirm.
+    module_weights: Dict[str, Tuple[torch.Tensor, str]]
+
+
+@dataclass
+class AdapterConfig(ABC):
+    # Abstract base for adapter configurations; concrete configurations must
+    # implement map_weights_for_model.
+    # Presumably the name or path of the base model the adapter targets, judging
+    # by the field name — TODO confirm.
+    base_model_name_or_path: str
+
+    @abstractmethod
+    def map_weights_for_model(
+        self,
+        adapter_weights: Dict[int, AdapterWeights],
+        weight_names: Tuple[str],
+    ) -> Tuple[ModuleMap, Set[str]]:
+        # Implementations pick out of `adapter_weights` the entries that belong to
+        # `weight_names` and return them together with the set of adapter weight
+        # names that were matched.
+        # NOTE(review): the annotations say Dict[int, AdapterWeights] and ModuleMap,
+        # but LoraConfig.map_weights_for_model (adapters/lora.py) looks entries up
+        # by string name and returns a plain dict — confirm which is intended.
+        pass
--- a/backends/gaudi/server/text_generation_server/adapters/lora.py
+++ b/backends/gaudi/server/text_generation_server/adapters/lora.py
@ -0,0 +1,471 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/adapters/lora.py
+# License:  Apache License Version 2.0, January 2004
+
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set, Tuple, Type, Union
+
+import torch
+from peft import LoraConfig as _LoraConfig
+from torch.distributed import ProcessGroup
+
+from text_generation_server.adapters.config import AdapterConfig, ModuleMap
+
+from text_generation_server.adapters.weights import (
+    AdapterBatchMetadata,
+    AdapterWeights,
+    BatchAdapterWeights,
+)
+from text_generation_server.utils.sgmv import (
+    BGMV_MAX_RANK,
+    MAX_RANK_CUSTOM,
+    get_tmp_tensors,
+    orient_for_rank,
+    pad_rank,
+    use_cutlass_shrink,
+)
+
+
+def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
+    """Return the ``(start, stop)`` index pair of the block owned by ``rank``.
+
+    ``size`` elements beginning at ``offset`` are cut into ``world_size`` blocks
+    of ``size // world_size`` elements each; because of the floor division any
+    remainder at the end of the range is not assigned to a rank.
+    """
+    per_rank = size // world_size
+    first = offset + per_rank * rank
+    return first, first + per_rank
+
+
+def shard_on_dim(
+    t: torch.Tensor, dim: int, process_group: torch.distributed.ProcessGroup
+):
+    """Return the slice of ``t`` along ``dim`` that belongs to this process.
+
+    The slice bounds come from ``get_start_stop_idxs_for_rank`` using the rank
+    and size reported by ``process_group``.
+
+    Raises:
+        NotImplementedError: if ``dim`` is neither 0 nor 1.
+    """
+    num_shards = process_group.size()
+    shard_id = process_group.rank()
+
+    begin, end = get_start_stop_idxs_for_rank(0, t.shape[dim], shard_id, num_shards)
+
+    # Only the first two dimensions are supported.
+    if dim == 0:
+        return t[begin:end]
+    if dim == 1:
+        return t[:, begin:end]
+    raise NotImplementedError("Let's make that generic when needed")
+
+
+def shard_lora_weights(
+    weights_a: List[torch.Tensor],
+    weights_b: List[torch.Tensor],
+    split_dim: int,
+    process_group: ProcessGroup,
+) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+    """Shard every LoRA A and B matrix for the current process.
+
+    A matrices (noted as ``[hidden_size, r]`` by the original author) are split
+    along ``split_dim``; B matrices (``[r, hidden_size]``) are always split
+    along dim 1.
+
+    Returns:
+        The sharded A list and the sharded B list, in input order.
+    """
+    sharded_a = []
+    for matrix in weights_a:
+        sharded_a.append(
+            shard_on_dim(matrix, dim=split_dim, process_group=process_group)
+        )
+
+    sharded_b = []
+    for matrix in weights_b:
+        sharded_b.append(shard_on_dim(matrix, dim=1, process_group=process_group))
+
+    return sharded_a, sharded_b
+
+
+@dataclass
+class LoraConfig(AdapterConfig):
+    """LoRA adapter settings copied from a peft ``LoraConfig``."""
+
+    r: int
+    target_modules: Optional[Union[List[str], str]]
+    fan_in_fan_out: bool
+    lora_alpha: int
+    use_rslora: bool
+
+    def map_weights_for_model(
+        self,
+        adapter_weights: Dict[int, AdapterWeights],
+        weight_names: Tuple[str],
+    ) -> Tuple[ModuleMap, Set[str]]:
+        """Collect the lora_A / lora_B tensors available for ``weight_names``.
+
+        A weight name is kept only when both of its LoRA tensors are present in
+        ``adapter_weights``.
+
+        Returns:
+            A dict from weight name to its ``"lora_A"`` / ``"lora_B"``
+            ``(tensor, full_name)`` pairs, and the set of full names used.
+        """
+        matched_names = set()
+        mapping = {}
+        for weight_name in weight_names:
+            a_key = f"base_model.model.{weight_name}.lora_A.weight"
+            b_key = f"base_model.model.{weight_name}.lora_B.weight"
+            if not (a_key in adapter_weights and b_key in adapter_weights):
+                # The adapter does not provide a complete A/B pair for this weight.
+                continue
+
+            mapping[weight_name] = {
+                "lora_A": (adapter_weights[a_key], a_key),
+                "lora_B": (adapter_weights[b_key], b_key),
+            }
+            matched_names.update((a_key, b_key))
+        return mapping, matched_names
+
+    @classmethod
+    def load(cls, adapter_id: str, api_token: str) -> "LoraConfig":
+        """Build a ``LoraConfig`` from the peft config stored for ``adapter_id``."""
+        peft_config = _LoraConfig.from_pretrained(adapter_id, token=api_token)
+        fields = {
+            "base_model_name_or_path": peft_config.base_model_name_or_path,
+            "r": peft_config.r,
+            "target_modules": peft_config.target_modules,
+            "fan_in_fan_out": peft_config.fan_in_fan_out,
+            "lora_alpha": peft_config.lora_alpha,
+            # Fall back to False when the loaded config has no such attribute.
+            "use_rslora": getattr(peft_config, "use_rslora", False),
+        }
+        return cls(**fields)
+
+
+class LoraWeights(AdapterWeights):
+    """LoRA weights for a single adapter merged across all layers."""
+
+    def __init__(
+        self,
+        weights_a: List[torch.Tensor],
+        weights_b: List[torch.Tensor],
+        adapter_config: LoraConfig,
+    ):
+        """Stack the per-layer LoRA A and B matrices of one adapter.
+
+        Args:
+            weights_a: One A matrix per layer; its rank is read from dim 1.
+            weights_b: One B matrix per layer; its rank is read from dim 0.
+            adapter_config: The configuration these weights were prepared with.
+        """
+        self.lora_a_r = weights_a[0].size(1) if len(weights_a) > 0 else 1
+        # Guard on weights_b itself (the original tested weights_a here) so the
+        # check matches the list that is actually indexed.
+        self.lora_b_r = weights_b[0].size(0) if len(weights_b) > 0 else 1
+
+        self._use_cutlass_shrink = use_cutlass_shrink(self.lora_a_r)
+        # Tracks which orientation _weights_a/_weights_b currently hold.
+        self._is_transposed = False
+
+        # [num_layers, hidden_size, r]
+        weights_a = [orient_for_rank(w, w.size(1)).contiguous() for w in weights_a]
+        self._weights_a = torch.stack(weights_a)
+
+        # [num_layers, r, hidden_size]
+        self._weights_b = torch.stack(weights_b)
+
+        self.adapter_config = adapter_config
+
+    @property
+    def weights_a(self) -> torch.Tensor:
+        """Stacked A weights in the non-transposed orientation."""
+        if self._is_transposed:
+            self._transpose_weights()
+        return self._weights_a
+
+    @property
+    def weights_b(self) -> torch.Tensor:
+        """Stacked B weights in the non-transposed orientation."""
+        if self._is_transposed:
+            self._transpose_weights()
+        return self._weights_b
+
+    @property
+    def weights_a_t(self) -> torch.Tensor:
+        """Stacked A weights in the transposed orientation."""
+        if not self._is_transposed:
+            self._transpose_weights()
+        return self._weights_a
+
+    @property
+    def weights_b_t(self) -> torch.Tensor:
+        """Stacked B weights in the transposed orientation."""
+        if not self._is_transposed:
+            self._transpose_weights()
+        return self._weights_b
+
+    def _transpose_weights(self):
+        """Flip the stored weights between the two orientations in place.
+
+        Both stored tensors are replaced by new contiguous tensors, so tensors
+        handed out earlier by the properties keep their old orientation.
+        """
+        if self._use_cutlass_shrink:
+            # A is only re-oriented when the cutlass shrink is in use; per the
+            # original note, without it SGMV and BGMV use the same orientation.
+            self._weights_a = self._weights_a.transpose(1, 2).contiguous()
+        self._weights_b = self._weights_b.transpose(1, 2).contiguous()
+        self._is_transposed = not self._is_transposed
+
+    @classmethod
+    def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]:
+        """Return the batch weight classes that can hold these weights."""
+        return [BatchLoraWeights]
+
+    @classmethod
+    def prepare_weights(
+        cls,
+        config: LoraConfig,
+        module_map: Dict[str, Dict],
+        layer_type: str,
+        unused_weight_names: Set[str],
+        nlayers: int,
+        dtype: torch.dtype,
+        world_size: int,
+        process_group: ProcessGroup,
+        target_to_layer: Dict[str, Tuple[str, torch.Tensor]],
+    ) -> Optional[AdapterWeights]:
+        """Prepare pre-loaded LoRA weights of one layer type for use in the model.
+
+        For each of the ``nlayers`` layers the A/B tensors are looked up in
+        ``module_map`` via ``target_to_layer[(layer_id, layer_type)]``, moved to
+        the base layer's device and ``dtype``, transposed, and B is multiplied
+        by the LoRA scaling factor. The per-layer tensors are then rank-padded
+        for SGMV/BGMV compatibility and sharded over ``process_group``.
+
+        Side effects:
+            Names of the tensors that were used are discarded from
+            ``unused_weight_names``, and ``config.r`` is overwritten with the
+            padded rank.
+
+        Returns:
+            The prepared ``LoraWeights``, or ``None`` as soon as one layer has
+            no entry in ``module_map`` for this layer type.
+        """
+        lora_a_list = [None] * nlayers
+        lora_b_list = [None] * nlayers
+
+        for layer_id in range(nlayers):
+            key = (layer_id, layer_type)
+            weight_name, layer = target_to_layer[key]
+            base_weight = layer.base_layer.linear.weight
+            base_device = base_weight.device
+
+            if weight_name not in module_map:
+                # There is no LoRA weight for this layer type in the adapter
+                return None
+
+            lora_a, lora_a_name = module_map[weight_name]["lora_A"]
+            lora_a = lora_a.to(base_device, dtype)
+
+            lora_b, lora_b_name = module_map[weight_name]["lora_B"]
+            lora_b = lora_b.to(base_device, dtype)
+
+            # NOTE(review): config.r is overwritten with the padded rank further
+            # down, so a later call that reuses the same config would scale by
+            # the padded rank — confirm this is intended.
+            scale = get_scaling_factor(
+                config.lora_alpha,
+                config.r,
+                uses_rslora=config.use_rslora,
+            )
+
+            unused_weight_names.discard(lora_a_name)
+            unused_weight_names.discard(lora_b_name)
+
+            # Merge scaling factor into lora_b due to associativity of matrix multiplication:
+            # (A * B) * C = A * (B * C)
+            lora_a_list[layer_id] = lora_a.transpose(0, 1)
+            lora_b_list[layer_id] = lora_b.transpose(0, 1) * scale
+
+        # pad lora ranks to be compatible with sgmv
+        lora_a_list = [pad_rank(w, dim=1, world_size=world_size) for w in lora_a_list]
+        lora_b_list = [pad_rank(w, dim=0, world_size=world_size) for w in lora_b_list]
+
+        if lora_a_list:
+            # update rank if it was padded
+            padded_rank = lora_a_list[0].size(1)
+            config.r = padded_rank
+
+        return LoraWeights(
+            *shard_lora_weights(
+                weights_a=lora_a_list,
+                weights_b=lora_b_list,
+                split_dim=0 if layer_type in {"o_proj", "down_proj", "lm_head"} else 1,
+                process_group=process_group,
+            ),
+            config,
+        )
+
+
+@dataclass
+class RankSegments:
+    # Per-rank bundle built by BatchLoraWeights.load: the data recorded for the
+    # segments whose adapters share one LoRA rank.
+    rank: int
+
+    # Entries of the batch-wide pointer tensors (Tensor.data_ptr() values, or 0)
+    # selected for this rank's segments.
+    lora_a_ptr: torch.Tensor
+    lora_b_ptr: torch.Tensor
+
+    # prefill (sgmv)
+    # BatchLoraWeights.load leaves these four as None when SGMV is not used.
+    tmp_shrink: torch.Tensor
+    tmp_expand: torch.Tensor
+    segment_starts: torch.Tensor
+    segment_ends: torch.Tensor
+
+    # decode (bgmv)
+    # Left as None by BatchLoraWeights.load when SGMV is used; otherwise one entry
+    # per element of meta.adapter_indices holding its segment index, or -1 when
+    # that segment does not belong to this rank.
+    indices: torch.Tensor
+
+
+@dataclass
+class BatchLoraWeights(BatchAdapterWeights):
+    """LoRA weights and kernel metadata assembled for one batch."""
+
+    # Stacked A / B weights keyed by adapter index.
+    lora_a: Dict[int, torch.Tensor]
+    lora_b: Dict[int, torch.Tensor]
+    # Adapter configuration keyed by adapter index.
+    adapter_index_configs: Dict[int, LoraConfig]
+    # Per-rank segment data keyed by LoRA rank.
+    rank_data: Dict[int, RankSegments]
+    # True when the pointers and segments were prepared for SGMV.
+    use_sgmv: bool
+
+    def has_adapter(self, adapter_index: int) -> bool:
+        """Return True if this batch holds LoRA weights for ``adapter_index``."""
+        return adapter_index in self.adapter_index_configs
+
+    def can_vectorize(self, pg: ProcessGroup) -> bool:
+        """Return True if every rank, divided by ``pg.size()``, is at most MAX_RANK_CUSTOM."""
+        return all(
+            rank_data.rank // pg.size() <= MAX_RANK_CUSTOM
+            for rank_data in self.rank_data.values()
+        )
+
+    @classmethod
+    def load(
+        cls,
+        adapter_weights: Dict[int, AdapterWeights],
+        meta: AdapterBatchMetadata,
+        prefill: bool,
+        prefill_head_indices: Optional[torch.Tensor],
+    ) -> Optional["BatchLoraWeights"]:
+        """Assemble the batch-level LoRA data for the adapters in ``meta``.
+
+        Args:
+            adapter_weights: Adapter weights keyed by adapter index; entries
+                that are not (or do not wrap) ``LoraWeights`` are ignored.
+            meta: Batch metadata providing ``segment_indices``,
+                ``adapter_segments`` and ``adapter_indices``.
+            prefill: When true, SGMV data is always prepared.
+            prefill_head_indices: If given, segment starts/ends are recomputed
+                from these indices instead of taken from ``meta``.
+
+        Returns:
+            The assembled batch weights, or ``None`` when no LoRA weights remain
+            after filtering.
+        """
+        adapter_weights = {k: _convert_lora(v) for k, v in adapter_weights.items()}
+        adapter_weights = {
+            k: v for k, v in adapter_weights.items() if isinstance(v, LoraWeights)
+        }
+        if not adapter_weights:
+            return None
+
+        first_weights = next(iter(adapter_weights.values()))
+        device = first_weights.weights_a.device
+        segment_indices = meta.segment_indices
+
+        lora_a = {
+            idx: adapter_weights[idx].weights_a
+            for idx in segment_indices
+            if idx in adapter_weights
+        }
+        lora_b = {
+            idx: adapter_weights[idx].weights_b
+            for idx in segment_indices
+            if idx in adapter_weights
+        }
+
+        max_rank = max(
+            (
+                adapter_weights[idx].lora_a_r
+                for idx in segment_indices
+                if idx in adapter_weights
+            ),
+            default=0,
+        )
+
+        def _ptr_tensor(attr: str) -> torch.Tensor:
+            # One data pointer per segment, read from the named LoraWeights
+            # property; 0 marks a segment whose adapter has no LoRA weights.
+            return torch.tensor(
+                [
+                    (
+                        getattr(adapter_weights[idx], attr).data_ptr()
+                        if idx in adapter_weights
+                        else 0
+                    )
+                    for idx in segment_indices
+                ],
+                dtype=torch.int64,
+                device=device,
+            )
+
+        if prefill or max_rank > BGMV_MAX_RANK:
+            # SGMV reads the non-transposed weights.
+            use_sgmv = True
+            lora_a_ptr = _ptr_tensor("weights_a")
+            lora_b_ptr = _ptr_tensor("weights_b")
+        else:
+            # BGMV reads the transposed weights.
+            use_sgmv = False
+            lora_a_ptr = _ptr_tensor("weights_a_t")
+            lora_b_ptr = _ptr_tensor("weights_b_t")
+
+        adapter_index_configs = {
+            idx: adapter_weights[idx].adapter_config
+            for idx in segment_indices
+            if idx in adapter_weights
+        }
+
+        # If an adapter occurs in several segments, the last segment wins.
+        adapter_to_segment = {v: k for k, v in enumerate(segment_indices)}
+
+        # Group segment positions by the LoRA rank of their adapter.
+        rank_indices = defaultdict(list)
+        for segment_idx, adapter_idx in enumerate(segment_indices):
+            if adapter_idx not in adapter_weights:
+                continue
+            rank_indices[adapter_weights[adapter_idx].lora_a_r].append(segment_idx)
+
+        if prefill_head_indices is not None:
+            j, prefill_head_segment_starts, prefill_head_segment_ends = 1, [0], [0]
+            for head_index in prefill_head_indices:
+                # j cannot go out of bounds as that would mean there are tokens without corresponding adapters
+                if head_index < meta.adapter_segments[j]:
+                    prefill_head_segment_ends[-1] += 1
+                else:
+                    prefill_head_segment_starts.append(prefill_head_segment_ends[-1])
+                    prefill_head_segment_ends.append(prefill_head_segment_ends[-1] + 1)
+                    j += 1
+
+        rank_data = {}
+        for rank, indices in rank_indices.items():
+            tmp_shrink = None
+            tmp_expand = None
+            segment_starts = None
+            segment_ends = None
+            batch_indices = None
+
+            if use_sgmv:
+                lora_a_ptr_indices = lora_a_ptr[indices]
+                tmp_shrink, tmp_expand = get_tmp_tensors(
+                    lora_a_ptr_indices.size(0), rank, device
+                )
+                segment_starts = meta.adapter_segments[indices]
+                segment_ends = meta.adapter_segments[[i + 1 for i in indices]]
+                if prefill_head_indices is not None:
+                    for i, segment_index in enumerate(indices):
+                        segment_starts[i] = prefill_head_segment_starts[segment_index]
+                        segment_ends[i] = prefill_head_segment_ends[segment_index]
+            else:
+                # Own name on purpose: the original rebound `rank_indices`, the
+                # dict this loop is iterating over.
+                segments_of_rank = set(indices)
+                batch_indices = [
+                    adapter_to_segment[idx] for idx in meta.adapter_indices.tolist()
+                ]
+                # -1 marks batch entries whose segment has a different rank.
+                batch_indices = [
+                    idx if idx in segments_of_rank else -1 for idx in batch_indices
+                ]
+                batch_indices = torch.tensor(
+                    batch_indices, dtype=torch.int64, device=device
+                )
+
+            rank_data[rank] = RankSegments(
+                rank=rank,
+                tmp_shrink=tmp_shrink,
+                tmp_expand=tmp_expand,
+                lora_a_ptr=lora_a_ptr[indices],
+                lora_b_ptr=lora_b_ptr[indices],
+                segment_starts=segment_starts,
+                segment_ends=segment_ends,
+                indices=batch_indices,
+            )
+
+        return BatchLoraWeights(
+            lora_a=lora_a,
+            lora_b=lora_b,
+            adapter_index_configs=adapter_index_configs,
+            rank_data=rank_data,
+            use_sgmv=use_sgmv,
+        )
+
+
+def get_scaling_factor(
+    lora_alpha: int,
+    r: int,
+    uses_rslora: bool = False,
+) -> float:
+    """Return the multiplier applied to the LoRA weights.
+
+    The result is ``lora_alpha / r ** 0.5`` when ``uses_rslora`` is set and
+    ``lora_alpha / r`` otherwise.
+    """
+    divisor = r**0.5 if uses_rslora else r
+    return lora_alpha / divisor
+
+
+def _convert_lora(v: AdapterWeights) -> AdapterWeights:
+    """Unwrap ``v.lora_weights`` when ``v`` has that attribute, else return ``v``."""
+    if not hasattr(v, "lora_weights"):
+        return v
+    return v.lora_weights
--- a/backends/gaudi/server/text_generation_server/adapters/weights.py
+++ b/backends/gaudi/server/text_generation_server/adapters/weights.py
@ -0,0 +1,146 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/adapters/weights.py
+# License:  Apache License Version 2.0, January 2004
+
+from abc import ABC, abstractclassmethod, abstractmethod
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set, Type
+
+import torch
+
+
+@dataclass
+class AdapterBatchMetadata:
+    """Per-batch description of which adapter applies where.
+
+    The bracketed comments on each field give the expected length of that
+    field, as stated by the original authors.
+    """
+
+    # [batch_size]
+    # one adapter index per batch entry
+    adapter_indices: torch.Tensor
+
+    # [num_adapters]
+    # presumably the distinct adapter indices present in the batch — TODO confirm
+    adapter_set: Set[int]
+
+    # [num_segments + 1]
+    # one more entry than there are segments (consumers elsewhere in this
+    # package index it with both ``i`` and ``i + 1``)
+    adapter_segments: torch.Tensor
+
+    # [num_segments]
+    # maps from segment index to adapter index, i.e.:
+    # segment_indices[s] == adapter_indices[i]
+    segment_indices: List[int]
+
+
+class AdapterWeights(ABC):
+    """Abstract base class for the weights of a single adapter."""
+
+    # `abc.abstractclassmethod` has been deprecated since Python 3.3; stacking
+    # `classmethod` over `abstractmethod` is the supported spelling and keeps
+    # the method both abstract and class-bound.
+    @classmethod
+    @abstractmethod
+    def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]:
+        """Return the batch weight classes this adapter can be loaded into."""
+        pass
+
+    @property
+    def speculative_tokens(self) -> int:
+        """Number of speculative tokens; the base implementation returns 0."""
+        return 0
+
+
+class BatchAdapterWeights(ABC):
+    """Abstract base class for adapter weights batched for one forward pass."""
+
+    # `has_adapter` takes `self`, so it is an instance method; it was
+    # previously (and incorrectly) wrapped as an abstract *class* method.
+    @abstractmethod
+    def has_adapter(self, adapter_index: int) -> bool:
+        """Return whether this batch holds weights for ``adapter_index``."""
+        pass
+
+    # `abc.abstractclassmethod` has been deprecated since Python 3.3; this is
+    # the supported equivalent.
+    @classmethod
+    @abstractmethod
+    def load(
+        cls,
+        adapter_weights: Dict[int, AdapterWeights],
+        meta: "AdapterBatchMetadata",
+        prefill: bool,
+        prefill_head_indices: torch.Tensor,
+    ) -> Optional["BatchAdapterWeights"]:
+        """Build the batched weights for ``meta`` from ``adapter_weights``.
+
+        Args:
+            adapter_weights: adapter index -> weights for that adapter.
+            meta: batch metadata describing adapter placement.
+            prefill: whether the batch is a prefill step.
+            prefill_head_indices: forwarded by callers; may be ``None``
+                (``LayerAdapterWeights.get_data`` passes an ``Optional``).
+
+        Returns:
+            The batched weights, or ``None`` when nothing could be loaded.
+        """
+        pass
+
+
+class LayerAdapterWeights:
+    """Registry of the adapter weights attached to one particular layer."""
+
+    def __init__(self):
+        # adapter index -> weights registered for that adapter
+        self.adapter_weights: Dict[int, AdapterWeights] = {}
+
+    def add_adapter(self, adapter_idx: int, weights: AdapterWeights):
+        """Store ``weights`` under ``adapter_idx``, replacing any previous entry."""
+        self.adapter_weights[adapter_idx] = weights
+
+    def remove_adapter(self, adapter_idx: int):
+        """Forget ``adapter_idx``; indices that were never added are ignored."""
+        self.adapter_weights.pop(adapter_idx, None)
+
+    def is_empty(self) -> bool:
+        """Return True when no adapter is registered on this layer."""
+        return not self.adapter_weights
+
+    def get_data(
+        self,
+        meta: AdapterBatchMetadata,
+        prefill: bool,
+        prefill_head_indices: Optional[torch.Tensor],
+    ) -> Dict[str, BatchAdapterWeights]:
+        """Load the batched weights for this layer.
+
+        Adapters are grouped by each batch class they report through
+        ``get_batch_types()``, and every group is passed to that class's
+        ``load``.
+
+        NOTE(review): every non-None ``load`` result *replaces* the previous
+        one, so the value returned is the last loaded batch object (or an
+        empty dict when nothing loaded), not a mapping keyed by adapter type
+        as the annotation suggests — confirm this is intended.
+        """
+        # batch class -> {adapter index -> weights}
+        grouped: Dict[
+            Type[BatchAdapterWeights], Dict[int, AdapterWeights]
+        ] = defaultdict(dict)
+        for idx, weights in self.adapter_weights.items():
+            for batch_cls in weights.get_batch_types():
+                grouped[batch_cls][idx] = weights
+
+        result = {}
+        for batch_cls, members in grouped.items():
+            loaded = batch_cls.load(members, meta, prefill, prefill_head_indices)
+            if loaded is None:
+                continue
+            result = loaded
+        return result
+
+
+@dataclass
+class AdapterBatchData:
+    """Batch metadata bundled with the per-layer batched adapter weights."""
+
+    meta: AdapterBatchMetadata
+
+    # layer type -> adapter type -> batch weight data
+    data: Dict[str, Dict[str, BatchAdapterWeights]]
+
+    prefill: bool
+
+    @staticmethod
+    def from_meta(
+        meta: AdapterBatchMetadata,
+        weights: Dict[str, LayerAdapterWeights],
+        prefill: bool,
+        prefill_head_indices: Optional[torch.Tensor],
+    ) -> "AdapterBatchData":
+        """Collect ``get_data`` results for every non-empty layer.
+
+        ``prefill_head_indices`` is only forwarded for the ``"lm_head"``
+        layer; every other layer receives ``None``.
+        """
+        per_layer = {
+            name: layer.get_data(
+                meta, prefill, prefill_head_indices if name == "lm_head" else None
+            )
+            for name, layer in weights.items()
+            if not layer.is_empty()
+        }
+        return AdapterBatchData(meta=meta, data=per_layer, prefill=prefill)
+
+    def ranks(self) -> Set[int]:
+        """Return every rank found in the ``rank_data`` of the layer entries."""
+        # TODO(travis): refactor to be less coupled to lora implementation
+        return {
+            segment.rank
+            for layer_entry in self.data.values()
+            if layer_entry is not None
+            for segment in layer_entry.rank_data.values()
+        }
+
+    def layer_names(self) -> Set[str]:
+        """Return the names of the layers that have data."""
+        return set(self.data)
+
+    def adapter_keys(self) -> Set[str]:
+        """Return the union of the keys of every layer entry."""
+        return {
+            key for layer_entry in self.data.values() for key in layer_entry.keys()
+        }
+
+    @property
+    def max_rank(self) -> int:
+        """Largest rank reported by ``ranks()``, or 0 when there is none."""
+        return max(self.ranks(), default=0)
--- a/backends/gaudi/server/text_generation_server/cache.py
+++ b/backends/gaudi/server/text_generation_server/cache.py
@ -0,0 +1,34 @@
+import torch
+
+from typing import Dict, Optional, TypeVar
+
+from text_generation_server.models.types import Batch
+
+# Type variable for the entries stored in `Cache`; bound to `Batch`.
+B = TypeVar("B", bound=Batch)
+
+
+class Cache:
+    """In-memory store of batches, keyed by their ``batch_id``."""
+
+    def __init__(self):
+        self.cache: Dict[int, B] = {}
+
+    def pop(self, batch_id: int) -> Optional[B]:
+        """Remove and return the batch for ``batch_id``, or None if absent."""
+        return self.cache.pop(batch_id, None)
+
+    def set(self, entry: B):
+        """Store ``entry`` under its own ``batch_id``; ``None`` is ignored."""
+        if entry is None:
+            return
+        self.cache[entry.batch_id] = entry
+
+    def delete(self, batch_id: int):
+        """Drop the batch for ``batch_id`` and release cached CUDA memory."""
+        removed = self.pop(batch_id)
+        if removed is not None:
+            # Drop the local reference before the allocator cache is emptied.
+            del removed
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    def clear(self):
+        """Delete every cached batch."""
+        # Snapshot the ids first: `delete` mutates the dict while we iterate.
+        for batch_id in list(self.cache):
+            self.delete(batch_id)
+
+    def __len__(self):
+        return len(self.cache)
--- a/backends/gaudi/server/text_generation_server/cli.py
+++ b/backends/gaudi/server/text_generation_server/cli.py
@ -0,0 +1,373 @@
+import os
+import sys
+import typer
+
+from pathlib import Path
+from loguru import logger
+from typing import Optional
+from enum import Enum
+from huggingface_hub import hf_hub_download
+from text_generation_server.utils.adapter import parse_lora_adapters
+
+
+# Typer application; the commands below are registered on it via `@app.command()`.
+app = typer.Typer()
+
+
+class Quantization(str, Enum):
+    """Values accepted for the `quantize` option of the `serve` command."""
+
+    gptq = "gptq"
+    awq = "awq"
+    fp8 = "fp8"
+    # The member name uses an underscore; the accepted value uses a hyphen.
+    compressed_tensors = "compressed-tensors"
+
+
+class Dtype(str, Enum):
+    """Values accepted for the `dtype` option of the `serve` command."""
+
+    float16 = "float16"
+    bfloat16 = "bfloat16"
+    # Backward-compatible alias: the member used to be spelled `bloat16`.
+    # Sharing the value makes it an Enum alias, so it is not listed twice
+    # when iterating over the enum.
+    bloat16 = "bfloat16"
+
+
+class KVCacheDtype(str, Enum):
+    """Values accepted for the `kv_cache_dtype` option of the `serve` command."""
+
+    fp8_e4m3fn = "fp8_e4m3fn"
+    fp8_e5m2 = "fp8_e5m2"
+
+
+@app.command()
+def serve(
+    model_id: str,
+    revision: Optional[str] = None,
+    sharded: bool = False,
+    quantize: Optional[Quantization] = None,
+    speculate: Optional[int] = None,
+    dtype: Optional[Dtype] = None,
+    kv_cache_dtype: Optional[KVCacheDtype] = None,
+    trust_remote_code: bool = False,
+    uds_path: Path = "/tmp/text-generation-server",
+    logger_level: str = "INFO",
+    json_output: bool = False,
+    otlp_endpoint: Optional[str] = None,
+    otlp_service_name: str = "text-generation-inference.server",
+    max_input_tokens: Optional[int] = None,
+):
+    # CLI entry point: validates the sharding environment, configures logging
+    # and tracing, then hands over to `server.serve`.
+    # (Kept as comments rather than a docstring so the CLI help is unchanged.)
+    if sharded:
+        # Sharded runs need these variables in the environment.
+        # RANK is not checked here.
+        for required_var in ("WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT"):
+            assert (
+                os.getenv(required_var, None) is not None
+            ), f"{required_var} must be set when sharded is True"
+
+    # Replace loguru's default handler with our stdout sink
+    logger.remove()
+    logger.add(
+        sys.stdout,
+        level=logger_level,
+        serialize=json_output,
+        filter="text_generation_server",
+        format="{message}",
+        diagnose=False,
+        backtrace=True,
+    )
+
+    # Imported only now so that import-time failures go through the sink above
+    from text_generation_server import server
+    from text_generation_server.tracing import setup_tracing
+
+    # OpenTelemetry distributed tracing is opt-in
+    if otlp_endpoint is not None:
+        setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint)
+
+    lora_adapters = parse_lora_adapters(os.getenv("LORA_ADAPTERS"))
+
+    # TODO: enable lora with cuda graphs. for now disable cuda graphs if lora is enabled
+    # and warn the user
+    if lora_adapters:
+        logger.warning("LoRA adapters enabled (experimental feature).")
+
+        if "CUDA_GRAPHS" in os.environ:
+            logger.warning(
+                "LoRA adapters incompatible with CUDA Graphs. Disabling CUDA Graphs."
+            )
+            global CUDA_GRAPHS
+            CUDA_GRAPHS = None
+
+    # From here on the enum options are handled as plain strings
+    quantize = quantize.value if quantize is not None else None
+    dtype = dtype.value if dtype is not None else "bfloat16"
+    kv_cache_dtype = kv_cache_dtype.value if kv_cache_dtype is not None else None
+    logger.info(f"quantize={quantize} kv_cache_dtype={kv_cache_dtype}")
+
+    quantize_allowed_with_dtype = {
+        None,
+        "bitsandbytes",
+        "bitsandbytes-nf4",
+        "bitsandbytes-fp4",
+        "gptq",
+        "awq",
+        "fp8",
+        "compressed-tensors",
+    }
+    if dtype is not None and quantize not in quantize_allowed_with_dtype:
+        raise RuntimeError(
+            "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
+        )
+
+    server.serve(
+        model_id,
+        lora_adapters,
+        revision,
+        sharded,
+        quantize,
+        speculate,
+        dtype,
+        kv_cache_dtype,
+        trust_remote_code,
+        uds_path,
+        max_input_tokens,
+    )
+
+
+@app.command()
+def download_weights(
+    model_id: str,
+    revision: Optional[str] = None,
+    extension: str = ".safetensors",
+    auto_convert: bool = True,
+    logger_level: str = "INFO",
+    json_output: bool = False,
+    trust_remote_code: bool = False,
+    merge_lora: bool = False,
+):
+    # CLI entry point that makes the weights of `model_id` available locally.
+    # Order of attempts:
+    #   1. return early if files with `extension` are already present;
+    #   2. for hub models: handle PEFT adapters, recursively fetch a parent
+    #      model named in config.json, then download `extension` files;
+    #   3. for local models: unload a local PEFT adapter or fetch the parent
+    #      model named in config.json;
+    #   4. otherwise locate or download PyTorch weights and, when
+    #      `auto_convert` is set, convert them to safetensors.
+    # (Kept as comments rather than a docstring so the CLI help is unchanged.)
+
+    # Remove default handler
+    logger.remove()
+    logger.add(
+        sys.stdout,
+        format="{message}",
+        filter="text_generation_server",
+        level=logger_level,
+        serialize=json_output,
+        backtrace=True,
+        diagnose=False,
+    )
+
+    # Import here after the logger is added to log potential import exceptions
+    from text_generation_server import utils
+
+    # Test if files were already downloaded
+    try:
+        utils.weight_files(model_id, revision, extension)
+        logger.info("Files are already present on the host. " "Skipping download.")
+        return
+    # Local files not found
+    except (utils.LocalEntryNotFoundError, FileNotFoundError, utils.EntryNotFoundError):
+        pass
+
+    # A model counts as local when `model_id` is an existing directory or
+    # when WEIGHTS_CACHE_OVERRIDE is set.
+    is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv(
+        "WEIGHTS_CACHE_OVERRIDE", None
+    ) is not None
+
+    if not is_local_model:
+        # TODO: maybe reverse the default value of merge_lora?
+        # currently by default we don't merge the weights with the base model
+        if merge_lora:
+            try:
+                hf_hub_download(
+                    model_id, revision=revision, filename="adapter_config.json"
+                )
+                utils.download_and_unload_peft(
+                    model_id, revision, trust_remote_code=trust_remote_code
+                )
+                is_local_model = True
+                utils.weight_files(model_id, revision, extension)
+                return
+            except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+                pass
+        else:
+            # Best effort: a failure to fetch PEFT files is deliberately ignored
+            try:
+                utils.peft.download_peft(
+                    model_id, revision, trust_remote_code=trust_remote_code
+                )
+            except Exception:
+                pass
+
+        try:
+            import json
+
+            config = hf_hub_download(
+                model_id, revision=revision, filename="config.json"
+            )
+            with open(config, "r") as f:
+                config = json.load(f)
+
+            base_model_id = config.get("base_model_name_or_path", None)
+            if base_model_id and base_model_id != model_id:
+                # Best effort: failures while fetching the parent are ignored
+                try:
+                    logger.info(f"Downloading parent model {base_model_id}")
+                    download_weights(
+                        model_id=base_model_id,
+                        revision="main",
+                        extension=extension,
+                        auto_convert=auto_convert,
+                        logger_level=logger_level,
+                        json_output=json_output,
+                        trust_remote_code=trust_remote_code,
+                    )
+                except Exception:
+                    pass
+        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+            pass
+
+        # Try to download weights from the hub
+        try:
+            filenames = utils.weight_hub_files(model_id, revision, extension)
+            utils.download_weights(filenames, model_id, revision)
+            # Successfully downloaded weights
+            return
+
+        # No weights found on the hub with this extension
+        except utils.EntryNotFoundError as e:
+            # Check if we want to automatically convert to safetensors or if we can use .bin weights instead
+            if not extension == ".safetensors" or not auto_convert:
+                raise e
+
+    elif (Path(model_id) / "adapter_config.json").exists():
+        # Try to load as a local PEFT model
+        try:
+            utils.download_and_unload_peft(
+                model_id, revision, trust_remote_code=trust_remote_code
+            )
+            utils.weight_files(model_id, revision, extension)
+            return
+        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+            pass
+    elif (Path(model_id) / "config.json").exists():
+        # Try to load as a local Medusa model
+        try:
+            import json
+
+            config = Path(model_id) / "config.json"
+            with open(config, "r") as f:
+                config = json.load(f)
+
+            base_model_id = config.get("base_model_name_or_path", None)
+            if base_model_id:
+                # Best effort: failures while fetching the parent are ignored
+                try:
+                    logger.info(f"Downloading parent model {base_model_id}")
+                    download_weights(
+                        model_id=base_model_id,
+                        revision="main",
+                        extension=extension,
+                        auto_convert=auto_convert,
+                        logger_level=logger_level,
+                        json_output=json_output,
+                        trust_remote_code=trust_remote_code,
+                    )
+                except Exception:
+                    pass
+        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+            pass
+
+    # Try to see if there are local pytorch weights
+    try:
+        # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE
+        try:
+            local_pt_files = utils.weight_files(model_id, revision, ".bin")
+        except Exception:
+            local_pt_files = utils.weight_files(model_id, revision, ".pt")
+
+    # No local pytorch weights
+    except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+        if extension == ".safetensors":
+            logger.warning(
+                f"No safetensors weights found for model {model_id} at revision {revision}. "
+                f"Downloading PyTorch weights."
+            )
+
+        # Try to see if there are pytorch weights on the hub
+        pt_filenames = utils.weight_hub_files(model_id, revision, ".bin")
+        # Download pytorch weights
+        local_pt_files = utils.download_weights(pt_filenames, model_id, revision)
+
+    if auto_convert:
+        if not trust_remote_code:
+            logger.warning(
+                "🚨🚨BREAKING CHANGE in 2.0🚨🚨: Safetensors conversion is disabled without `--trust-remote-code` because "
+                "Pickle files are unsafe and can essentially contain remote code execution!"
+                "Please check for more information here: https://huggingface.co/docs/text-generation-inference/basic_tutorials/safety",
+            )
+
+        logger.warning(
+            f"No safetensors weights found for model {model_id} at revision {revision}. "
+            f"Converting PyTorch weights to safetensors."
+        )
+
+        # Safetensors final filenames.
+        # Only a leading "pytorch_" *prefix* is dropped. `str.lstrip` must not
+        # be used here: it strips any leading run of the given characters and
+        # would turn e.g. "consolidated" into "nsolidated".
+        pt_prefix = "pytorch_"
+        local_st_files = []
+        for p in local_pt_files:
+            stem = p.stem
+            if stem.startswith(pt_prefix):
+                stem = stem[len(pt_prefix) :]
+            local_st_files.append(p.parent / f"{stem}.safetensors")
+        try:
+            import transformers
+            import json
+
+            if is_local_model:
+                config_filename = os.path.join(model_id, "config.json")
+            else:
+                config_filename = hf_hub_download(
+                    model_id, revision=revision, filename="config.json"
+                )
+            with open(config_filename, "r") as f:
+                config = json.load(f)
+            architecture = config["architectures"][0]
+
+            class_ = getattr(transformers, architecture)
+
+            # Name for this variable depends on transformers version.
+            discard_names = getattr(class_, "_tied_weights_keys", [])
+
+        except Exception:
+            # Without a resolvable architecture nothing is discarded
+            discard_names = []
+        # Convert pytorch weights to safetensors
+        utils.convert_files(local_pt_files, local_st_files, discard_names)
+
+
+@app.command()
+def quantize(
+    model_id: str,
+    output_dir: str,
+    revision: Optional[str] = None,
+    logger_level: str = "INFO",
+    json_output: bool = False,
+    trust_remote_code: bool = False,
+    upload_to_model_id: Optional[str] = None,
+    percdamp: float = 0.01,
+    act_order: bool = False,
+    groupsize: int = 128,
+):
+    # CLI entry point: fetch the weights, then run the GPTQ quantizer with
+    # 4 bits and symmetric quantization.
+    # (Kept as comments rather than a docstring so the CLI help is unchanged.)
+    revision = "main" if revision is None else revision
+
+    download_weights(
+        model_id=model_id,
+        revision=revision,
+        logger_level=logger_level,
+        json_output=json_output,
+    )
+
+    # Aliased so the implementation does not shadow this command's own name
+    from text_generation_server.layers.gptq.quantize import (
+        quantize as run_gptq_quantize,
+    )
+
+    run_gptq_quantize(
+        model_id=model_id,
+        revision=revision,
+        output_dir=output_dir,
+        upload_to_model_id=upload_to_model_id,
+        trust_remote_code=trust_remote_code,
+        bits=4,
+        sym=True,
+        groupsize=groupsize,
+        percdamp=percdamp,
+        act_order=act_order,
+    )
+
+
+# Run the Typer application when this module is executed as a script.
+if __name__ == "__main__":
+    app()
--- a/backends/gaudi/server/text_generation_server/interceptor.py
+++ b/backends/gaudi/server/text_generation_server/interceptor.py
@ -0,0 +1,45 @@
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company.
+
+import torch
+import grpc
+
+from google.rpc import status_pb2, code_pb2
+from grpc_status import rpc_status
+from grpc_interceptor.server import AsyncServerInterceptor
+from loguru import logger
+from typing import Callable, Any
+import traceback
+import os
+
+
+class ExceptionInterceptor(AsyncServerInterceptor):
+    """gRPC server interceptor that logs handler exceptions and aborts the call."""
+
+    async def intercept(
+        self,
+        method: Callable,
+        request_or_iterator: Any,
+        context: grpc.ServicerContext,
+        method_name: str,
+    ) -> Any:
+        """Await ``method`` and turn any exception into an INTERNAL gRPC status.
+
+        On failure the error is logged; a ``RuntimeError`` terminates the
+        process with exit code 1, any other exception aborts the call with
+        the error message (plus the traceback when the ``DUMP_STACK``
+        environment variable is set to a non-empty value).
+        """
+        try:
+            response = method(request_or_iterator, context)
+            return await response
+        except Exception as err:
+            # The traceback is only appended to the client-facing message on opt-in
+            trace = " " + traceback.format_exc() if os.environ.get("DUMP_STACK") else ""
+            # Keep only the last path component of the method name for the log line
+            method_name = method_name.split("/")[-1]
+            logger.exception(f"Method {method_name} encountered an error.")
+
+            # Runtime Error cannot be recovered from
+            if isinstance(err, RuntimeError):
+                exit(1)
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+            # Imported lazily, inside the error path only
+            from .utils.debug import dbg_trace
+
+            dbg_trace("EXCEPTION", traceback.format_exc())
+            await context.abort_with_status(
+                rpc_status.to_status(
+                    status_pb2.Status(code=code_pb2.INTERNAL, message=str(err) + trace)
+                )
+            )
--- a/backends/gaudi/server/text_generation_server/layers/__init__.py
+++ b/backends/gaudi/server/text_generation_server/layers/__init__.py
@ -0,0 +1,36 @@
+from text_generation_server.layers.tensor_parallel import (
+    TensorParallelColumnLinear,
+    TensorParallelRowLinear,
+    TensorParallelEmbedding,
+)
+from text_generation_server.layers.linear import (
+    get_linear,
+    FastLinear,
+)
+from text_generation_server.layers.speculative import SpeculativeHead
+
+# Just to add the `load` methods.
+from text_generation_server.layers.layernorm import load_layer_norm
+from text_generation_server.layers.conv import load_conv2d
+from text_generation_server.layers.fp8 import Fp8Linear
+
+from text_generation_server.layers.lora import (
+    LoraLinear,
+    TensorParallelMultiAdapterLinear,
+    TensorParallelAdapterRowLinear,
+)
+
+# Names re-exported by this package; each one is imported above.
+__all__ = [
+    "get_linear",
+    "FastLinear",
+    "TensorParallelColumnLinear",
+    "TensorParallelRowLinear",
+    "TensorParallelEmbedding",
+    "SpeculativeHead",
+    "LoraLinear",
+    "Fp8Linear",
+    "TensorParallelMultiAdapterLinear",
+    "TensorParallelAdapterRowLinear",
+    "load_layer_norm",
+    "load_conv2d",
+]
--- a/backends/gaudi/server/text_generation_server/layers/attention/__init__.py
+++ b/backends/gaudi/server/text_generation_server/layers/attention/__init__.py
@ -0,0 +1,35 @@
+from .common import (
+    Seqlen,
+    HPUPagedAttentionMetadata,
+    trim_attn_metadata,
+    trim_seqlen_metadata,
+    _async_h2d_tensor_copy,
+)
+
+from .hpu import (
+    SUPPORTS_WINDOWING,
+    attention,
+    paged_attention,
+    paged_attention_mla,
+    set_block_mapping,
+)
+
+
+# KVCache needs `reshape_and_cache`, so ensure that it is defined already.
+from .kv_cache import KVCache, get_kv_scales, KVCompressCache
+
+# Names re-exported by the attention package; each one is imported above.
+__all__ = [
+    "attention",
+    "get_kv_scales",
+    "paged_attention",
+    "paged_attention_mla",
+    "set_block_mapping",
+    "SUPPORTS_WINDOWING",
+    "KVCache",
+    "KVCompressCache",
+    "Seqlen",
+    "HPUPagedAttentionMetadata",
+    "trim_seqlen_metadata",
+    "trim_attn_metadata",
+    "_async_h2d_tensor_copy",
+]
--- a/backends/gaudi/server/text_generation_server/layers/attention/common.py
+++ b/backends/gaudi/server/text_generation_server/layers/attention/common.py
@ -0,0 +1,186 @@
+from dataclasses import dataclass
+import torch
+from typing import Optional, List, Dict
+import collections
+import torch.nn.functional as F
+
+# typename -> namedtuple class; filled lazily by `subtuple` so each typename
+# maps to a single class for the lifetime of the process.
+_TYPE_CACHE = {}
+
+
+@dataclass
+class HPUPagedAttentionMetadata:
+    """Metadata for PagedAttention."""
+
+    # Required fields (each may be None).
+    block_list: Optional[torch.Tensor]
+    block_mapping: Optional[torch.Tensor]
+    block_usage: Optional[torch.Tensor]
+    block_groups: Optional[torch.Tensor]
+    attn_bias: Optional[torch.Tensor]
+    # Optional `*_in_window` counterparts, defaulting to None.
+    # NOTE(review): presumably the sliding-window variants of the fields
+    # above — confirm against the code that populates them.
+    slots_in_window_mask: Optional[torch.Tensor] = None
+    block_list_in_window: Optional[torch.Tensor] = None
+    block_mapping_in_window: Optional[torch.Tensor] = None
+    block_usage_in_window: Optional[torch.Tensor] = None
+    block_groups_in_window: Optional[torch.Tensor] = None
+    attn_bias_in_window: Optional[torch.Tensor] = None
+
+
+def subtuple(
+    obj: object,
+    typename: str,
+    to_copy: List[str],
+    to_override: Optional[Dict[str, object]] = None,
+):
+    """Copy selected fields of ``obj`` into a (cached) namedtuple.
+
+    Args:
+        obj: source object or dict; ``None`` is passed through unchanged.
+        typename: name of the namedtuple class. The class is created on first
+            use and cached, so later calls with the same ``typename`` reuse
+            the field set of the first call.
+        to_copy: names of the attributes (or dict keys) to copy from ``obj``.
+        to_override: optional ``name -> value`` pairs that take precedence
+            over the values found on ``obj``; names not present on ``obj``
+            are allowed.
+
+    Returns:
+        An instance of the namedtuple, or ``None`` when ``obj`` is ``None``.
+        Fields follow the order of ``to_copy`` followed by any additional
+        names from ``to_override``.
+
+    Raises:
+        AttributeError: a non-dict ``obj`` lacks a field that is not overridden.
+        TypeError: a dict ``obj`` lacks a field that is not overridden (the
+            namedtuple constructor rejects the missing argument).
+    """
+    if obj is None:
+        return None
+    if to_override is None:
+        to_override = {}
+    # Ordered de-duplication: a plain set would make the namedtuple field
+    # order depend on string hashing, which varies between processes.
+    fields = list(dict.fromkeys([*to_copy, *to_override]))
+    values = {}
+    for name in fields:
+        if name in to_override:
+            # Overrides win for both dicts and objects, and `obj` is not
+            # queried at all for overridden names.
+            values[name] = to_override[name]
+        elif isinstance(obj, dict):
+            # Keys absent from the dict are skipped, as before.
+            if name in obj:
+                values[name] = obj[name]
+        else:
+            values[name] = getattr(obj, name)
+    if typename not in _TYPE_CACHE:
+        _TYPE_CACHE[typename] = collections.namedtuple(typename, " ".join(fields))
+    return _TYPE_CACHE[typename](**values)
+
+
+def trim_attn_metadata(metadata: HPUPagedAttentionMetadata) -> object:
+    """Reduce paged-attention metadata to a ``TrimmedAttentionMetadata`` tuple.
+
+    Returns ``None`` when ``metadata`` is ``None`` (see ``subtuple``).
+    """
+    # NOTE(kzawora): To anyone working on this in the future:
+    # HPUGraphs require the metadata to be trimmed. The PT bridge hashes the
+    # attention metadata and picks the matching HPUGraph from the hash of
+    # all inputs.
+    #
+    # Before adding a key, know its value type and how it gets hashed: see
+    # input_hash in habana_frameworks/torch/hpu/graphs.py, or hash manually
+    # with torch.hpu.graphs.input_hash(attention_metadata).
+    #
+    # Primitive values are hashed by value, so something like an int seq_len
+    # *will* cause lots of excessive graph captures (and eventually an OOM).
+    # If a scalar is really needed, wrap it in a tensor: tensors are hashed
+    # by their metadata, not their values:
+    # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321))
+    # input_hash(123) != input_hash(321)
+    # input_hash("abc") != input_hash("cba")
+    kept_fields = [
+        "block_list",
+        "block_mapping",
+        "block_usage",
+        "block_groups",
+        "attn_bias",
+        "slots_in_window_mask",
+        "block_list_in_window",
+        "block_mapping_in_window",
+        "block_usage_in_window",
+        "block_groups_in_window",
+        "attn_bias_in_window",
+    ]
+    return subtuple(metadata, "TrimmedAttentionMetadata", kept_fields)
+
+
+@dataclass
+class Seqlen:
+    """Input lengths of a batch plus an optional attention mask."""
+
+    input_lengths: torch.Tensor
+    attn_mask: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        input_lengths,
+    ):
+        # Only `input_lengths` is set here; `attn_mask` keeps its class-level
+        # default until a caller assigns it.
+        self.input_lengths = input_lengths
+
+    def clamp(self, max):
+        """Return ``self`` unchanged."""
+        # Flash decoding doesn't need to clamp
+        return self
+
+    def make_sliding_window_bias(
+        self,
+        seq_lens: List[int],
+        window_size: Optional[int],
+        dtype: torch.dtype,
+        padded_input_len: Optional[int],
+        padded_bs: Optional[int],
+    ) -> List[torch.Tensor]:
+        """Build one causal (optionally windowed) mask per sequence.
+
+        Every mask is lower-triangular over ``seq_len`` positions, limited to
+        the last ``window_size`` positions when a window is given, then
+        zero-padded on the top and left up to ``padded_input_len``. A length
+        of 0 yields an all-zero mask. ``padded_bs`` is not used.
+
+        Returns:
+            The masks stacked along a new leading dimension and cast to bool
+            (a single tensor, despite the declared return annotation).
+        """
+        per_sequence = []
+        for length in seq_lens:
+            if length == 0:
+                per_sequence.append(
+                    torch.full(
+                        (1, padded_input_len, padded_input_len),
+                        dtype=dtype,
+                        fill_value=0,
+                    )
+                )
+                continue
+
+            ones = torch.full((1, length, length), dtype=dtype, fill_value=1)
+            bias = torch.tril(ones, diagonal=0).to(dtype)  # type: ignore
+            if window_size is not None:
+                # keep only the `window_size` most recent positions per row
+                bias = torch.triu(bias, diagonal=1 - window_size)
+            left_pad = padded_input_len - length
+            # pad spec is (last-dim left, right, 2nd-last top, bottom, ...)
+            bias = F.pad(bias, (left_pad, 0, left_pad, 0, 0, 0), value=0)
+            per_sequence.append(bias)
+
+        return torch.stack(per_sequence, dim=0).to(torch.bool)
+
+
+def _async_h2d_tensor_copy(source, device="hpu"):
+    """Copy a host tensor to ``device`` with a non-blocking copy.
+
+    ``None`` and tensors whose device type is already ``"hpu"`` are returned
+    as-is; any other non-CPU source trips the assertion.
+    """
+    if source is None:
+        return None
+    origin = source.device.type
+    if origin == "hpu":
+        return source
+    assert origin == "cpu", "Source tensor is not present in host memory!"
+    destination = torch.empty(source.shape, dtype=source.dtype, device=device)
+    return destination.copy_(source, non_blocking=True)
+
+
+def trim_seqlen_metadata(metadata: Seqlen) -> object:
+    """Reduce a ``Seqlen`` to a ``TrimmedSeqlen`` tuple.
+
+    Returns ``None`` when ``metadata`` is ``None`` (see ``subtuple``).
+    """
+    # NOTE(kzawora): To anyone working on this in the future:
+    # HPUGraphs require the metadata to be trimmed. The PT bridge hashes the
+    # attention metadata and picks the matching HPUGraph from the hash of
+    # all inputs.
+    #
+    # Before adding a key, know its value type and how it gets hashed: see
+    # input_hash in habana_frameworks/torch/hpu/graphs.py, or hash manually
+    # with torch.hpu.graphs.input_hash(attention_metadata).
+    #
+    # Primitive values are hashed by value, so something like an int seq_len
+    # *will* cause lots of excessive graph captures (and eventually an OOM).
+    # If a scalar is really needed, wrap it in a tensor: tensors are hashed
+    # by their metadata, not their values:
+    # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321))
+    # input_hash(123) != input_hash(321)
+    # input_hash("abc") != input_hash("cba")
+    kept_fields = [
+        "input_lengths",
+        "attn_mask",
+    ]
+    return subtuple(metadata, "TrimmedSeqlen", kept_fields)
--- a/backends/gaudi/server/text_generation_server/layers/attention/hpu.py
+++ b/backends/gaudi/server/text_generation_server/layers/attention/hpu.py
@ -0,0 +1,227 @@
+import torch
+from text_generation_server.layers.attention import Seqlen, HPUPagedAttentionMetadata
+from typing import Optional
+from text_generation_server.layers.attention.kv_cache import KVCache, KVScales
+from vllm_hpu_extension import ops
+from vllm_hpu_extension.utils import Matmul
+from habana_frameworks.torch.hpex.kernels import FusedSDPA
+from vllm_hpu_extension.utils import ModuleFusedSDPA
+import os
+from text_generation_server.models.globals import BLOCK_SIZE
+import math
+
+# Module-level flag re-exported through `__all__`.
+# NOTE(review): `attention` and `paged_attention` in this module both accept
+# `window_size_left`; confirm what the consumers of this flag expect.
+SUPPORTS_WINDOWING = False
+
+
+class FP8Matmul(torch.nn.Module):
+    """Matmul module that casts both operands to `float8_e4m3fn` first.
+
+    The first operand uses a fixed scale of 1.0 (a bfloat16 tensor on "hpu");
+    the second operand uses the `scale_other` supplied at construction. The
+    product is requested in bfloat16 from `torch.ops.hpu.fp8_gemm_v2`, which
+    also receives the reciprocals of both scales.
+    """
+
+    def __init__(self, scale_other):
+        super().__init__()
+        self.scale_input = torch.tensor(1.0, dtype=torch.bfloat16, device="hpu")
+        self.scale_other = scale_other
+
+    def quant_input(self, x, scale):
+        """Cast `x` to `float8_e4m3fn` with `scale` and return the first output of the op."""
+        cast_outputs = torch.ops.hpu.cast_to_fp8_v2(
+            x, scale, False, False, torch.float8_e4m3fn
+        )
+        return cast_outputs[0]
+
+    def matmul_fp8(
+        self, x, other, out_dtype, scale_input_inv=None, scale_other_inv=None
+    ):
+        """Run `torch.ops.hpu.fp8_gemm_v2` on `x` and `other` (no transpose, bias or accumulation)."""
+        gemm_kwargs = dict(
+            A=x,
+            trans_A=False,
+            B=other,
+            trans_B=False,
+            D=None,
+            out_dtype=out_dtype,
+            A_scale_inv=scale_input_inv,
+            B_scale_inv=scale_other_inv,
+            bias=None,
+            accumulate=False,
+        )
+        return torch.ops.hpu.fp8_gemm_v2(**gemm_kwargs)
+
+    def forward(self, input, other):
+        """Quantize both operands and return their bfloat16 product."""
+        lhs_fp8 = self.quant_input(input, self.scale_input)
+        rhs_fp8 = self.quant_input(other, self.scale_other)
+        return self.matmul_fp8(
+            lhs_fp8,
+            rhs_fp8,
+            out_dtype=torch.bfloat16,
+            scale_input_inv=1.0 / self.scale_input,
+            scale_other_inv=1.0 / self.scale_other,
+        )
+
+
+class FetchFromCache(torch.nn.Module):
+    """Callable that reads blocks out of a cache tensor.
+
+    When the `VLLM_CONTIGUOUS_PA` environment variable is "true" (the
+    default, compared case-insensitively) the first `blocks.size(0)` rows of
+    the cache are sliced; otherwise the rows named by `blocks` are gathered.
+    `float8_e4m3fn` results are converted to bfloat16 using `scale_inv`.
+    """
+
+    def __init__(self, scale_inv):
+        super().__init__()
+        self.scale_inv = scale_inv
+
+    def forward(self, cache, blocks):
+        # The environment is consulted on every call.
+        contiguous_pa = os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() == "true"
+        fetched = (
+            cache[: blocks.size(0)] if contiguous_pa else cache.index_select(0, blocks)
+        )
+        if fetched.dtype != torch.float8_e4m3fn:
+            return fetched
+        return torch.ops.hpu.cast_from_fp8(fetched, self.scale_inv, torch.bfloat16)
+
+
+def attention(
+    *,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    kv_cache: KVCache,
+    kv_scales: KVScales,
+    seqlen: Seqlen,
+    softmax_scale: float,
+    window_size_left: int = -1,
+    causal: bool = True,
+    softcap: Optional[float] = None,
+):
+    """Run fused scaled-dot-product attention over the given q/k/v tensors.
+
+    The batch size is taken from `seqlen.input_lengths`; query, key and
+    value are viewed as (batch, -1, heads, head_size) and transposed to put
+    the head axis second before the fused op is called.
+
+    With `window_size_left == -1` the op runs with `is_causal=causal` and
+    `valid_sequence_lengths=seqlen.input_lengths`. With any other value it
+    receives `seqlen.attn_mask` instead, with `is_causal=False` and no valid
+    lengths.
+
+    `kv_cache`, `kv_scales` and `softcap` are accepted but not used here.
+    """
+    sdpa = ModuleFusedSDPA(FusedSDPA)
+    windowed = window_size_left != -1
+    batch = seqlen.input_lengths.shape[0]
+    _, q_heads, head_size = query.shape
+    # As in the unpacking below, the head size of `key` is the one used for
+    # every reshape.
+    _, kv_heads, head_size = key.shape
+
+    def heads_first(tensor, heads):
+        return tensor.view(batch, -1, heads, head_size).transpose(1, 2)
+
+    query = heads_first(query, q_heads)
+    key = heads_first(key, kv_heads)
+    value = heads_first(value, kv_heads)
+    result = sdpa(
+        query,
+        key,
+        value,
+        attn_mask=seqlen.attn_mask if windowed else None,
+        dropout_p=0.0,
+        is_causal=False if windowed else causal,
+        scale=softmax_scale,
+        softmax_mode="None",
+        recompute_mode=None,
+        valid_sequence_lengths=None if windowed else seqlen.input_lengths,
+        padding_side="left",
+    )
+    return result.transpose(1, 2).squeeze(0).contiguous()
+
+
+def set_block_mapping(hpu_attention_meta: HPUPagedAttentionMetadata, batch_size):
+    """Return a copy of the metadata with block mappings and biases filled in.
+
+    `block_mapping` is the one-hot encoding of `block_groups` over
+    `batch_size` classes, cast to the dtype of `block_usage`. `attn_bias` is
+    0 for slot indices below each block's usage and -inf from the usage
+    count onward. When `block_groups_in_window` is set, the `*_in_window`
+    counterparts are filled as well, the bias being the log of
+    `slots_in_window_mask`.
+    """
+    usage = hpu_attention_meta.block_usage
+    out_dtype = usage.dtype
+    group_one_hot = torch.nn.functional.one_hot(
+        hpu_attention_meta.block_groups, num_classes=batch_size
+    )
+    slot_index = torch.arange(
+        0, BLOCK_SIZE, device=usage.device, dtype=torch.int32
+    ).unsqueeze(0)
+    # True for every slot index at or past the block's usage count.
+    unused = slot_index >= usage.unsqueeze(-1)
+    bias = torch.zeros_like(unused, dtype=out_dtype).masked_fill_(unused, -math.inf)
+    hpu_attention_meta = hpu_attention_meta._replace(
+        attn_bias=bias, block_mapping=group_one_hot.to(out_dtype)
+    )
+
+    window_groups = hpu_attention_meta.block_groups_in_window
+    if window_groups is None:
+        return hpu_attention_meta
+
+    window_one_hot = torch.nn.functional.one_hot(
+        window_groups, num_classes=batch_size
+    )
+    # log(1) == 0 keeps a slot, log(0) == -inf masks it out.
+    window_bias = torch.log(hpu_attention_meta.slots_in_window_mask.float())
+    return hpu_attention_meta._replace(
+        attn_bias_in_window=window_bias,
+        block_mapping_in_window=window_one_hot.to(out_dtype),
+    )
+
+
+def paged_attention(
+    query: torch.Tensor,
+    kv_cache: KVCache,
+    kv_head_mapping: torch.Tensor,
+    softmax_scale: float,
+    seqlen: Seqlen,
+    *,
+    kv_scales: KVScales,
+    softcap: Optional[float] = None,
+    hpu_attention_meta: HPUPagedAttentionMetadata,
+    window_size_left: int = -1,
+):
+    """Run `ops.flat_pa` against the paged KV cache.
+
+    `query` is unpacked as (batch_size, head_num, head_size) and the output
+    is returned in that same shape. With `window_size_left == -1` the
+    regular block list / mapping / bias / groups from `hpu_attention_meta`
+    are used; any other value selects their `*_in_window` counterparts.
+    A `float8_e4m3fn` cache switches the QK and AV matmuls to `FP8Matmul`.
+
+    `kv_head_mapping`, `seqlen` and `softcap` are accepted but not used here.
+    """
+    batch_size, head_num, head_size = query.shape
+
+    if window_size_left == -1:
+        block_list = hpu_attention_meta.block_list
+        block_mapping = hpu_attention_meta.block_mapping
+        block_bias = hpu_attention_meta.attn_bias
+        block_groups = hpu_attention_meta.block_groups
+    else:
+        block_list = hpu_attention_meta.block_list_in_window
+        block_mapping = hpu_attention_meta.block_mapping_in_window
+        block_bias = hpu_attention_meta.attn_bias_in_window
+        block_groups = hpu_attention_meta.block_groups_in_window
+
+    if kv_cache.dtype == torch.float8_e4m3fn:
+        qk_matmul = FP8Matmul(kv_scales.key_scale)
+        av_matmul = FP8Matmul(kv_scales.value_scale)
+    else:
+        qk_matmul = Matmul()
+        av_matmul = Matmul()
+
+    flat_output = ops.flat_pa(
+        query=query.view(batch_size, 1, head_num * head_size),
+        key_cache=kv_cache.key,
+        value_cache=kv_cache.value,
+        block_list=block_list,
+        block_mapping=block_mapping,
+        block_bias=block_bias,
+        block_groups=block_groups,
+        block_size=BLOCK_SIZE,
+        scale=softmax_scale,
+        matmul_qk_op=qk_matmul,
+        matmul_av_op=av_matmul,
+        batch2block_matmul_op=Matmul(),
+        block2batch_matmul_op=Matmul(),
+        keys_fetch_func=FetchFromCache(1.0 / kv_scales.key_scale_cpu),
+        values_fetch_func=FetchFromCache(1.0 / kv_scales.value_scale_cpu),
+    )
+    # Restore the per-head layout of the query.
+    return flat_output.view(batch_size, head_num, head_size)
+
+
+def paged_attention_mla(
+    query: torch.Tensor,
+    kv_cache: KVCache,
+    kv_head_mapping: torch.Tensor,
+    softmax_scale: float,
+    seqlen: Seqlen,
+    *,
+    kv_scales: KVScales,
+    softcap: Optional[float] = None,
+    hpu_attention_meta: HPUPagedAttentionMetadata,
+    kv_lora_rank: int = 0,
+):
+    """Run `ops.flat_pa_mla` against the paged cache.
+
+    Only the key side of the cache is handed to the op (`value_cache` and
+    `values_fetch_func` are None). `query` must be three-dimensional; the
+    result is viewed as (batch_size, head_num, -1). A `float8_e4m3fn` cache
+    switches the QK and AV matmuls to `FP8Matmul`.
+
+    `kv_head_mapping`, `seqlen` and `softcap` are accepted but not used here.
+    """
+    batch_size, head_num, _ = query.shape
+
+    if kv_cache.dtype == torch.float8_e4m3fn:
+        qk_matmul = FP8Matmul(kv_scales.key_scale)
+        av_matmul = FP8Matmul(kv_scales.value_scale)
+    else:
+        qk_matmul = Matmul()
+        av_matmul = Matmul()
+
+    flat_output = ops.flat_pa_mla(
+        query=query,
+        key_cache=kv_cache.key,
+        value_cache=None,
+        block_list=hpu_attention_meta.block_list,
+        block_mapping=hpu_attention_meta.block_mapping,
+        block_bias=hpu_attention_meta.attn_bias,
+        block_groups=hpu_attention_meta.block_groups,
+        block_size=BLOCK_SIZE,
+        scale=softmax_scale,
+        matmul_qk_op=qk_matmul,
+        matmul_av_op=av_matmul,
+        batch2block_matmul_op=Matmul(),
+        block2batch_matmul_op=Matmul(),
+        keys_fetch_func=FetchFromCache(1.0 / kv_scales.key_scale_cpu),
+        values_fetch_func=None,
+        kv_lora_rank=kv_lora_rank,
+    )
+    # Split the flat result back into heads; the last axis is inferred.
+    return flat_output.view(batch_size, head_num, -1)
+
+
+# Names exported by `from ... import *`. `FP8Matmul` and `FetchFromCache`
+# are defined in this module but are not part of this list.
+__all__ = [
+    "SUPPORTS_WINDOWING",
+    "attention",
+    "paged_attention",
+    "paged_attention_mla",
+    "set_block_mapping",
+]
--- a/backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py
+++ b/backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py
@ -0,0 +1,205 @@
+from typing import Tuple
+from dataclasses import dataclass, field
+
+import torch
+
+from text_generation_server.models.globals import BLOCK_SIZE
+from text_generation_server.utils.weights import Weights
+
+
+@dataclass
+class KVScales:
+    """
+    Key and value scales for an FP8 KV cache.
+
+    Each scale is kept twice: as the tensor passed in by the caller and as
+    a plain Python float (`*_cpu`) extracted from it with `.item()`. Both
+    forms exist because some consumers want a tensor and others a scalar.
+
+    Raises `ValueError` when either scale holds more than one element.
+    """
+
+    key_scale: torch.Tensor
+    value_scale: torch.Tensor
+    # Derived in `__post_init__`; not accepted by the constructor.
+    key_scale_cpu: float = field(init=False)
+    value_scale_cpu: float = field(init=False)
+
+    def __post_init__(self):
+        scales = (self.key_scale, self.value_scale)
+        if any(scale.numel() != 1 for scale in scales):
+            raise ValueError("Key and value scales must be scalar tensors.")
+
+        self.key_scale_cpu, self.value_scale_cpu = (
+            scale.item() for scale in scales
+        )
+
+
+class KVCache:
+    """
+    Per-layer key/value cache stored as a pair of zero-initialised tensors.
+
+    Both tensors have shape (num_blocks * BLOCK_SIZE, num_heads, head_size).
+    """
+
+    kv_cache: Tuple[torch.Tensor, torch.Tensor]
+
+    def __init__(
+        self,
+        *,
+        num_blocks: int,
+        num_heads: int,
+        head_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ):
+        """Allocate the key and value tensors; `torch.float8_e5m2` is rejected."""
+        ## TODO FP8 kv cache support
+        if dtype is torch.float8_e5m2:
+            raise ValueError("torch.float8_e5m2 is not supported in hpu. ")
+
+        cache_shape = (num_blocks * BLOCK_SIZE, num_heads, head_size)
+        key_store = torch.zeros(cache_shape, dtype=dtype, device=device)
+        value_store = torch.zeros(cache_shape, dtype=dtype, device=device)
+        self.kv_cache = (key_store, value_store)
+
+    @property
+    def dtype(self):
+        """Element type of the cache, read from the key tensor."""
+        return self.kv_cache[0].dtype
+
+    @property
+    def key(self):
+        """The key tensor (first element of the pair)."""
+        return self.kv_cache[0]
+
+    @property
+    def value(self):
+        """The value tensor (second element of the pair)."""
+        return self.kv_cache[1]
+
+    def store(
+        self,
+        *,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        slots: torch.Tensor,
+        kv_scales: KVScales,
+    ):
+        """Write `key` and `value` into the cache rows named by `slots`."""
+        ## TODO FP8 kv cache support
+        paged_reshape_and_cache(
+            key=key,
+            value=value,
+            key_cache=self.kv_cache[0],
+            value_cache=self.kv_cache[1],
+            slots=slots,
+            k_scale=kv_scales.key_scale,
+            v_scale=kv_scales.value_scale,
+        )
+
+
+class KVCompressCache(KVCache):
+    """
+    Cache variant backed by a single tensor instead of a key/value pair.
+
+    The tensor has shape (num_blocks * BLOCK_SIZE, 1, head_size) and is
+    exposed through both `key` and `value`.
+    """
+
+    kv_cache: torch.Tensor
+
+    def __init__(
+        self,
+        *,
+        num_blocks: int,
+        head_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ):
+        """Allocate the zero-initialised cache tensor; `torch.float8_e5m2` is rejected."""
+        ## TODO FP8 kv cache support
+        if dtype is torch.float8_e5m2:
+            raise ValueError("torch.float8_e5m2 is not supported in hpu. ")
+
+        cache_shape = (num_blocks * BLOCK_SIZE, 1, head_size)
+        self.kv_cache = torch.zeros(cache_shape, dtype=dtype, device=device)
+
+    @property
+    def dtype(self):
+        """Element type of the cache tensor."""
+        return self.kv_cache.dtype
+
+    @property
+    def key(self):
+        """The cache tensor."""
+        return self.kv_cache
+
+    @property
+    def value(self):
+        """The same cache tensor that `key` returns."""
+        return self.kv_cache
+
+    def store(
+        self,
+        *,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        slots: torch.Tensor,
+        kv_scales: KVScales,
+    ):
+        """Write `key` into the rows named by `slots`; `value` is not used."""
+        ## TODO FP8 kv cache support
+        payload = key
+        if self.kv_cache.dtype == torch.float8_e4m3fn:
+            # Quantise with the key scale before writing into an FP8 cache.
+            payload = torch.ops.hpu.cast_to_fp8_v2(
+                key, kv_scales.key_scale, False, False, torch.float8_e4m3fn
+            )[0]
+        self.kv_cache.index_copy_(0, slots, payload)
+
+
+def paged_reshape_and_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slots: torch.Tensor,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+):
+    """Copy `key`/`value` into their caches at the rows listed in `slots`.
+
+    When the key cache is `float8_e4m3fn`, both inputs are first cast to
+    that dtype with their respective scale. Both caches are modified in
+    place along dimension 0; nothing is returned.
+    """
+    if key_cache.dtype == torch.float8_e4m3fn:
+
+        def to_fp8(tensor, scale):
+            return torch.ops.hpu.cast_to_fp8_v2(
+                tensor, scale, False, False, torch.float8_e4m3fn
+            )[0]
+
+        key = to_fp8(key, k_scale)
+        value = to_fp8(value, v_scale)
+
+    for cache, fresh in ((key_cache, key), (value_cache, value)):
+        cache.index_copy_(0, slots, fresh)
+
+
+def get_kv_scales(weights: Weights, prefix: str) -> KVScales:
+    """Load the KV cache scales stored under `prefix`.
+
+    Lookup order:
+      1. `{prefix}.k_scale` and `{prefix}.v_scale`, when both exist;
+      2. the single, coarser `{prefix}.kv_scale`, used for key and value;
+      3. a default scale of 1.0 on `weights.device`.
+
+    Every scale is returned as float32 wrapped in `KVScales`.
+    """
+
+    key_scale = torch.tensor(1.0, dtype=torch.float32, device=weights.device)
+    value_scale = key_scale
+    if weights.has_tensor(f"{prefix}.k_scale") and weights.has_tensor(
+        f"{prefix}.v_scale"
+    ):
+        key_scale = weights.get_tensor(f"{prefix}.k_scale", to_dtype=False).float()
+        value_scale = weights.get_tensor(f"{prefix}.v_scale", to_dtype=False).float()
+    elif weights.has_tensor(f"{prefix}.kv_scale"):
+        # Fall back to older more coarse-grained scale when available.
+        # Read it with `to_dtype=False`, exactly like the fine-grained scales
+        # above, so the stored value reaches `.float()` without passing
+        # through another dtype first.
+        # NOTE(review): assumes `to_dtype=True` casts to the model dtype -
+        # confirm against `Weights.get_tensor`.
+        key_scale = weights.get_tensor(f"{prefix}.kv_scale", to_dtype=False).float()
+        value_scale = key_scale
+
+    return KVScales(key_scale=key_scale, value_scale=value_scale)
--- a/backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py
+++ b/backends/gaudi/server/text_generation_server/layers/awq/conversion_utils.py
@ -0,0 +1,97 @@
+import torch
+from typing import List
+
+
+# Permutations applied to each group of eight 4-bit values.
+# REVERSE_AWQ_PACK_ORDER is the inverse permutation of AWQ_PACK_ORDER.
+AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
+REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
+
+
+def pack(imatrix: torch.Tensor, direction: str = "column"):
+    """
+    Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
+    Args:
+        imatrix (torch.Tensor): matrix of integers
+        direction (str): direction of packing, either "column" or "row"
+    Returns:
+        qmatrix (torch.Tensor): packed matrix of integers
+    Raises:
+        ValueError: if `direction` is neither "column" nor "row"
+    """
+    # Fail with a clear message up front; previously an unknown direction
+    # only surfaced as an UnboundLocalError on `qmatrix`.
+    if direction not in ("column", "row"):
+        raise ValueError(
+            f"direction must be 'column' or 'row', got {direction!r}"
+        )
+
+    pack_num = 32 // 4  # eight 4-bit values per 32-bit word
+    shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=imatrix.device)
+
+    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow
+
+    if direction == "column":
+        # Every run of eight consecutive columns becomes one packed column.
+        imatrix = imatrix.view(-1, imatrix.shape[1] // pack_num, pack_num)
+        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1)
+    else:
+        # Every run of eight consecutive rows becomes one packed row.
+        imatrix = imatrix.view(imatrix.shape[0] // pack_num, pack_num, -1)
+        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1)
+
+    return qmatrix.to(torch.int32)
+
+
+def unpack(qmatrix: torch.Tensor, direction: str = "column"):
+    """
+    Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix.
+    Args:
+        qmatrix (torch.Tensor): matrix of packed integers
+        direction (str): direction of unpacking, either "column" or "row"
+    Returns:
+        imatrix (torch.Tensor): matrix of integers
+    Raises:
+        ValueError: if `direction` is neither "column" nor "row"
+    """
+    # Fail with a clear message up front; previously an unknown direction
+    # only surfaced as an UnboundLocalError on `imatrix`.
+    if direction not in ("column", "row"):
+        raise ValueError(
+            f"direction must be 'column' or 'row', got {direction!r}"
+        )
+
+    shifts = torch.arange(0, 32, 4, device=qmatrix.device)
+
+    if direction == "column":
+        # Each packed column expands into eight consecutive columns.
+        imatrix = torch.bitwise_right_shift(
+            qmatrix[:, :, None], shifts[None, None, :]
+        ).view(qmatrix.shape[0], -1)
+    else:
+        # Each packed row expands into eight consecutive rows.
+        imatrix = torch.bitwise_right_shift(
+            qmatrix[:, None, :], shifts[None, :, None]
+        ).view(-1, qmatrix.shape[-1])
+
+    # Keep only the low nibble of every shifted value.
+    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow
+
+    return imatrix
+
+
+def apply_order(
+    imatrix: torch.Tensor,
+    direction: str = "column",
+    order: List[int] = AWQ_PACK_ORDER,
+):
+    """
+    Permutes a 4-bit integer matrix in groups of eight values.
+    Args:
+        imatrix (torch.Tensor): matrix of integers
+        direction (str): "column" permutes within each run of eight
+            elements; "row" views the data as eight rows and permutes them
+        order (List[int]): permutation to apply, default is AWQ_PACK_ORDER
+    Returns:
+        imatrix (torch.Tensor): permuted matrix with the input's shape; the
+            input itself is returned for any other `direction`
+    """
+    group = 32 // 4
+    original_shape = imatrix.shape
+
+    if direction == "column":
+        return imatrix.view(-1, group)[:, order].view(original_shape)
+    if direction == "row":
+        return imatrix.view(group, -1)[order, :].view(original_shape)
+    return imatrix
+
+
+def fast_awq_to_gptq(qweight, qzeros):
+    """Re-pack AWQ weights and zero points; returns `(qweight, qzeros)`.
+
+    Both inputs are unpacked column-wise and permuted with
+    REVERSE_AWQ_PACK_ORDER, 1 is subtracted from the zeros, then the zeros
+    are packed column-wise and the weights row-wise.
+    """
+
+    def unpack_columns(packed):
+        # awq uses column packing for both weights and zeros
+        return unpack(packed, direction="column")
+
+    def undo_awq_order(values):
+        return apply_order(values, direction="column", order=REVERSE_AWQ_PACK_ORDER)
+
+    zeros_unpacked = unpack_columns(qzeros)
+    weights_unpacked = unpack_columns(qweight)
+
+    zeros_ordered = undo_awq_order(zeros_unpacked)
+    weights_ordered = undo_awq_order(weights_unpacked)
+
+    # gptq adds 1 to the zeros, so take 1 off here.
+    zeros_shifted = zeros_ordered - 1
+
+    # exllama uses row packing for weights and column packing for zeros
+    packed_zeros = pack(zeros_shifted, direction="column")
+    packed_weights = pack(weights_ordered, direction="row")
+
+    return packed_weights, packed_zeros
--- a/backends/gaudi/server/text_generation_server/layers/awq/quantize/init.py
+++ b/backends/gaudi/server/text_generation_server/layers/awq/quantize/init.py
@ -0,0 +1,3 @@
+from .hpu import WQLinear
+
+__all__ = ["WQLinear"]
--- a/backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py
+++ b/backends/gaudi/server/text_generation_server/layers/awq/quantize/hpu.py
@ -0,0 +1,134 @@
+from typing import Optional
+import torch
+import torch.nn as nn
+
+# Bind `convert_from_uint4` to the HPU op when the Habana framework imports
+# cleanly. Otherwise bind it to a stub, so the failure is reported when the
+# op is called rather than when this module is imported.
+try:
+    import habana_frameworks.torch.hpu  # noqa: F401
+
+    convert_from_uint4 = torch.ops.hpu.convert_from_uint4
+except Exception as e:
+    # Keep the exception under a module-level name: `e` is unbound once the
+    # except block ends, and the stub below needs it later.
+    hpu_import_exception = e
+
+    def error_raiser_hpu(*args, **kwargs):
+        raise ValueError(
+            f"Trying to use HPU, but could not import the HPU framework with the following error: {hpu_import_exception}"
+        )
+
+    convert_from_uint4 = error_raiser_hpu
+
+# Column permutation applied to every group of eight unpacked values in
+# `reverse_awq_order`.
+AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
+
+
+def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int):
+    """Unpack packed AWQ weights (and zeros, if given) column-wise.
+
+    Every packed element is expanded into `32 // bits` int8 values by
+    right-shifting it by 0, bits, 2*bits, ... and flattening the result
+    along the column axis. The values are NOT masked to `bits` bits here.
+
+    Args:
+        qweight: packed weight matrix.
+        qzeros: packed zero-point matrix, or None.
+        bits: width of each packed value.
+    Returns:
+        `(iweights, izeros)`; `izeros` is None when `qzeros` is None.
+    """
+    # Take the device from `qweight`: `qzeros` may be None (handled below),
+    # and reading `qzeros.device` here used to crash in exactly that case.
+    shifts = torch.arange(0, 32, bits, device=qweight.device)
+
+    # unpacking columnwise
+    iweights = torch.bitwise_right_shift(qweight[:, :, None], shifts[None, None, :]).to(
+        torch.int8  # smallest dtype available
+    )
+    iweights = iweights.view(iweights.shape[0], -1)
+
+    # unpacking columnwise
+    if qzeros is not None:
+        izeros = torch.bitwise_right_shift(
+            qzeros[:, :, None], shifts[None, None, :]
+        ).to(
+            torch.int8  # smallest dtype available
+        )
+        izeros = izeros.view(izeros.shape[0], -1)
+    else:
+        izeros = qzeros
+
+    return iweights, izeros
+
+
+def reverse_awq_order(iweights: torch.Tensor, izeros: torch.Tensor, bits: int):
+    """Undo the AWQ column interleaving on unpacked weights (and zeros).
+
+    A column index is built that permutes every group of `32 // bits`
+    columns with AWQ_REVERSE_ORDER, and it is applied to `iweights` and,
+    when present, to `izeros`.
+
+    NOTE(review): AWQ_REVERSE_ORDER has eight entries, so this appears to be
+    meaningful only for `bits == 4` - confirm before using other widths.
+
+    Returns:
+        `(iweights, izeros)`; `izeros` stays None when it was None.
+    """
+    # Take the device from `iweights`: `izeros` may be None (handled below),
+    # and reading `izeros.device` here used to crash in exactly that case.
+    reverse_order_tensor = torch.arange(
+        iweights.shape[-1],
+        dtype=torch.int32,
+        device=iweights.device,
+    )
+    reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
+    reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER]
+    reverse_order_tensor = reverse_order_tensor.view(-1)
+
+    if izeros is not None:
+        izeros = izeros[:, reverse_order_tensor]
+    iweights = iweights[:, reverse_order_tensor]
+
+    return iweights, izeros
+
+
+def unpack_weight_and_zeros(qweight, qzeros, bits):
+    """Unpack AWQ `qweight`/`qzeros`, restore column order and mask to `bits` bits.
+
+    Returns:
+        `(iweight, izeros)`; `izeros` is None when `qzeros` is None.
+    """
+    # Unpack the qweight and qzeros tensors
+    iweight, izeros = unpack_awq(qweight, qzeros, bits)
+    # Reverse the order of the iweight and izeros tensors
+    iweight, izeros = reverse_awq_order(iweight, izeros, bits)
+
+    # overflow checks
+    value_mask = (2**bits) - 1
+    iweight = torch.bitwise_and(iweight, value_mask)
+    # Both helpers above pass a missing zeros tensor through as None, so do
+    # the same here instead of failing inside `bitwise_and`.
+    if izeros is not None:
+        izeros = torch.bitwise_and(izeros, value_mask)
+
+    return iweight, izeros
+
+
+def pack_tensor(input, bits=4):
+    """Pack `bits`-wide integers column-wise into int32 words.
+
+    Each output column ORs together `32 // bits` consecutive input columns;
+    the j-th column of a group is shifted left by `bits * j`.
+
+    Args:
+        input: 2-D integer tensor of values to pack.
+        bits: width of each value inside a packed word.
+    Returns:
+        int32 tensor of shape (rows, columns // (32 // bits)) on the device
+        of `input`. Trailing columns that do not fill a whole word are
+        dropped.
+    """
+    values_per_word = 32 // bits
+    normal = input.to(torch.int32)
+    # One output column per complete group of `values_per_word` columns.
+    # The earlier `shape[1] // 32 * bits` floored to a multiple of 32 first,
+    # so e.g. 40 columns produced 4 words instead of 5 and 8 columns none.
+    num_words = normal.shape[1] // values_per_word
+    q = torch.zeros(
+        (normal.shape[0], num_words),
+        dtype=torch.int32,
+        device=input.device,
+    )
+    for col in range(num_words):
+        base = col * values_per_word
+        for offset in range(values_per_word):
+            q[:, col] |= normal[:, base + offset] << (bits * offset)
+    return q
+
+
+class WQLinear(nn.Module):
+    """Linear layer over 4-bit AWQ weights, dequantised on every forward pass.
+
+    At construction the AWQ-packed `qweight`/`qzeros` are unpacked,
+    reordered and re-packed once (see `_preprocessing`). `forward` expands
+    them with `convert_from_uint4` and performs a regular matmul.
+    """
+
+    def __init__(
+        self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor]
+    ):
+        super().__init__()
+
+        if w_bit not in (4,):
+            raise NotImplementedError("Only 4-bit are supported for now.")
+
+        in_features = qweight.shape[0]
+        self.in_features = in_features
+        self.out_features = qweight.shape[1] * 32 // w_bit
+
+        self.w_bit = w_bit
+        # A group size of -1 means one group spanning all input features.
+        self.group_size = in_features if group_size == -1 else group_size
+        # quick sanity check (make sure alignment)
+        assert self.in_features % self.group_size == 0
+        assert self.out_features % (32 // self.w_bit) == 0
+
+        self.qweight = qweight
+        self.qzeros = qzeros
+        self.scales = scales
+        self.bias = bias
+        self._preprocessing()
+
+    def _preprocessing(self):
+        """Re-pack weights and zeros on the CPU, then move them back to the original device."""
+        target_device = self.qweight.device
+        unpacked_weight, unpacked_zeros = unpack_weight_and_zeros(
+            self.qweight.cpu(), self.qzeros.cpu(), self.w_bit
+        )
+        self.qweight = pack_tensor(unpacked_weight).to(target_device)
+        self.qzeros = pack_tensor(unpacked_zeros).to(target_device)
+
+    @torch.no_grad()
+    def forward(self, x):
+        """Apply the layer to `x`, keeping every leading dimension of the input."""
+        flat_x = x.reshape(-1, x.shape[-1])
+        dequantized = convert_from_uint4(
+            self.qweight, self.scales, self.qzeros, x.dtype
+        )
+        result = torch.matmul(flat_x, dequantized)
+
+        if self.bias is not None:
+            result = result + self.bias
+        return result.reshape(x.shape[:-1] + (self.out_features,))
--- a/backends/gaudi/server/text_generation_server/layers/bnb.py
+++ b/backends/gaudi/server/text_generation_server/layers/bnb.py
@ -0,0 +1,124 @@
+from dataclasses import dataclass
+
+import bitsandbytes as bnb
+import torch
+from bitsandbytes.nn import Int8Params, Params4bit
+from text_generation_server.utils.weights import UnquantizedWeight
+
+
+@dataclass
+class BNBWeight(UnquantizedWeight):
+    """Weight wrapper whose linear layer is a `Linear8bitLt`."""
+
+    weight: torch.Tensor
+
+    def get_linear(self, bias: torch.Tensor):
+        """Build the 8-bit linear layer (no fp16 weights, threshold 6.0)."""
+        layer = Linear8bitLt(
+            self.weight,
+            bias,
+            has_fp16_weights=False,
+            threshold=6.0,
+        )
+        return layer
+
+
+class Linear8bitLt(torch.nn.Module):
+    """Linear layer computed with `bnb.matmul` over an `Int8Params` weight.
+
+    The layer owns a `bnb.MatmulLtState`. On the first forward pass the
+    `CB`/`SCB` attributes of the weight are moved onto that state (see
+    `init_8bit_state`).
+    """
+
+    def __init__(
+        self,
+        weight,
+        bias,
+        has_fp16_weights=True,
+        memory_efficient_backward=False,
+        threshold=0.0,
+        index=None,
+    ):
+        super().__init__()
+        # The argument is kept for signature compatibility only; passing a
+        # truthy value is rejected.
+        assert (
+            not memory_efficient_backward
+        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
+        self.state = bnb.MatmulLtState()
+        self.index = index
+
+        # Necessary for stacked layers
+        self.state.threshold = threshold
+        self.state.has_fp16_weights = has_fp16_weights
+        self.state.memory_efficient_backward = memory_efficient_backward
+        if threshold > 0.0 and not has_fp16_weights:
+            self.state.use_pool = True
+
+        self.weight = Int8Params(
+            weight.data,
+            has_fp16_weights=has_fp16_weights,
+            requires_grad=has_fp16_weights,
+        )
+        # NOTE(review): the return value of `.cuda(...)` is discarded, so
+        # this relies on the call acting on `self.weight` in place - confirm
+        # against the installed bitsandbytes version.
+        self.weight.cuda(weight.device)
+        self.bias = bias
+
+    def init_8bit_state(self):
+        """Move `CB` and `SCB` from the weight onto the matmul state and clear them on the weight."""
+        self.state.CB = self.weight.CB
+        self.state.SCB = self.weight.SCB
+        self.weight.CB = None
+        self.weight.SCB = None
+
+    def forward(self, x: torch.Tensor):
+        """Return `bnb.matmul(x, weight, bias, state)`, initialising the state first if needed."""
+        self.state.is_training = self.training
+        # `CB` still on the weight means the state has not been initialised.
+        if self.weight.CB is not None:
+            self.init_8bit_state()
+
+        # weights are cast automatically as Int8Params, but the bias has to be cast manually
+        if self.bias is not None and self.bias.dtype != x.dtype:
+            self.bias.data = self.bias.data.to(x.dtype)
+
+        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
+
+        if not self.state.has_fp16_weights:
+            if self.state.CB is not None and self.state.CxB is not None:
+                # we converted 8-bit row major to turing/ampere format in the first inference pass
+                # we no longer need the row-major weight
+                del self.state.CB
+                self.weight.data = self.state.CxB
+        return out
+
+
+@dataclass
+class BNBFP4Weight(UnquantizedWeight):
+    """Weight wrapper whose linear layer is a `Linear4bit` with quant type "fp4"."""
+
+    weight: torch.Tensor
+
+    def get_linear(self, bias: torch.Tensor):
+        """Build the 4-bit linear layer for this weight and `bias`."""
+        layer = Linear4bit(self.weight, bias, quant_type="fp4")
+        return layer
+
+
+@dataclass
+class BNBNF4Weight(UnquantizedWeight):
+    """Weight wrapper whose linear layer is a `Linear4bit` with quant type "nf4"."""
+
+    weight: torch.Tensor
+
+    def get_linear(self, bias: torch.Tensor):
+        """Build the 4-bit linear layer for this weight and `bias`."""
+        layer = Linear4bit(self.weight, bias, quant_type="nf4")
+        return layer
+
+
+class Linear4bit(torch.nn.Module):
+    """Linear layer computed with `bnb.matmul_4bit` over a `Params4bit` weight.
+
+    `quant_type` is forwarded to `Params4bit`. The output is cast back to
+    the dtype of the input.
+    """
+
+    def __init__(self, weight, bias, quant_type):
+        super().__init__()
+        self.weight = Params4bit(
+            weight.data,
+            requires_grad=False,
+            compress_statistics=True,
+            quant_type=quant_type,
+        )
+        # Never assigned anything but None inside this class.
+        self.compute_dtype = None
+        # NOTE(review): the return value of `.cuda(...)` is discarded, so
+        # this relies on the call acting on `self.weight` in place - confirm
+        # against the installed bitsandbytes version.
+        self.weight.cuda(weight.device)
+        self.bias = bias
+
+    def forward(self, x: torch.Tensor):
+        """Return the 4-bit matmul of `x` with the transposed weight, in the dtype of `x`."""
+        # weights are cast automatically as Int8Params, but the bias has to be cast manually
+        # NOTE(review): the weight here is a `Params4bit`; the line above
+        # appears to be carried over from the 8-bit layer.
+        if self.bias is not None and self.bias.dtype != x.dtype:
+            self.bias.data = self.bias.data.to(x.dtype)
+
+        # A missing quant state is only reported on stdout; execution
+        # continues and `quant_state` is still read below.
+        if getattr(self.weight, "quant_state", None) is None:
+            print(
+                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
+            )
+        inp_dtype = x.dtype
+        if self.compute_dtype is not None:
+            x = x.to(self.compute_dtype)
+
+        # NOTE(review): with `compute_dtype` left at None this calls
+        # `bias.to(None)` - confirm that is the intended no-op.
+        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
+        out = bnb.matmul_4bit(
+            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
+        )
+
+        out = out.to(inp_dtype)
+
+        return out
--- a/backends/gaudi/server/text_generation_server/layers/compressed_tensors/init.py
+++ b/backends/gaudi/server/text_generation_server/layers/compressed_tensors/init.py
@ -0,0 +1,3 @@
+from .loader import CompressedTensorsLoader
+
+__all__ = ["CompressedTensorsLoader"]
--- a/backends/gaudi/server/text_generation_server/layers/compressed_tensors/loader.py
+++ b/backends/gaudi/server/text_generation_server/layers/compressed_tensors/loader.py
@ -0,0 +1,169 @@
+from typing import Any, Dict, List, Union
+
+from compressed_tensors import QuantizationConfig, QuantizationStatus
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import (
+    QuantizationScheme,
+    QuantizationType,
+    find_name_or_class_matches,
+)
+from loguru import logger
+from pydantic import ValidationError
+from torch import nn
+
+from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
+from text_generation_server.utils.log import log_once
+from text_generation_server.utils.weights import (
+    DefaultWeightsLoader,
+    UnquantizedWeight,
+    Weights,
+    WeightsLoader,
+)
+
+# compressed-tensors can match modules as quantization targets. However,
+# they need to be objects rather than classes or class names. Since we
+# need to match `Linear` targets, make an instance that can be re-used.
+_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)
+
+
+class CompressedTensorsLoader(WeightsLoader):
+    """Loader for checkpoints stored in the compressed-tensors format.
+
+    The quantization configuration is parsed once at construction. Every
+    quantization target gets its own loader, and each `get_*` method
+    forwards to the loader that matches the requested prefix.
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """Parse the configuration and build the per-target loaders.
+
+        Raises:
+            ValueError: if the checkpoint carries no compressed-tensors
+                configuration, the configuration cannot be parsed, or the
+                quantization status is neither COMPRESSED nor FROZEN.
+        """
+        quantization_config_raw = config.get("quantization_config")
+        if quantization_config_raw is None:
+            # `compression_config` was renamed to `quantization_config`; support
+            # retained for backward compatibility.
+            quantization_config_raw = config.get("compression_config")
+        if quantization_config_raw is None:
+            raise ValueError(
+                "Checkpoint does not have compressed-tensors configuration"
+            )
+
+        try:
+            quantization_config = QuantizationConfig.model_validate(
+                quantization_config_raw
+            )
+        except ValidationError as e:
+            raise ValueError("Cannot parse compressed-tensors configuration") from e
+
+        if quantization_config.quantization_status not in (
+            QuantizationStatus.COMPRESSED,
+            QuantizationStatus.FROZEN,
+        ):
+            raise ValueError(
+                f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
+            )
+
+        # Prefixes matching an entry here are loaded unquantized.
+        self.ignore = (
+            quantization_config.ignore if quantization_config.ignore is not None else []
+        )
+        self.loaders = self._get_target_loaders(quantization_config)
+
+        for target, loader in self.loaders.items():
+            log_once(
+                logger.info,
+                f"Using {loader} for compressed-tensors target '{target}'",
+            )
+
+    def get_weights(self, weights: Weights, prefix: str):
+        """Forward to the loader selected for `prefix`."""
+        loader = self._lookup_loader(prefix)
+        return loader.get_weights(weights, prefix)
+
+    def get_weights_col_packed(
+        self,
+        weights: "Weights",
+        prefix: str,
+        block_sizes: Union[int, List[int]],
+    ):
+        """Forward to the loader selected for `prefix`."""
+        loader = self._lookup_loader(prefix)
+        return loader.get_weights_col_packed(weights, prefix, block_sizes)
+
+    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
+        """Forward to the loader selected for the first prefix only."""
+        loader = self._lookup_loader(prefixes[0])
+        return loader.get_multi_weights_col(weights, prefixes, dim)
+
+    def get_multi_weights(self, weights: Weights, prefixes: List[str], dim: int):
+        """Forward to the loader selected for the first prefix only."""
+        loader = self._lookup_loader(prefixes[0])
+        return loader.get_multi_weights(weights, prefixes, dim)
+
+    def get_weights_row(self, weights: Weights, prefix: str):
+        """Forward to the loader selected for `prefix`."""
+        loader = self._lookup_loader(prefix)
+        return loader.get_weights_row(weights, prefix)
+
+    def _get_target_loaders(
+        self, quantization_config: QuantizationConfig
+    ) -> Dict[str, WeightsLoader]:
+        """
+        A compressed-tensors checkpoint can use different quantizations
+        for different targets. This method returns a dictionary with a
+        loader per target.
+
+        Raises `ValueError` when a target appears in more than one group.
+        """
+
+        loaders: Dict[str, WeightsLoader] = {}
+
+        format = quantization_config.format
+
+        for group_name, group in quantization_config.config_groups.items():
+            # The group configuration can be a string, but does that ever
+            # happen in a serialized quantization config?
+            assert isinstance(group, QuantizationScheme)
+
+            loader = self._create_loader_for_group(format, group_name, group)
+
+            # A quantized parameter group can have multiple targets, add the
+            # loader for all the targets.
+            for target in group.targets:
+                if target in loaders:
+                    # The closing quote belongs right after the target name.
+                    raise ValueError(
+                        f"Target '{target}' has multiple configured loaders"
+                    )
+                loaders[target] = loader
+
+        return loaders
+
+    def _create_loader_for_group(
+        self, format: str, group_name: str, group: QuantizationScheme
+    ) -> WeightsLoader:
+        """
+        Find and create a loader for the group with the given quantization
+        scheme.
+
+        Only 8-bit float weights in the float-quantized or naive-quantized
+        format are supported; anything else raises `ValueError`.
+        """
+        # NOTE: we ignore group.output_activations because we don't support
+        #       output quantization yet.
+
+        input_activations = group.input_activations
+        weights = group.weights
+        if (
+            format
+            in {
+                CompressionFormat.float_quantized.value,
+                CompressionFormat.naive_quantized.value,
+            }
+            and weights is not None
+            and weights.type == QuantizationType.FLOAT
+            and weights.num_bits == 8
+        ):
+            # FP W8A8 or W8A16.
+            return W8ANFpLoader(input_activations=input_activations, weights=weights)
+        else:
+            raise ValueError(
+                f"Group '{group_name}' has unsupported compressed-tensors configuration"
+            )
+
+    def _lookup_loader(self, prefix: str) -> WeightsLoader:
+        """
+        Look up the loader to use for a given parameter name (prefix).
+
+        Prefixes matching the ignore list get an unquantized default
+        loader. If several targets match, the first one wins. Raises
+        `ValueError` when no target matches.
+        """
+
+        if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
+            return DefaultWeightsLoader(UnquantizedWeight)
+
+        # We currently only handle linear layers, so unconditionally pass
+        # a `Linear` instance.
+        targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
+        if len(targets) == 0:
+            raise ValueError(
+                f"Cannot find compressed-tensors target for prefix: {prefix}"
+            )
+        return self.loaders[targets[0]]
--- a/backends/gaudi/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
+++ b/backends/gaudi/server/text_generation_server/layers/compressed_tensors/w8an_fp.py
@ -0,0 +1,253 @@
+from typing import List, Optional, Union
+
+import torch
+from compressed_tensors.quantization import QuantizationArgs, QuantizationType
+
+from text_generation_server.layers.fp8 import (
+    Fp8Weight,
+    _load_scalar_or_matrix_scale,
+    requantize_with_max_scale,
+)
+from text_generation_server.utils.weights import Weights, WeightsLoader
+
+
+class W8ANFpLoader(WeightsLoader):
+    """
+    Loader for W8A8/W8A16 FP compressed-tensors parameters.
+
+    Every `get_*` method returns an `Fp8Weight`. Weight scales are only
+    loaded when the weight quantization is static, input scales only when
+    input activations are configured and static.
+    """
+
+    def __init__(
+        self,
+        *,
+        input_activations: Optional[QuantizationArgs],
+        weights: QuantizationArgs,
+    ):
+        """
+        Derive the loading flags from the quantization arguments.
+
+        Args:
+            input_activations: quantization arguments of the inputs, or
+                `None` when inputs are not quantized.
+            weights: quantization arguments of the weights; must describe
+                8-bit float quantization.
+        """
+        assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8
+
+        # We ignore the `strategy` option which sets the scales to be
+        # per-tensor, per-channel or per-token. What scales are supported
+        # is dependent on the kernels used (e.g. cutlass can do tokenwise,
+        # Torch cannot, and FP8-Marlin does not quantize inputs at all).
+        # So, instead we try to use the best-possible configuration.
+
+        # Static (non-dynamic) scales are stored in the checkpoint and have
+        # to be loaded.
+        self.load_weight_scale = not weights.dynamic
+        self.load_input_scale = (
+            input_activations is not None and not input_activations.dynamic
+        )
+        # 16-bit input activations select the W8A16 mode.
+        self.force_w8a16 = (
+            input_activations is not None and input_activations.num_bits == 16
+        )
+
+    def __str__(self) -> str:
+        """Describe the loader: quantization type and scale modes."""
+
+        def scale_to_str(scale):
+            # A scale that is loaded from the checkpoint is a static scale.
+            return "static" if scale else "dynamic"
+
+        quantization_type = f"W8A{16 if self.force_w8a16 else 8}"
+
+        return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})"
+
+    def get_weights(self, weights: "Weights", prefix: str):
+        """
+        Load the full tensor `{prefix}.weight` and its optional scales.
+
+        When a weight scale is loaded, it is flattened, expanded to the
+        size of the first dimension of the weight and the weight is
+        requantized through `requantize_with_max_scale`.
+        """
+        w = weights.get_tensor(f"{prefix}.weight")
+
+        weight_scale = None
+        if self.load_weight_scale:
+            weight_scale = (
+                weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
+                .reshape(-1)
+                .expand(w.shape[0])
+            )
+            # The whole tensor is treated as a single logical weight.
+            logical_widths = [w.shape[0]]
+            w, weight_scale = requantize_with_max_scale(
+                w,
+                weight_scale.unsqueeze(-1).to(weights.device),
+                logical_widths,
+                weights.dtype,
+            )
+
+        input_scale = None
+        if self.load_input_scale:
+            input_scale = weights.get_tensor(
+                f"{prefix}.input_scale", to_dtype=False
+            ).reshape(-1)
+
+        return Fp8Weight(
+            weight=w,
+            weight_scale=weight_scale,
+            input_scale=input_scale,
+            dtype=weights.dtype,
+            force_w8a16=self.force_w8a16,
+        )
+
+    def get_weights_col_packed(
+        self,
+        weights: Weights,
+        prefix: str,
+        block_sizes: Union[int, List[int]],
+    ):
+        """
+        Load a packed weight `{prefix}.weight` sharded along dimension 0.
+
+        Scales with more than one element are loaded with the same packed
+        sharding as the weight; single-element scales are used as-is. The
+        input scale is reduced to its maximum.
+        """
+        w = weights.get_packed_sharded(
+            f"{prefix}.weight", dim=0, block_sizes=block_sizes
+        )
+
+        weight_scale = None
+        if self.load_weight_scale:
+            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
+            if weight_scale.numel() > 1:
+                # Non-scalar scale: reload it sharded like the weight.
+                weight_scale = weights.get_packed_sharded(
+                    f"{prefix}.weight_scale",
+                    dim=0,
+                    block_sizes=block_sizes,
+                    to_dtype=False,
+                )
+            weight_scale = weight_scale.reshape(-1).expand(w.shape[0])
+            logical_widths = [w.shape[0]]
+            w, weight_scale = requantize_with_max_scale(
+                w,
+                weight_scale.unsqueeze(-1).to(weights.device),
+                logical_widths,
+                weights.dtype,
+            )
+
+        input_scale = None
+        if self.load_input_scale:
+            input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
+            if input_scale.numel() > 1:
+                input_scale = weights.get_packed_sharded(
+                    f"{prefix}.input_scale",
+                    dim=0,
+                    block_sizes=block_sizes,
+                    to_dtype=False,
+                )
+            input_scale = input_scale.reshape(-1).max()
+
+        return Fp8Weight(
+            weight=w,
+            weight_scale=weight_scale,
+            input_scale=input_scale,
+            dtype=weights.dtype,
+            force_w8a16=self.force_w8a16,
+        )
+
+    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
+        """
+        Load the weights of several prefixes, each sharded along dimension
+        0, and concatenate them along `dim`.
+
+        Input scales must be present for all prefixes or for none of them;
+        when present they are reduced to a single maximum.
+        """
+        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
+        w = [
+            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
+        ]
+        # Shapes of the individual shards, recorded before concatenation.
+        shapes = [x.shape for x in w]
+
+        # Concat then send to the device
+        w = torch.cat(w, dim=dim).to(weights.device)
+
+        weight_scale = None
+        if self.load_weight_scale:
+            weight_scale = [
+                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
+                for p, shape in zip(prefixes, shapes)
+            ]
+            weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)
+            # One logical width per prefix: the first dimension of its shard.
+            logical_widths = [x[0] for x in shapes]
+            w, weight_scale = requantize_with_max_scale(
+                w,
+                weight_scale.unsqueeze(-1).to(weights.device),
+                logical_widths,
+                weights.dtype,
+            )
+
+        input_scale = None
+        if self.load_input_scale:
+            input_scale = [
+                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
+                for p, shape in zip(prefixes, shapes)
+                if weights.has_tensor(f"{p}.input_scale")
+            ]
+            # Either every prefix has an input scale or none has.
+            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
+            input_scale = (
+                torch.cat(input_scale, dim=0).reshape(-1).max()
+                if len(input_scale) != 0
+                else None
+            )
+
+        return Fp8Weight(
+            weight=w,
+            weight_scale=weight_scale,
+            input_scale=input_scale,
+            dtype=weights.dtype,
+            force_w8a16=self.force_w8a16,
+        )
+
+    def get_multi_weights(self, weights: "Weights", prefixes: List[str], dim: int):
+        """
+        Load the full (unsharded) weights of several prefixes and
+        concatenate them along `dim`.
+
+        Scales are flattened and expanded to the first dimension of the
+        matching weight before being concatenated.
+        """
+        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
+        w = [weights.get_tensor(f"{p}.weight", to_device=False) for p in prefixes]
+        # Shapes of the individual tensors, recorded before concatenation.
+        shapes = [x.shape for x in w]
+
+        # Concat then send to the device
+        w = torch.cat(w, dim=dim).to(weights.device)
+
+        weight_scale = None
+
+        if self.load_weight_scale:
+            weight_scale = [
+                weights.get_tensor(f"{p}.weight_scale", to_dtype=False)
+                .reshape(-1)
+                .expand(shape[0])
+                for p, shape in zip(prefixes, shapes)
+            ]
+            weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)
+            logical_widths = [x[0] for x in shapes]
+            w, weight_scale = requantize_with_max_scale(
+                w,
+                weight_scale.unsqueeze(-1).to(weights.device),
+                logical_widths,
+                weights.dtype,
+            )
+
+        input_scale = None
+        if self.load_input_scale:
+            input_scale = [
+                weights.get_tensor(f"{p}.input_scale", to_dtype=False)
+                .reshape(-1)
+                .expand(shape[0])
+                for p, shape in zip(prefixes, shapes)
+                if weights.has_tensor(f"{p}.input_scale")
+            ]
+            # Either every prefix has an input scale or none has.
+            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
+            input_scale = (
+                torch.cat(input_scale, dim=0).reshape(-1).max()
+                if len(input_scale) != 0
+                else None
+            )
+
+        return Fp8Weight(
+            weight=w,
+            weight_scale=weight_scale,
+            input_scale=input_scale,
+            dtype=weights.dtype,
+            force_w8a16=self.force_w8a16,
+        )
+
+    def get_weights_row(self, weights: "Weights", prefix: str):
+        """
+        Load `{prefix}.weight` sharded along dimension 1 together with its
+        optional scales.
+
+        The scales are loaded in full with `get_tensor`; only the weight
+        itself is sharded.
+        """
+        w = weights.get_sharded(f"{prefix}.weight", dim=1)
+        weight_scale = None
+        if self.load_weight_scale:
+            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
+            weight_scale = weight_scale.reshape(-1).expand(w.shape[0])
+            logical_widths = [w.shape[0]]
+            w, weight_scale = requantize_with_max_scale(
+                w,
+                weight_scale.unsqueeze(-1).to(weights.device),
+                logical_widths,
+                weights.dtype,
+            )
+
+        input_scale = None
+        if self.load_input_scale:
+            input_scale = weights.get_tensor(
+                f"{prefix}.input_scale", to_dtype=False
+            ).reshape(-1)
+
+        return Fp8Weight(
+            weight=w,
+            weight_scale=weight_scale,
+            input_scale=input_scale,
+            dtype=weights.dtype,
+            force_w8a16=self.force_w8a16,
+        )
--- a/backends/gaudi/server/text_generation_server/layers/conv.py
+++ b/backends/gaudi/server/text_generation_server/layers/conv.py
@ -0,0 +1,41 @@
+from accelerate import init_empty_weights
+import torch
+
+
+@classmethod
+def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
+    """
+    Build a convolution layer of type `cls` whose weight and bias are read
+    from `weights` under `{prefix}.weight` and `{prefix}.bias`.
+
+    The module is instantiated under `init_empty_weights()` and its
+    parameters are then replaced by the loaded tensors.
+    """
+    kernel = weights.get_tensor(f"{prefix}.weight")
+    offset = weights.get_tensor(f"{prefix}.bias")
+
+    conv_kwargs = dict(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+    )
+    with init_empty_weights():
+        layer = cls(**conv_kwargs)
+
+    layer.weight = torch.nn.Parameter(kernel)
+    layer.bias = torch.nn.Parameter(offset)
+    return layer
+
+
+@classmethod
+def load_conv2d_no_bias(
+    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
+):
+    """
+    Build a bias-free convolution layer of type `cls` whose weight is read
+    from `weights` under `{prefix}.weight`.
+
+    The module is instantiated under `init_empty_weights()`; afterwards its
+    weight is replaced by the loaded tensor and its bias is set to `None`.
+    """
+    kernel = weights.get_tensor(f"{prefix}.weight")
+
+    conv_kwargs = dict(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+    )
+    with init_empty_weights():
+        layer = cls(**conv_kwargs)
+
+    layer.weight = torch.nn.Parameter(kernel)
+    layer.bias = None
+    return layer
+
+
+# Attach the loaders to `torch.nn.Conv2d` as class methods so that they can
+# be called as `torch.nn.Conv2d.load(...)` / `torch.nn.Conv2d.load_no_bias(...)`.
+torch.nn.Conv2d.load = load_conv2d
+torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
--- a/backends/gaudi/server/text_generation_server/layers/exl2.py
+++ b/backends/gaudi/server/text_generation_server/layers/exl2.py
@ -0,0 +1,78 @@
+from dataclasses import dataclass
+from typing import List, Union
+
+import torch
+from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
+
+
+@dataclass
+class Exl2Weight(Weight):
+    """
+    Exllama2 exl2 quantized weights.
+
+    The fields hold the tensors stored in the checkpoint under the
+    identically named suffixes (`q_weight`, `q_scale`, ...).
+    """
+
+    q_weight: torch.Tensor
+    q_scale: torch.Tensor
+    q_invperm: torch.Tensor
+    q_scale_max: torch.Tensor
+    q_groups: torch.Tensor
+
+    def __post_init__(self):
+        # NOTE: the division is done in place, so the tensor passed in as
+        # `q_scale_max` is modified as well.
+        self.q_scale_max /= 256
+        # Store the inverse permutation as 16-bit integers.
+        self.q_invperm = self.q_invperm.short()
+
+    @property
+    def device(self) -> torch.device:
+        """Device on which `q_weight` is stored."""
+        return self.q_weight.device
+
+    def get_linear(self, bias: torch.Tensor):
+        """Wrap these weights and `bias` in an `ExllamaQuantLinear` layer."""
+        # Imported at call time rather than at module import time.
+        from text_generation_server.layers.gptq import ExllamaQuantLinear
+
+        return ExllamaQuantLinear(self, bias)
+
+
+class Exl2WeightsLoader(WeightsLoader):
+    """Weights loader for checkpoints that are quantized with exl2."""
+
+    def get_weights(self, weights: "Weights", prefix: str):
+        """
+        Load the exl2 tensors stored under `prefix`, without applying
+        tensor parallelism.
+
+        Raises:
+            RuntimeError: if `{prefix}.q_weight` cannot be loaded.
+        """
+        try:
+            q_weight = weights.get_tensor(f"{prefix}.q_weight")
+        except RuntimeError:
+            raise RuntimeError(
+                "Cannot load `exl2`-quantized weight, make sure the model is already quantized."
+            )
+
+        # The remaining tensors are loaded in declaration order.
+        remaining = {
+            name: weights.get_tensor(f"{prefix}.{name}")
+            for name in ("q_scale", "q_invperm", "q_scale_max", "q_groups")
+        }
+        return Exl2Weight(q_weight=q_weight, **remaining)
+
+    def get_weights_col_packed(
+        self,
+        weights: Weights,
+        prefix: str,
+        block_sizes: Union[int, List[int]],
+    ):
+        """Packed column weights cannot be loaded; always raises."""
+        raise RuntimeError("Column-packed weights are not supported for exl")
+
+    def get_weights_col(self, weights: Weights, prefix: str):
+        """Return the unsharded weights, since sharding is not supported yet."""
+        return self.get_weights(weights, prefix)
+
+    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
+        """Concatenated column weights cannot be loaded; always raises."""
+        raise ValueError("get_multi_weights_col is not supported for exl2")
+
+    def get_weights_row(self, weights: Weights, prefix: str):
+        """Return the unsharded weights, since sharding is not supported yet."""
+        return self.get_weights(weights, prefix)
--- a/backends/gaudi/server/text_generation_server/layers/fp8.py
+++ b/backends/gaudi/server/text_generation_server/layers/fp8.py
@ -0,0 +1,655 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple, Type, Union, List
+
+import torch
+
+from text_generation_server.utils.weights import (
+    Weight,
+    WeightsLoader,
+    UnquantizedWeight,
+    Weights,
+)
+
+from vllm_hpu_extension.ops import scaled_fp8_quant
+from vllm_hpu_extension.scales import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
+
+# FP8 dtype used for quantization.
+quant_dtype: torch.dtype = torch.float8_e4m3fn
+# Largest finite FP8 value, used to derive dynamic scales. On Gaudi2 the
+# maximum of `float8_e4m3fnuz` is used instead of that of `float8_e4m3fn`.
+FP8_MAX = torch.finfo(torch.float8_e4m3fn).max
+if is_hpu_gaudi2():
+    FP8_MAX = torch.finfo(torch.float8_e4m3fnuz).max
+
+
+def pad_weight(weight, block_size):
+    """
+    Zero-pad the last two dimensions of `weight` up to multiples of
+    `block_size`.
+
+    Args:
+        weight: tensor with at least two dimensions.
+        block_size: pair `(block_size_m, block_size_n)` for the last two
+            dimensions.
+
+    Returns:
+        `(padded_weight, M, N)` where `M` and `N` are the sizes of the last
+        two dimensions before padding (needed for unpadding). When no
+        padding is required the input tensor itself is returned.
+    """
+    rows, cols = weight.shape[-2:]
+    block_rows, block_cols = block_size
+
+    # Number of rows/columns missing to reach the next multiple.
+    extra_rows = -rows % block_rows
+    extra_cols = -cols % block_cols
+
+    if extra_rows == 0 and extra_cols == 0:
+        return weight, rows, cols
+
+    # `pad` takes the padding of the last dimension first.
+    padding = (0, extra_cols, 0, extra_rows)
+    padded = torch.nn.functional.pad(weight, padding, mode="constant", value=0)
+    return padded, rows, cols
+
+
+def unpad_weight(weight, original_M, original_N, keep_first_dim=False):
+    """
+    Slice `weight` back to `original_M` x `original_N`.
+
+    The tensor is returned unchanged when its last two dimensions already
+    have the original sizes. With `keep_first_dim` the first dimension is
+    kept whole and the following two dimensions are sliced; otherwise the
+    first two dimensions are sliced.
+    """
+    already_original = (
+        weight.shape[-2] == original_M and weight.shape[-1] == original_N
+    )
+    if already_original:
+        return weight
+    if not keep_first_dim:
+        return weight[:original_M, :original_N]
+    return weight[:, :original_M, :original_N]
+
+
+def pad_block_fp8_weight_naive(weight, weight_scale, block_size):
+    """
+    Pad `weight` to multiples of `block_size` and check that the padded
+    shape agrees with the shape of `weight_scale`.
+
+    Returns:
+        `(padded_weight, orig_M, orig_N)` as produced by `pad_weight`.
+
+    Raises:
+        AssertionError: if `block_size` does not have two entries or the
+            number of blocks does not match the last two dimensions of
+            `weight_scale`.
+    """
+    assert len(block_size) == 2
+
+    block_rows, block_cols = block_size
+    scale_rows, scale_cols = weight_scale.shape[-2:]
+
+    padded, orig_M, orig_N = pad_weight(weight, block_size)
+    padded_rows, padded_cols = padded.shape[-2:]
+
+    # One scale entry is expected per block of the padded weight.
+    assert scale_rows == padded_rows // block_rows
+    assert scale_cols == padded_cols // block_cols
+
+    return padded, orig_M, orig_N
+
+
+def dynamic_quant(data, single_scale=False):
+    """
+    Quantize `data` to FP8 with a scale computed from the data itself.
+
+    Args:
+        data: tensor to quantize.
+        single_scale: when true a single scale is derived from the global
+            absolute maximum; otherwise one scale is derived per slice
+            along the last dimension (kept as a trailing dimension of
+            size 1).
+
+    Returns:
+        `(data_fp8, scale)` with the scale converted to float32.
+    """
+    magnitude = torch.abs(data)
+    # The small epsilon keeps the scale strictly positive for all-zero input.
+    if single_scale:
+        scale = (magnitude.max() + 1e-8) / FP8_MAX
+    else:
+        scale = ((magnitude.max(dim=-1).values + 1e-8) / FP8_MAX).unsqueeze(-1)
+
+    # The op receives the reciprocal of the scale; only its first output
+    # is used.
+    quantized = torch.ops.hpu.cast_to_fp8_v2(
+        data, 1.0 / scale, False, False, torch.float8_e4m3fn
+    )[0]
+    return quantized, scale.float()
+
+
+def dequant_block_fp8_weight_naive(
+    weight,
+    weight_scale,
+    block_size,
+    dtype=torch.bfloat16,
+    original_M=None,
+    original_N=None,
+    do_unpad=False,
+):
+    """
+    Dequantize a block-quantized weight by multiplying every block with
+    its scale.
+
+    Args:
+        weight: 2D or 3D quantized weight.
+        weight_scale: per-block scales with the same number of dimensions
+            as `weight`; when `None` the weight is returned untouched.
+        block_size: pair `(block_size_m, block_size_n)`.
+        dtype: dtype of the dequantized weight.
+        original_M, original_N: sizes to slice back to when `do_unpad` is
+            set.
+        do_unpad: whether to remove padding from the result.
+
+    Raises:
+        ValueError: if `weight` is neither 2D nor 3D.
+    """
+    if weight_scale is None:
+        return weight
+    assert len(block_size) == 2
+
+    rank = len(weight.shape)
+    block_m, block_n = block_size
+
+    def scale_blocks(blocked_weight, blocked_scale):
+        # On Gaudi2 the cast to `dtype` is done on a CPU copy which is then
+        # moved back to the original device.
+        if is_hpu_gaudi2():
+            casted = blocked_weight.cpu().to(dtype).to(blocked_weight.device)
+        else:
+            casted = blocked_weight.to(dtype)
+        return casted * blocked_scale.to(dtype)
+
+    if rank == 2:
+        scale_m, scale_n = weight_scale.shape
+        # Split both dimensions into (number of blocks, block size) so that
+        # the scale broadcasts over every block.
+        blocked_scale = weight_scale.view(scale_m, 1, scale_n, 1)
+        blocked_weight = weight.view(scale_m, block_m, scale_n, block_n)
+        dequant_weight = scale_blocks(blocked_weight, blocked_scale).view(
+            scale_m * block_m, scale_n * block_n
+        )
+        keep_first_dim = False
+    elif rank == 3:
+        fd, scale_m, scale_n = weight_scale.shape
+        blocked_scale = weight_scale.view(fd, scale_m, 1, scale_n, 1)
+        blocked_weight = weight.view(fd, scale_m, block_m, scale_n, block_n)
+        dequant_weight = scale_blocks(blocked_weight, blocked_scale).view(
+            fd, scale_m * block_m, scale_n * block_n
+        )
+        keep_first_dim = True
+    else:
+        raise ValueError("Only support original weight shape is either 2 or 3")
+
+    if not do_unpad:
+        return dequant_weight
+    return unpad_weight(
+        dequant_weight, original_M, original_N, keep_first_dim=keep_first_dim
+    )
+
+
+def apply_block_fp8_linear_hpu_dynamic(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    Apply an FP8 linear layer with dynamically quantized inputs.
+
+    The input is flattened to 2D, quantized with `dynamic_quant` and
+    multiplied with `weight` through the HPU FP8 GEMM op. The result is
+    converted to the dtype of `input` and reshaped to the leading
+    dimensions of `input` followed by `weight.shape[0]`.
+
+    NOTE(review): `input_scale` is accepted but not used in this function.
+    """
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    x_fp8, x_scale = dynamic_quant(input_2d)
+
+    # NOTE(review): the positional flags presumably select the transposition
+    # of each operand (weight transposed, input not) -- confirm against the
+    # op definition.
+    output = torch.ops.hpu.fp8_gemm_v2(
+        x_fp8,
+        False,
+        weight,
+        True,
+        None,
+        torch.bfloat16,
+        x_scale,
+        weight_scale,
+        None,
+        False,
+    )
+    if bias is not None:
+        output = output + bias
+    return output.to(dtype=input.dtype).view(*output_shape)
+
+
+def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
+    """
+    Return an FP8 linear `Module` that is compatible with the current system.
+    """
+    # `Fp8Linear` is always returned here; `force_w8a16` is accepted but
+    # does not influence the choice.
+    return Fp8Linear
+
+
+def normalize_e4m3fn_to_native_float8(
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    """
+    Return `weight`, `weight_scale` and `input_scale` unchanged.
+
+    No conversion is performed here: the arguments are passed through
+    as-is.
+    """
+    return weight, weight_scale, input_scale
+
+
+def per_tensor_dequantize(
+    tensor: torch.Tensor,
+    inv_scale: Union[float, torch.Tensor],
+    dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """
+    Dequantize `tensor` by casting it and multiplying it with `inv_scale`.
+
+    The result lives on the device of `tensor`.
+
+    NOTE(review): the `dtype` argument is overwritten below, so the cast
+    always uses `torch.bfloat16` regardless of what the caller passes --
+    confirm that this is intended.
+    """
+    device = tensor.device
+    dtype = torch.bfloat16
+    if is_hpu_gaudi2():
+        # dequant on cpu to avoid nan on gaudi2
+        tensor = tensor.to("cpu")
+
+    # Cast first (on CPU for Gaudi2), then move back to the original device.
+    fake_qweight = tensor.to(dtype).to(device)
+    dq_weight = fake_qweight * inv_scale
+    return dq_weight
+
+
+def requantize_with_max_scale(
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    logical_widths: List[int],
+    dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Requantize consecutive row ranges of `weight` with one shared scale.
+
+    Each range is dequantized with its own rows of `weight_scale` and
+    quantized again with the maximum of `weight_scale` (multiplied by the
+    Gaudi2 scale factor on Gaudi2).
+
+    Args:
+        weight: quantized weight; it is updated in place.
+        weight_scale: scales, indexed by the same rows as `weight`.
+        logical_widths: number of rows of each consecutive range.
+        dtype: forwarded to `per_tensor_dequantize`.
+
+    Returns:
+        `(weight, scale)` where `scale` is the scale returned by the last
+        `fp8_quantize` call, or the shared maximum scale when
+        `logical_widths` is empty.
+    """
+    # Max scale to be used for requantization.
+    max_w_scale = weight_scale.max()
+
+    if is_hpu_gaudi2():
+        max_w_scale = max_w_scale * get_hpu_gaudi2_scale_factor()
+
+    # Default for an empty `logical_widths`, so the result is always bound.
+    max_w_scale_normalized = max_w_scale
+
+    start = 0
+    for logical_width in logical_widths:
+        end = start + logical_width
+        weight_dq = per_tensor_dequantize(
+            weight[start:end, :], weight_scale[start:end, :], dtype
+        )
+        weight[start:end, :], max_w_scale_normalized = fp8_quantize(
+            weight_dq, max_w_scale
+        )
+        start = end
+
+    return weight, max_w_scale_normalized
+
+
+def fp8_quantize(
+    weight: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+    scale_upper_bound: Optional[torch.Tensor] = None,
+    qdtype: torch.dtype = torch.float8_e4m3fn,
+    scalar: bool = False,
+):
+    """
+    Quantize `weight` to FP8 and return `(quantized_weight, scale)`.
+
+    The returned scale is a reciprocal: multiplying the quantized tensor
+    with it unscales the tensor. A scale passed through `scale` must be a
+    reciprocal as well, which allows scales from an FP8 checkpoint to be
+    used without modification.
+
+    The weight is flattened to 2D for quantization and reshaped to its
+    original shape afterwards. `qdtype` is not used by this function.
+    """
+    original_shape = weight.shape
+    flattened = weight.reshape(-1, original_shape[-1])
+
+    quantized, scale = scaled_fp8_quant(
+        flattened,
+        scale=scale,
+        scale_ub=scale_upper_bound,
+        # TODO: don't do this when we have to use the Torch kernel.
+        use_per_token_if_dynamic=not scalar,
+    )
+
+    return quantized.reshape(original_shape), scale
+
+
+class HybridFP8UnquantLoader(WeightsLoader):
+    """
+    Weight loader that loads FP8 and unquantized Torch tensors.
+
+    Every `get_*` method inspects the dtype of the loaded weight:
+    `float8_e4m3fn` weights are returned as `Fp8Weight` with their scales,
+    other weights are returned as scale-less `Fp8Weight` when `to_fp8` is
+    set and as `UnquantizedWeight` otherwise.
+    """
+
+    def __init__(
+        self,
+        activation_scale_ub: Optional[float],
+        to_fp8: bool,
+        weight_block_size: Optional[List[int]] = None,
+    ):
+        """
+        Args:
+            activation_scale_ub: upper bound forwarded to every `Fp8Weight`
+                created from an FP8 checkpoint.
+            to_fp8: whether non-FP8 weights are wrapped in `Fp8Weight`.
+            weight_block_size: block size of block-quantized checkpoints,
+                or `None` when the checkpoint is not block-quantized.
+        """
+        self.activation_scale_ub = activation_scale_ub
+        self.to_fp8 = to_fp8
+        self.weight_block_size = weight_block_size
+
+    def get_weights(self, weights: "Weights", prefix: str):
+        """Load the full tensor `{prefix}.weight` and its optional scales."""
+        w = weights.get_tensor(f"{prefix}.weight")
+
+        if w.dtype == torch.float8_e4m3fn:
+            if self.weight_block_size is not None:
+                # Block-quantized checkpoint: the scale is stored under
+                # `weight_scale_inv` and passed on without requantization.
+                scale = weights.get_tensor(f"{prefix}.weight_scale_inv")
+                return Fp8Weight(
+                    weight=w,
+                    weight_scale=scale,
+                    activation_scale_ub=self.activation_scale_ub,
+                    dtype=weights.dtype,
+                    weight_block_size=self.weight_block_size,
+                )
+            # FP8 branch
+            scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
+            scale = scale.reshape(-1).expand(w.shape[0])
+            logical_widths = [w.shape[0]]
+            w, scale = requantize_with_max_scale(
+                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
+            )
+
+            input_scale = None
+            if weights.has_tensor(f"{prefix}.input_scale"):
+                input_scale = (
+                    weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
+                    .reshape(-1)
+                    .max()
+                )
+
+            return Fp8Weight(
+                weight=w,
+                weight_scale=scale,
+                input_scale=input_scale,
+                activation_scale_ub=self.activation_scale_ub,
+                dtype=weights.dtype,
+            )
+        if self.to_fp8:
+            # Non-FP8 checkpoint: no scale is attached to the weight.
+            return Fp8Weight(weight=w, dtype=weights.dtype)
+
+        return UnquantizedWeight(w)
+
+    def get_weights_col_packed(
+        self,
+        weights: Weights,
+        prefix: str,
+        block_sizes: Union[int, List[int]],
+    ):
+        """
+        Load a packed weight `{prefix}.weight` sharded along dimension 0.
+
+        NOTE(review): unlike the other methods there is no
+        `weight_block_size` branch here -- confirm that block-quantized
+        checkpoints never reach this method.
+        """
+        w = weights.get_packed_sharded(
+            f"{prefix}.weight", dim=0, block_sizes=block_sizes
+        )
+
+        if w.dtype == torch.float8_e4m3fn:
+            # FP8 branch
+            scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
+
+            if scale.numel() > 1:
+                # Non-scalar scale: reload it sharded like the weight.
+                scale = weights.get_packed_sharded(
+                    f"{prefix}.weight_scale",
+                    dim=0,
+                    block_sizes=block_sizes,
+                    to_dtype=False,
+                )
+            scale = scale.reshape(-1).expand(w.shape[0])
+            logical_widths = [w.shape[0]]
+            w, scale = requantize_with_max_scale(
+                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
+            )
+
+            input_scale = None
+            if weights.has_tensor(f"{prefix}.input_scale"):
+                input_scale = weights.get_tensor(
+                    f"{prefix}.input_scale", to_dtype=False
+                )
+                if input_scale.numel() > 1:
+                    input_scale = weights.get_packed_sharded(
+                        f"{prefix}.input_scale",
+                        dim=0,
+                        block_sizes=block_sizes,
+                        to_dtype=False,
+                    )
+                input_scale = input_scale.reshape(-1).max()
+
+            return Fp8Weight(
+                weight=w,
+                weight_scale=scale,
+                input_scale=input_scale,
+                activation_scale_ub=self.activation_scale_ub,
+                dtype=weights.dtype,
+            )
+        if self.to_fp8:
+            return Fp8Weight(weight=w, dtype=weights.dtype)
+
+        return UnquantizedWeight(w)
+
+    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
+        """
+        Load the weights of several prefixes, each sharded along dimension
+        0, and concatenate them along `dim`.
+        """
+        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
+        w = [
+            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
+        ]
+        # Shapes of the individual shards, recorded before concatenation.
+        shapes = [x.shape for x in w]
+
+        # Concat then send to the device
+        w = torch.cat(w, dim=dim).to(weights.device)
+
+        # FP8 branch
+        if w.dtype == torch.float8_e4m3fn:
+            if self.weight_block_size is not None:
+                # Block-quantized checkpoint: concatenate the sharded block
+                # scales the same way as the weights.
+                scale = [
+                    weights.get_sharded(f"{p}.weight_scale_inv", dim=0, to_device=False)
+                    for p in prefixes
+                ]
+                scale = torch.cat(scale, dim=dim)
+                scale = scale.to(weights.device)
+                return Fp8Weight(
+                    weight=w,
+                    weight_scale=scale,
+                    activation_scale_ub=self.activation_scale_ub,
+                    dtype=weights.dtype,
+                    weight_block_size=self.weight_block_size,
+                )
+
+            scale = [
+                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
+                for p, shape in zip(prefixes, shapes)
+            ]
+            scale = torch.cat(scale, dim=0).reshape(-1)
+
+            # One logical width per prefix: the first dimension of its shard.
+            logical_widths = [x[0] for x in shapes]
+            w, scale = requantize_with_max_scale(
+                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
+            )
+
+            input_scale = [
+                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
+                for p, shape in zip(prefixes, shapes)
+                if weights.has_tensor(f"{p}.input_scale")
+            ]
+            # Either every prefix has an input scale or none has.
+            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
+            input_scale = (
+                torch.cat(input_scale, dim=0).reshape(-1).max()
+                if len(input_scale) != 0
+                else None
+            )
+
+            return Fp8Weight(
+                weight=w,
+                weight_scale=scale,
+                input_scale=input_scale,
+                activation_scale_ub=self.activation_scale_ub,
+                dtype=weights.dtype,
+            )
+        if self.to_fp8:
+            return Fp8Weight(weight=w, dtype=weights.dtype)
+
+        return UnquantizedWeight(w)
+
+    def get_multi_weights(self, weights: "Weights", prefixes: List[str], dim: int):
+        """
+        Load the full (unsharded) weights of several prefixes and
+        concatenate them along `dim`.
+        """
+        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
+        w = [weights.get_tensor(f"{p}.weight", to_device=False) for p in prefixes]
+        # Shapes of the individual tensors, recorded before concatenation.
+        shapes = [x.shape for x in w]
+
+        # Concat then send to the device
+        w = torch.cat(w, dim=dim).to(weights.device)
+
+        # FP8 branch
+        if w.dtype == torch.float8_e4m3fn:
+            if self.weight_block_size is not None:
+                # Block-quantized checkpoint: concatenate the block scales
+                # the same way as the weights.
+                scale = [
+                    weights.get_tensor(f"{p}.weight_scale_inv", to_device=False)
+                    for p in prefixes
+                ]
+                scale = torch.cat(scale, dim=dim)
+                scale = scale.to(weights.device)
+                return Fp8Weight(
+                    weight=w,
+                    weight_scale=scale,
+                    activation_scale_ub=self.activation_scale_ub,
+                    dtype=weights.dtype,
+                    weight_block_size=self.weight_block_size,
+                )
+
+            scale = [
+                weights.get_tensor(f"{p}.weight_scale", to_dtype=False)
+                .reshape(-1)
+                .expand(shape[0])
+                for p, shape in zip(prefixes, shapes)
+            ]
+            scale = torch.cat(scale, dim=0).reshape(-1)
+
+            logical_widths = [x[0] for x in shapes]
+            w, scale = requantize_with_max_scale(
+                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
+            )
+
+            input_scale = [
+                weights.get_tensor(f"{p}.input_scale", to_dtype=False).reshape(-1)
+                for p in prefixes
+                if weights.has_tensor(f"{p}.input_scale")
+            ]
+            # Either every prefix has an input scale or none has.
+            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
+            input_scale = (
+                torch.cat(input_scale, dim=0).reshape(-1).max()
+                if len(input_scale) != 0
+                else None
+            )
+
+            return Fp8Weight(
+                weight=w,
+                weight_scale=scale,
+                input_scale=input_scale,
+                activation_scale_ub=self.activation_scale_ub,
+                dtype=weights.dtype,
+            )
+        if self.to_fp8:
+            return Fp8Weight(weight=w, dtype=weights.dtype)
+
+        return UnquantizedWeight(w)
+
+    def get_weights_row(self, weights: "Weights", prefix: str):
+        """
+        Load `{prefix}.weight` sharded along dimension 1 together with its
+        optional scales.
+        """
+        w = weights.get_sharded(f"{prefix}.weight", dim=1)
+        # FP8 branch
+        if w.dtype == torch.float8_e4m3fn:
+            if self.weight_block_size is not None:
+                # XXX: Yes the weights is named scale_inv, but corresponds to scale it seems.
+                scale = weights.get_sharded(f"{prefix}.weight_scale_inv", dim=1)
+
+                return Fp8Weight(
+                    weight=w,
+                    weight_scale=scale,
+                    activation_scale_ub=self.activation_scale_ub,
+                    dtype=weights.dtype,
+                    weight_block_size=self.weight_block_size,
+                )
+
+            # The scale is loaded in full; only the weight is sharded.
+            scale = (
+                weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
+                .reshape(-1)
+                .expand(w.shape[0])
+            )
+            logical_widths = [w.shape[0]]
+            w, scale = requantize_with_max_scale(
+                w, scale.unsqueeze(-1).to(weights.device), logical_widths, weights.dtype
+            )
+
+            input_scale = None
+            if weights.has_tensor(f"{prefix}.input_scale"):
+                input_scale = (
+                    weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
+                    .reshape(-1)
+                    .max()
+                )
+
+            return Fp8Weight(
+                weight=w,
+                weight_scale=scale,
+                input_scale=input_scale,
+                activation_scale_ub=self.activation_scale_ub,
+                dtype=weights.dtype,
+            )
+        if self.to_fp8:
+            return Fp8Weight(weight=w, dtype=weights.dtype)
+
+        return UnquantizedWeight(w)
+
+
+@dataclass
+class Fp8Weight(Weight):
+    """
+    Weight that is turned into an FP8 linear layer.
+
+    Without `weight_scale` the weight is treated as unquantized and is
+    quantized when the linear layer is created.
+    """
+
+    weight: torch.Tensor
+    dtype: torch.dtype
+    weight_scale: Optional[torch.Tensor] = None
+    input_scale: Optional[torch.Tensor] = None
+    activation_scale_ub: Optional[float] = None
+    force_w8a16: bool = False
+    weight_block_size: Optional[List[int]] = None
+
+    def get_linear(self, bias: torch.Tensor):
+        """Create the FP8 linear layer for this weight and `bias`."""
+        linear_cls = get_fp8_linear(force_w8a16=self.force_w8a16)
+
+        if self.weight_scale is not None:
+            # The fbgemm kernels need contiguous memory but do not check
+            # for it; scales expanded from scalars can be non-contiguous.
+            self.weight_scale = self.weight_scale.contiguous()
+            return linear_cls.from_fp8(
+                weight=self.weight,
+                scale=self.weight_scale,
+                dtype=self.dtype,
+                bias=bias,
+                input_scale=self.input_scale,
+                scale_upper_bound=self.activation_scale_ub,
+                weight_block_size=self.weight_block_size,
+            )
+
+        return linear_cls.from_unquant(self.weight, bias, self.dtype)
+
+
+class Fp8Linear(torch.nn.Module):
+    """
+    Linear layer whose weight is stored in FP8 and multiplied with HPU ops.
+
+    `forward` uses `apply_block_fp8_linear_hpu_dynamic` when a weight block
+    size is set or no input scale is available; otherwise it casts the input
+    with the stored input scale and calls `torch.ops.hpu.fp8_gemm_v2`.
+    """
+
+    _device_identity_cache = {}
+
+    def __init__(
+        self,
+        qweight: torch.Tensor,
+        scale: torch.Tensor,
+        dtype: torch.dtype,
+        bias: Optional[torch.Tensor] = None,
+        input_scale: Optional[torch.Tensor] = None,
+        scale_upper_bound: Optional[float] = None,
+        weight_block_size: Optional[List[int]] = None,
+    ) -> None:
+        super().__init__()
+
+        self.dtype = dtype
+        self.qweight = qweight
+        # Both scales are kept in float32.
+        self.scale = scale.float()
+        if input_scale is None:
+            self.input_scale = None
+        else:
+            self.input_scale = input_scale.float()
+        self.weight_block_size = weight_block_size
+        self.scale_upper_bound = scale_upper_bound
+
+        self.bias = bias
+
+    @classmethod
+    def from_unquant(cls, weight, bias, dtype):
+        """Quantize `weight` with `fp8_quantize(..., scalar=True)` and wrap it."""
+        quantized, weight_scale = fp8_quantize(weight, scalar=True)
+        return cls(
+            qweight=quantized,
+            scale=weight_scale,
+            dtype=dtype,
+            bias=bias,
+            input_scale=None,
+            scale_upper_bound=None,
+        )
+
+    @classmethod
+    def from_fp8(
+        cls,
+        weight: torch.Tensor,
+        scale: torch.Tensor,
+        dtype: torch.dtype,
+        bias: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> "Fp8Linear":
+        """
+        Wrap an already-quantized weight.
+
+        Optional keyword arguments read from `kwargs`: `input_scale`,
+        `scale_upper_bound` and `weight_block_size`. With a block size the
+        weight is padded, dequantized block-wise and passed through
+        `dynamic_quant` before being stored.
+        """
+        input_scale = kwargs.get("input_scale")
+        scale_upper_bound = kwargs.get("scale_upper_bound")
+        weight_block_size = kwargs.get("weight_block_size")
+
+        if weight_block_size is not None:
+            padded, orig_M, orig_N = pad_block_fp8_weight_naive(
+                weight, scale, weight_block_size
+            )
+            dequantized = dequant_block_fp8_weight_naive(
+                padded,
+                scale,
+                weight_block_size,
+                original_M=orig_M,
+                original_N=orig_N,
+                do_unpad=True,
+            )
+            weight, scale = dynamic_quant(dequantized)
+            scale = scale.squeeze(-1)
+
+        return cls(
+            qweight=weight,
+            scale=scale,
+            input_scale=input_scale,
+            scale_upper_bound=scale_upper_bound,
+            bias=bias,
+            dtype=dtype,
+            weight_block_size=weight_block_size,
+        )
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # The static path needs a stored input scale and no block quantization.
+        has_static_input_scale = (
+            self.weight_block_size is None and self.input_scale is not None
+        )
+        if not has_static_input_scale:
+            return apply_block_fp8_linear_hpu_dynamic(
+                input, self.qweight, self.scale, self.input_scale, self.bias
+            )
+
+        cast_result = torch.ops.hpu.cast_to_fp8_v2(
+            input, 1.0 / self.input_scale, False, False, torch.float8_e4m3fn
+        )
+        x_fp8 = cast_result[0]
+        return torch.ops.hpu.fp8_gemm_v2(
+            A=x_fp8,
+            trans_A=False,
+            B=self.qweight,
+            trans_B=True,
+            D=None,
+            out_dtype=input.dtype,
+            A_scale_inv=self.input_scale,
+            B_scale_inv=self.scale,
+            bias=self.bias,
+            accumulate=False,
+        )
+
+
+def _load_scalar_or_matrix_scale(weights: Weights, prefix: str, shape: torch.Size):
+    """
+    Load the scale tensor named `prefix` and return it flattened and expanded
+    to `shape[0]` entries.
+
+    A scale with more than one element is re-read sharded along dim 0; a
+    single-element scale is used as loaded.
+    """
+    scale = weights.get_tensor(prefix, to_dtype=False)
+
+    is_single_value = scale.numel() <= 1
+    if not is_single_value:
+        scale = weights.get_sharded(prefix, dim=0, to_dtype=False)
+
+    flat_scale = scale.reshape(-1)
+    return flat_scale.expand(shape[0])
--- a/backends/gaudi/server/text_generation_server/layers/gptq/init.py
+++ b/backends/gaudi/server/text_generation_server/layers/gptq/init.py
@ -0,0 +1,438 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import torch
+from loguru import logger
+from text_generation_server.utils.log import log_once
+from text_generation_server.utils.weights import (
+    Weight,
+    Weights,
+    WeightsLoader,
+    DefaultWeightsLoader,
+)
+
+
+from .hpu import QuantLinear
+
+
+@dataclass
+class GPTQWeight(Weight):
+    """
+    Tensors and settings describing one GPTQ- or AWQ-quantized linear layer.
+    """
+
+    qweight: torch.Tensor
+    qzeros: torch.Tensor
+    scales: torch.Tensor
+    g_idx: Optional[torch.Tensor]
+    bits: int
+    groupsize: int
+    use_awq_kernel: bool
+    use_exllama: bool
+
+    def __post_init__(self):
+        # float32 scales are converted to half precision.
+        scales_are_fp32 = self.scales.dtype == torch.float
+        if scales_are_fp32:
+            self.scales = self.scales.half()
+
+    @property
+    def device(self) -> torch.device:
+        """Device on which `qweight` lives."""
+        return self.qweight.device
+
+    def get_linear(self, bias: torch.Tensor):
+        """
+        Build the linear layer for this weight: `WQLinear` when
+        `use_awq_kernel` is set, `QuantLinear` otherwise.
+
+        Raises:
+            NotImplementedError: if the AWQ layer cannot be imported.
+        """
+        if not self.use_awq_kernel:
+            return QuantLinear(
+                self.qweight,
+                self.qzeros,
+                self.scales,
+                self.g_idx,
+                bias,
+                self.bits,
+                self.groupsize,
+            )
+
+        try:
+            from text_generation_server.layers.awq.quantize import WQLinear
+
+            return WQLinear(
+                w_bit=self.bits,
+                group_size=self.groupsize,
+                qweight=self.qweight,
+                qzeros=self.qzeros,
+                scales=self.scales,
+                bias=bias,
+            )
+        except ImportError:
+            raise NotImplementedError(
+                "You do not seem to have awq installed, either install it (cd server &&  make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
+            )
+
+
+class GPTQWeightsLoader(WeightsLoader):
+    """
+    Loader for GPTQ- and AWQ-quantized weights.
+    """
+
+    def __init__(
+        self,
+        *,
+        bits: int,
+        desc_act: bool,
+        groupsize: int,
+        quant_method: str,
+        quantize: str,
+        sym: bool,
+        modules_to_not_convert: List[str],
+    ):
+        """
+        Store the quantization settings (keyword-only).
+
+        `bits`, `groupsize`, `desc_act`, `sym` and `quant_method` may later be
+        overwritten by `_get_gptq_params` from tensors found in the checkpoint.
+        """
+        # Settings that `_get_gptq_params` can override.
+        self.bits = bits
+        self.groupsize = groupsize
+        self.desc_act = desc_act
+        self.sym = sym
+        self.quant_method = quant_method
+
+        # Settings that stay as given.
+        self.quantize = quantize
+        self.modules_to_not_convert = modules_to_not_convert
+
+    def is_layer_skipped_quantization(
+        self, prefix: str, modules_to_not_convert: List[str]
+    ):
+        """Return True when any entry of `modules_to_not_convert` is a substring of `prefix`."""
+        for module_name in modules_to_not_convert:
+            if module_name in prefix:
+                return True
+        return False
+
+    def get_weights(self, weights: Weights, prefix: str):
+        """
+        Load the unsharded GPTQ/AWQ tensors stored under `prefix`.
+
+        Layers matched by `modules_to_not_convert` are delegated to
+        `DefaultWeightsLoader`. Otherwise `qweight`, `qzeros`, `scales` and,
+        for native GPTQ checkpoints, `g_idx` are read with `get_tensor` and
+        returned as a `GPTQWeight`. AWQ checkpoints loaded with
+        `quantize == "gptq"` are converted to the GPTQ packing format.
+
+        Raises:
+            RuntimeError: if `{prefix}.qweight` cannot be loaded.
+        """
+        self._get_gptq_params(weights)
+
+        use_exllama = True
+        if self.bits != 4:
+            use_exllama = False
+
+        if self.desc_act:
+            log_once(logger.warning, "Disabling exllama because desc_act=True")
+            use_exllama = False
+
+        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
+            return DefaultWeightsLoader.get_weights(weights, prefix)
+
+        try:
+            qweight = weights.get_tensor(f"{prefix}.qweight")
+        except RuntimeError:
+            raise RuntimeError(
+                "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
+            )
+
+        if self.quantize == "gptq" and self.quant_method == "gptq":
+            g_idx = weights.get_tensor(f"{prefix}.g_idx")
+        else:
+            g_idx = None
+
+        qzeros = weights.get_tensor(f"{prefix}.qzeros")
+        scales = weights.get_tensor(f"{prefix}.scales")
+
+        if use_exllama and g_idx is not None:
+            # Rebase the group indices so that they start at zero.
+            g_idx = g_idx - g_idx[0]
+
+        if self.quantize == "gptq" and self.quant_method == "awq":
+            log_once(
+                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
+            )
+            from text_generation_server.layers.awq.conversion_utils import (
+                fast_awq_to_gptq,
+            )
+
+            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
+            if use_exllama:
+                g_idx = None
+            else:
+                # One group index per unpacked row: row // groupsize.
+                g_idx = (
+                    torch.arange(
+                        qweight.shape[0] * (32 // self.bits),
+                        device=qweight.device,
+                    )
+                    // self.groupsize
+                ).to(dtype=torch.int32)
+
+        return GPTQWeight(
+            qweight=qweight,
+            qzeros=qzeros,
+            scales=scales,
+            g_idx=g_idx,
+            bits=self.bits,
+            groupsize=self.groupsize,
+            # `GPTQWeight` declares `use_awq_kernel` without a default; leaving
+            # it out made this constructor call raise TypeError.
+            use_awq_kernel=self.quantize == "awq",
+            use_exllama=use_exllama,
+        )
+
+    def get_weights_col_packed(
+        self,
+        weights: Weights,
+        prefix: str,
+        block_sizes: Union[int, List[int]],
+    ):
+        """
+        Load a packed (fused) column-sharded quantized weight.
+
+        `qweight`, `scales` and `qzeros` are read with `get_packed_sharded`
+        along dim 1 using `block_sizes`. Layers matched by
+        `modules_to_not_convert` are delegated to `DefaultWeightsLoader`.
+
+        Raises:
+            RuntimeError: if `{prefix}.qweight` cannot be loaded.
+        """
+        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
+            return DefaultWeightsLoader.get_weights_col_packed(
+                weights, prefix, block_sizes
+            )
+
+        def load_packed(tensor_name):
+            return weights.get_packed_sharded(
+                f"{prefix}.{tensor_name}", dim=1, block_sizes=block_sizes
+            )
+
+        try:
+            qweight = load_packed("qweight")
+        except RuntimeError:
+            raise RuntimeError(
+                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
+            )
+        scales = load_packed("scales").to(dtype=weights.dtype)
+
+        # May overwrite bits/groupsize/quant_method from checkpoint tensors, so
+        # it has to run before the branches below read them.
+        self._get_gptq_params(weights)
+
+        qzeros = load_packed("qzeros")
+
+        g_idx = None
+        if self.quantize == "gptq":
+            if self.quant_method == "gptq":
+                g_idx = weights.get_tensor(f"{prefix}.g_idx")
+            elif self.quant_method == "awq":
+                log_once(
+                    logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
+                )
+                from text_generation_server.layers.awq.conversion_utils import (
+                    fast_awq_to_gptq,
+                )
+
+                qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
+                # One group index per unpacked row: row // groupsize.
+                unpacked_rows = qweight.shape[0] * (32 // self.bits)
+                row_ids = torch.arange(unpacked_rows, device=qweight.device)
+                g_idx = (row_ids // self.groupsize).to(dtype=torch.int32)
+
+        return GPTQWeight(
+            qweight=qweight,
+            qzeros=qzeros,
+            scales=scales,
+            g_idx=g_idx,
+            bits=self.bits,
+            groupsize=self.groupsize,
+            use_awq_kernel=self.quantize == "awq",
+            use_exllama=False,
+        )
+
+    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
+        """
+        Load several column-sharded quantized layers and concatenate them.
+
+        `qweight`, `scales` and `qzeros` of every prefix are read sharded along
+        dim 1 and concatenated along dim 1. Only `prefixes[0]` is checked
+        against `modules_to_not_convert`; when it matches, loading is delegated
+        to `DefaultWeightsLoader`. `dim` is only forwarded to that fallback.
+
+        Raises:
+            RuntimeError: if a `qweight` tensor cannot be loaded.
+        """
+        if self.is_layer_skipped_quantization(prefixes[0], self.modules_to_not_convert):
+            return DefaultWeightsLoader.get_multi_weights_col(weights, prefixes, dim)
+        try:
+            qweight = torch.cat(
+                [weights.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
+            )
+        except RuntimeError:
+            raise RuntimeError(
+                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
+            )
+
+        scales = torch.cat(
+            [weights.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
+        )
+
+        # May overwrite bits/groupsize/desc_act/quant_method from checkpoint
+        # tensors; everything below reads the updated values.
+        self._get_gptq_params(weights)
+
+        qzeros = torch.cat(
+            [weights.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
+        )
+
+        use_exllama = self.bits == 4 and self.quantize == "gptq" and not self.desc_act
+
+        if self.quantize == "gptq" and self.quant_method == "gptq":
+            # All fused layers must share the same g_idx; the first one is used.
+            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
+            for w2 in w[1:]:
+                torch.testing.assert_close(w2, w[0])
+            g_idx = w[0]
+        elif self.quantize == "gptq" and self.quant_method == "awq":
+            log_once(
+                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
+            )
+            from text_generation_server.layers.awq.conversion_utils import (
+                fast_awq_to_gptq,
+            )
+
+            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
+            if use_exllama:
+                g_idx = None
+            else:
+                # One group index per unpacked row: row // groupsize.
+                g_idx = (
+                    torch.arange(
+                        qweight.shape[0] * (32 // self.bits),
+                        device=qweight.device,
+                    )
+                    // self.groupsize
+                ).to(dtype=torch.int32)
+        else:
+            g_idx = None
+
+        return GPTQWeight(
+            qweight=qweight,
+            qzeros=qzeros,
+            scales=scales,
+            g_idx=g_idx,
+            bits=self.bits,
+            groupsize=self.groupsize,
+            use_awq_kernel=self.quantize == "awq",
+            use_exllama=use_exllama,
+        )
+
+    def get_multi_weights(self, weights: Weights, prefixes: List[str], dim: int):
+        """
+        Load several unsharded quantized layers and concatenate them.
+
+        `qweight`, `scales` and `qzeros` of every prefix are read with
+        `get_tensor` and concatenated along dim 1. Only `prefixes[0]` is
+        checked against `modules_to_not_convert`; when it matches, loading is
+        delegated to `DefaultWeightsLoader`. `dim` is only forwarded to that
+        fallback.
+
+        Raises:
+            RuntimeError: if a `qweight` tensor cannot be loaded.
+        """
+        if self.is_layer_skipped_quantization(prefixes[0], self.modules_to_not_convert):
+            return DefaultWeightsLoader.get_multi_weights(weights, prefixes, dim)
+        try:
+            qweight = torch.cat(
+                [weights.get_tensor(f"{p}.qweight") for p in prefixes], dim=1
+            )
+        except RuntimeError:
+            raise RuntimeError(
+                f"Cannot load `{self.quantize}` weight, make sure the model is already quantized"
+            )
+
+        scales = torch.cat([weights.get_tensor(f"{p}.scales") for p in prefixes], dim=1)
+
+        # May overwrite bits/groupsize/desc_act/quant_method from checkpoint
+        # tensors; everything below reads the updated values.
+        self._get_gptq_params(weights)
+
+        qzeros = torch.cat([weights.get_tensor(f"{p}.qzeros") for p in prefixes], dim=1)
+
+        use_exllama = self.bits == 4 and self.quantize == "gptq" and not self.desc_act
+
+        if self.quantize == "gptq" and self.quant_method == "gptq":
+            # All fused layers must share the same g_idx; the first one is used.
+            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
+            for w2 in w[1:]:
+                torch.testing.assert_close(w2, w[0])
+            g_idx = w[0]
+        elif self.quantize == "gptq" and self.quant_method == "awq":
+            log_once(
+                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
+            )
+            from text_generation_server.layers.awq.conversion_utils import (
+                fast_awq_to_gptq,
+            )
+
+            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
+            if use_exllama:
+                g_idx = None
+            else:
+                # g_idx maps every unpacked row to its quantization group, so
+                # the row index has to be divided by the group size (as in the
+                # other loader methods); without the division it held raw row
+                # indices.
+                g_idx = (
+                    torch.arange(
+                        qweight.shape[0] * (32 // self.bits),
+                        device=qweight.device,
+                    )
+                    // self.groupsize
+                ).to(dtype=torch.int32)
+        else:
+            g_idx = None
+
+        return GPTQWeight(
+            qweight=qweight,
+            qzeros=qzeros,
+            scales=scales,
+            g_idx=g_idx,
+            bits=self.bits,
+            groupsize=self.groupsize,
+            use_awq_kernel=self.quantize == "awq",
+            use_exllama=use_exllama,
+        )
+
+    def get_weights_row(self, weights: Weights, prefix: str):
+        """
+        Load a row-sharded quantized weight (`qweight` sharded along dim 0).
+
+        Layers matched by `modules_to_not_convert` are delegated to
+        `DefaultWeightsLoader`. `qzeros` and `scales` are sharded along dim 0
+        only when act-order is off and `groupsize != -1`; otherwise they are
+        loaded whole.
+
+        Raises:
+            RuntimeError: if `{prefix}.qweight` cannot be loaded.
+        """
+        self._get_gptq_params(weights)
+
+        use_exllama = True
+        # Local copy: it may be forced to True below without touching self.desc_act.
+        desc_act = self.desc_act
+        if self.bits != 4:
+            use_exllama = False
+
+        if self.is_layer_skipped_quantization(prefix, self.modules_to_not_convert):
+            return DefaultWeightsLoader.get_weights_row(weights, prefix)
+
+        if self.desc_act:
+            log_once(logger.warning, "Disabling exllama because desc_act=True")
+            use_exllama = False
+
+        try:
+            qweight = weights.get_sharded(f"{prefix}.qweight", dim=0)
+        except RuntimeError:
+            raise RuntimeError(
+                "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
+            )
+
+        if self.quantize == "gptq" and self.quant_method == "gptq":
+            g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
+        else:
+            g_idx = None
+
+        if weights.process_group.size() > 1:
+            if g_idx is not None:
+                # Treat the shard as act-ordered when its rebased g_idx is
+                # neither the sequential `i // groupsize` pattern nor all zeros.
+                if (
+                    not torch.equal(
+                        # Remove g_idx[0] to adapt the check with TP>1.
+                        (g_idx - g_idx[0]).cpu(),
+                        torch.tensor(
+                            [i // self.groupsize for i in range(g_idx.shape[0])],
+                            dtype=torch.int32,
+                        ),
+                    )
+                    and not (g_idx == 0).all()
+                ):
+                    # Exllama implementation does not support row tensor parallelism with act-order, as
+                    # it would require to reorder input activations that are split unto several GPUs
+                    use_exllama = False
+                    desc_act = True
+
+        from text_generation_server.layers.gptq import (
+            GPTQWeight,
+        )
+
+        if not desc_act and self.groupsize != -1:
+            qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0)
+            scales = weights.get_sharded(f"{prefix}.scales", dim=0)
+            if g_idx is not None:
+                # qzeros, scales sharded, and g_idx must be adjusted accordingly
+                g_idx = g_idx - g_idx[0]
+        else:
+            # Act-order or a single group: every shard gets the full qzeros/scales.
+            qzeros = weights.get_tensor(f"{prefix}.qzeros")
+            scales = weights.get_tensor(f"{prefix}.scales")
+
+        if self.quantize == "gptq" and self.quant_method == "awq":
+            log_once(
+                logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
+            )
+            from text_generation_server.layers.awq.conversion_utils import (
+                fast_awq_to_gptq,
+            )
+
+            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
+            if use_exllama:
+                g_idx = None
+            else:
+                # One group index per unpacked row: row // groupsize.
+                g_idx = (
+                    torch.arange(
+                        qweight.shape[0] * (32 // self.bits),
+                        device=qweight.device,
+                    )
+                    // self.groupsize
+                ).to(dtype=torch.int32)
+
+        return GPTQWeight(
+            qweight=qweight,
+            qzeros=qzeros,
+            scales=scales,
+            g_idx=g_idx,
+            bits=self.bits,
+            groupsize=self.groupsize,
+            use_awq_kernel=self.quantize == "awq",
+            use_exllama=use_exllama,
+        )
+
+    def _get_gptq_params(self, weights: Weights):
+        """
+        Override the loader settings from `gptq_*` tensors in the checkpoint.
+
+        Nothing changes unless both `gptq_bits` and `gptq_groupsize` exist.
+        When they do, `bits` and `groupsize` are read from them, `desc_act`
+        is set to False and `quant_method` to "gptq".
+        """
+        has_settings = weights.has_tensor("gptq_bits") and weights.has_tensor(
+            "gptq_groupsize"
+        )
+        if not has_settings:
+            return
+
+        self.bits = weights.get_tensor("gptq_bits").item()
+        self.groupsize = weights.get_tensor("gptq_groupsize").item()
+        self.desc_act = False
+        # `server quantize` used asymmetric quantization unconditionally
+        # before the `gptq_sym` setting tensor was added.
+        if weights.has_tensor("gptq_sym"):
+            self.sym = weights.get_tensor("gptq_sym").item()
+        else:
+            self.sym = False
+        self.quant_method = "gptq"
--- a/backends/gaudi/server/text_generation_server/layers/gptq/hpu.py
+++ b/backends/gaudi/server/text_generation_server/layers/gptq/hpu.py
@ -0,0 +1,186 @@
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+
+# Resolve the HPU uint4 conversion op once at import time. If the lookup
+# fails, `convert_from_uint4` is bound to a stub that raises on first use, so
+# importing this module still succeeds.
+try:
+
+    convert_from_uint4 = torch.ops.hpu.convert_from_uint4
+except Exception as e:
+    # `e` is unbound once the except block ends; keep a module-level reference
+    # so the stub below can report the original failure.
+    hpu_import_exception = e
+
+    def error_raiser_hpu(*args, **kwargs):
+        raise ValueError(
+            f"Trying to use HPU, but could not import the HPU framework with the following error: {hpu_import_exception}"
+        )
+
+    convert_from_uint4 = error_raiser_hpu
+
+
+def pack_tensor(input, bits=4):
+    """
+    Pack `bits`-wide values along dim 1 into int32 words.
+
+    Each output column holds `32 // bits` consecutive input columns, with
+    input column `j` of a group placed at bit offset `bits * j`. The result
+    has `input.shape[1] // 32 * bits` columns and is created on the CPU.
+    """
+    values = input.to(torch.int32)
+    per_word = 32 // bits
+    packed = torch.zeros(
+        (values.shape[0], values.shape[1] // 32 * bits), dtype=torch.int32
+    )
+    for word in range(packed.shape[1]):
+        first = word * per_word
+        for offset in range(per_word):
+            packed[:, word] |= values[:, first + offset] << (bits * offset)
+    return packed.to(torch.int32)
+
+
+class QuantLinear(nn.Module):
+    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
+        """
+        Register the quantized tensors as buffers and repack them via
+        `_preprocessing`.
+
+        Raises:
+            NotImplementedError: if `bits` is not 4.
+        """
+        super().__init__()
+        named_buffers = (
+            ("qweight", qweight),
+            ("qzeros", qzeros),
+            ("scales", scales),
+            ("g_idx", g_idx),
+        )
+        for buffer_name, tensor in named_buffers:
+            self.register_buffer(buffer_name, tensor)
+        if bias is None:
+            self.bias = None
+        else:
+            self.register_buffer("bias", bias)
+        if bits not in [4]:
+            raise NotImplementedError("Only 4 bits are supported.")
+        self.bits = bits
+        self.maxq = 2**self.bits - 1
+        self.groupsize = groupsize
+
+        self.infeatures = qweight.shape[0] * 32 // bits
+        self.outfeatures = qweight.shape[1]
+        # Bit offset of every packed value inside an int32 word, shape (1, 32 // bits).
+        bit_offsets = list(range(0, 32, self.bits))
+        self.wf = torch.tensor(bit_offsets, dtype=torch.int32).unsqueeze(0)
+        self._preprocessing()
+
+    def unpack_zeros_from_cuda_old_format(self):
+        """
+        Unpack `self.qzeros` into one zero-point per column.
+
+        Every int32 word is split into `32 // bits` values, each value is
+        incremented by one and masked to `bits` bits, and the result is cast
+        to the dtype of `self.scales`.
+        """
+        per_word = 32 // self.bits
+        narrow_dtype = torch.int16 if self.bits == 8 else torch.int8
+        mask = (2**self.bits) - 1
+
+        words = torch.unsqueeze(self.qzeros, 2).expand(-1, -1, per_word)
+        zeros = torch.bitwise_right_shift(words, self.wf.unsqueeze(0)).to(narrow_dtype)
+
+        zeros = zeros + 1
+        # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
+        zeros = torch.bitwise_and(zeros, mask).to(self.scales.dtype)
+        return zeros.reshape(-1, zeros.shape[1] * zeros.shape[2])
+
+    def unpack_weight_from_cuda_old_format(self):
+        """
+        Unpack `self.qweight` into one value per row.
+
+        Every int32 word is split into `32 // bits` values masked to `bits`
+        bits; the values of a word become consecutive rows of the result.
+        """
+        per_word = 32 // self.bits
+        narrow_dtype = torch.int16 if self.bits == 8 else torch.int8
+        mask = (2**self.bits) - 1
+
+        words = torch.unsqueeze(self.qweight, 1).expand(-1, per_word, -1)
+        shifted = torch.bitwise_right_shift(words, self.wf.unsqueeze(-1))
+        weight = torch.bitwise_and(shifted.to(narrow_dtype), mask)
+        n_words, n_per_word, n_cols = weight.shape
+        return weight.reshape((n_words * n_per_word, n_cols))
+
+    def _preprocessing(self):
+        """
+        Repack `qweight` and `qzeros` through `pack_tensor`.
+
+        Both tensors are moved to the CPU, unpacked, repacked and moved back
+        to the device `qweight` was on. When a `g_idx` is present it must be
+        the trivial `i // groupsize` mapping.
+
+        Raises:
+            AssertionError: if `g_idx` is present and not trivial.
+        """
+        orig_device = self.qweight.device
+        self.qweight = self.qweight.cpu()
+        weight = self.unpack_weight_from_cuda_old_format()
+        new_qweight = pack_tensor(weight)
+        self.qweight = new_qweight.to(orig_device)
+        # TODO: Support group indexing and remove the check
+        # `g_idx` may be None (no explicit group index was supplied); there is
+        # then nothing to validate, and dereferencing `self.g_idx.device`
+        # would raise AttributeError.
+        if self.g_idx is not None:
+            columns = self.qweight.shape[0]
+            g_idx_trivial = [i // self.groupsize for i in range(columns)]
+            g_idx_trivial = torch.tensor(
+                g_idx_trivial, dtype=torch.int32, device=self.g_idx.device
+            )
+            assert torch.equal(
+                self.g_idx, g_idx_trivial
+            ), "Non-trivial tensor g_idx is not supported"
+        self.qzeros = self.qzeros.cpu()
+        zeros = self.unpack_zeros_from_cuda_old_format()
+        new_qzeros = pack_tensor(zeros)
+        self.qzeros = new_qzeros.to(orig_device)
+
+    @classmethod
+    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
+        """
+        Build a layer with all-zero tensors of the shapes implied by the
+        arguments.
+
+        `bias` is a truthiness flag: when truthy a zero fp16 bias of length
+        `outfeatures` is created, otherwise the layer has no bias.
+
+        Raises:
+            NotImplementedError: if `bits` is not 4.
+        """
+        if bits not in [4]:
+            raise NotImplementedError("Only 4 bits are supported.")
+
+        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
+        n_groups = math.ceil(infeatures / groupsize)
+        qzeros = torch.zeros((n_groups, outfeatures // 32 * bits), dtype=torch.int32)
+        scales = torch.zeros((n_groups, outfeatures), dtype=torch.float16)
+        group_of_row = [row // groupsize for row in range(infeatures)]
+        g_idx = torch.tensor(group_of_row, dtype=torch.int32)
+        bias = torch.zeros((outfeatures), dtype=torch.float16) if bias else None
+        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
+
+    def pack(self, linear, scales, zeros, g_idx=None):
+        """
+        Quantize `linear.weight` with the given `scales`/`zeros` and store the
+        packed int32 `qweight`/`qzeros` on this layer.
+
+        `g_idx`, when given, replaces the layer's current group index. The
+        bias of `linear`, if any, is copied in half precision.
+
+        Raises:
+            NotImplementedError: if `self.bits` is not 4.
+        """
+        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
+
+        scales = scales.t().contiguous()
+        zeros = zeros.t().contiguous()
+        scale_zeros = zeros * scales
+        self.scales = scales.clone().half()
+        if linear.bias is not None:
+            self.bias = linear.bias.clone().half()
+
+        # Integer weight of each input feature: round((w + zero * scale) / scale),
+        # using the scale/zero of the group that feature belongs to.
+        intweight = []
+        for idx in range(self.infeatures):
+            intweight.append(
+                torch.round(
+                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
+                    / self.scales[self.g_idx[idx]]
+                ).to(torch.int)[:, None]
+            )
+        intweight = torch.cat(intweight, dim=1)
+        intweight = intweight.t().contiguous()
+        intweight = intweight.numpy().astype(np.uint32)
+        qweight = np.zeros(
+            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
+        )
+        # Pack 32 // bits consecutive rows of intweight into each row of qweight.
+        i = 0
+        row = 0
+        while row < qweight.shape[0]:
+            if self.bits in [4]:
+                for j in range(i, i + (32 // self.bits)):
+                    qweight[row] |= intweight[j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                row += 1
+            else:
+                raise NotImplementedError("Only 4 bits are supported.")
+
+        qweight = qweight.astype(np.int32)
+        self.qweight = torch.from_numpy(qweight)
+
+        # Zero points are stored minus one (the unpacking code adds one back).
+        zeros -= 1
+        zeros = zeros.numpy().astype(np.uint32)
+        qzeros = np.zeros(
+            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
+        )
+        # Pack 32 // bits consecutive columns of zeros into each column of qzeros.
+        i = 0
+        col = 0
+        while col < qzeros.shape[1]:
+            if self.bits in [4]:
+                for j in range(i, i + (32 // self.bits)):
+                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                col += 1
+            else:
+                raise NotImplementedError("Only 4 bits are supported.")
+
+        qzeros = qzeros.astype(np.int32)
+        self.qzeros = torch.from_numpy(qzeros)
+
+    def forward(self, x):
+        """
+        Dequantize the weight with `convert_from_uint4` and apply it to `x`.
+
+        All leading dimensions of `x` are flattened for the matmul and
+        restored afterwards, with the last dimension replaced by
+        `self.outfeatures`. The bias is added when present.
+        """
+        leading_dims = x.shape[:-1]
+        flat_x = x.reshape(-1, x.shape[-1])
+        weight = convert_from_uint4(self.qweight, self.scales, self.qzeros, flat_x.dtype)
+        out = torch.matmul(flat_x, weight).reshape(leading_dims + (self.outfeatures,))
+        if self.bias is not None:
+            out = out + self.bias
+        return out
--- a/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py
+++ b/backends/gaudi/server/text_generation_server/layers/gptq/quantize.py
@ -12,9 +12,12 @@ from huggingface_hub import HfApi
 from accelerate import init_empty_weights
 from text_generation_server.utils import initialize_torch_distributed, Weights
 from text_generation_server.utils.hub import weight_files
-from text_generation_server.utils.gptq.quant_linear import QuantLinear
+from text_generation_server.layers.gptq import QuantLinear
 from loguru import logger
 from typing import Optional
+from text_generation_server.layers.gptq.utils import torch_snr_error
+
+from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight

 DEV = torch.device("cuda:0")

@ -370,7 +373,7 @@ def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code):
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
-    except:
+    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )
@ -402,7 +405,7 @@ def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code):
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
-    except:
+    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )
@ -446,7 +449,7 @@ def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code):
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
-    except:
+    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )
@ -502,7 +505,7 @@ def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code):
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
-    except:
+    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )
@ -544,7 +547,7 @@ def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code):
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
-    except:
+    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )
@ -698,6 +701,8 @@ def sequential(
                pass

            def add_batch(name):
+                nonlocal gptq
+
                def tmp(_, inp, out):
                    gptq[name].add_batch(inp[0].data, out.data)

@ -869,6 +874,7 @@ def quantize(
    upload_to_model_id: Optional[str],
    percdamp: float,
    act_order: bool,
+    sym: bool,
 ):
    print("loading model")
    config = AutoConfig.from_pretrained(
@ -891,6 +897,7 @@ def quantize(
        dtype=torch.float16,
        process_group=process_group,
        aliases={"embed_tokens.weight": ["lm_head.weight"]},
+        weights_loader=DefaultWeightsLoader(UnquantizedWeight),
    )
    hooks = []
    for name, module in model.named_modules():
@ -943,22 +950,30 @@ def quantize(
        percdamp=percdamp,
        act_order=act_order,
        hooks=hooks,
+        sym=sym,
    )
    print(time.time() - tick)

    pack(model, quantizers, bits, groupsize)
    from safetensors.torch import save_file
-    from transformers.modeling_utils import shard_checkpoint
+    from huggingface_hub import split_torch_state_dict_into_shards

    state_dict = model.state_dict()
    state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()}
-    state_dict["gptq_bits"] = torch.LongTensor([bits])
-    state_dict["gptq_groupsize"] = torch.LongTensor([groupsize])

    max_shard_size = "10GB"
-    shards, index = shard_checkpoint(
-        state_dict, max_shard_size=max_shard_size, weights_name="model.safetensors"
+    state_dict_split = split_torch_state_dict_into_shards(
+        state_dict,
+        filename_pattern="model.safetensors",
+        max_shard_size=max_shard_size,
    )
+    index = None
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+    shards = state_dict_split.filename_to_tensors
    os.makedirs(output_dir, exist_ok=True)
    for shard_file, shard in shards.items():
        save_file(
@ -985,6 +1000,15 @@ def quantize(
            f"index located at {save_index_file}."
        )
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
+    config.quantization_config = {
+        "bits": bits,
+        "group_size": groupsize,
+        "damp_percent": percdamp,
+        "desc_act": act_order,
+        "static_groups": False,
+        "sym": sym,
+        "quant_method": "gptq",
+    }
    config.save_pretrained(output_dir)
    logger.info("Saved config")
    logger.info("Saving tokenizer")
--- a/backends/gaudi/server/text_generation_server/layers/gptq/utils.py
+++ b/backends/gaudi/server/text_generation_server/layers/gptq/utils.py
@ -0,0 +1,56 @@
+import torch
+
+
+# copied from https://github.com/openppl-public/ppq/blob/master/ppq/quantization/measure/norm.py
+def torch_snr_error(
+    y_pred: torch.Tensor, y_real: torch.Tensor, reduction: str = "mean"
+) -> torch.Tensor:
+    """
+    Compute the noise-to-signal power ratio ("SNR error") between y_pred and y_real.
+
+    Both tensors are flattened from dimension 1 onward (1-D inputs are first
+    given a leading dimension of size 1), and the error is computed per row:
+
+        SNR(pred, real) = sum((pred - real) ^ 2) / (sum(real ^ 2) + 1e-7)
+
+    The small constant in the denominator avoids a division by zero when a
+    row of y_real is all zeros. The per-row errors are then combined
+    according to `reduction`.
+
+    Args:
+        y_pred (torch.Tensor): Predicted values.
+        y_real (torch.Tensor): Reference values; must have the same shape as y_pred.
+        reduction (str, optional): How to combine the per-row errors: 'mean',
+            'sum' or 'none' (case-insensitive). Defaults to 'mean'.
+
+    Raises:
+        ValueError: If y_pred and y_real have different shapes.
+        ValueError: If reduction is not one of 'mean', 'sum' or 'none'.
+
+    Returns:
+        torch.Tensor: The mean or sum of the per-row errors, or the per-row
+        errors themselves when reduction is 'none'.
+    """
+    if y_pred.shape != y_real.shape:
+        raise ValueError(
+            f"Can not compute snr loss for tensors with different shape. "
+            f"({y_pred.shape} and {y_real.shape})"
+        )
+    reduction = str(reduction).lower()
+
+    # Give 1-D inputs a leading dimension so the flatten below is valid.
+    if y_pred.ndim == 1:
+        y_pred = y_pred.unsqueeze(0)
+        y_real = y_real.unsqueeze(0)
+
+    y_pred = y_pred.flatten(start_dim=1)
+    y_real = y_real.flatten(start_dim=1)
+
+    noise_power = torch.pow(y_pred - y_real, 2).sum(dim=-1)
+    signal_power = torch.pow(y_real, 2).sum(dim=-1)
+    # 1e-7 guards against division by zero for all-zero reference rows.
+    snr = (noise_power) / (signal_power + 1e-7)
+
+    if reduction == "mean":
+        return torch.mean(snr)
+    elif reduction == "sum":
+        return torch.sum(snr)
+    elif reduction == "none":
+        return snr
+    else:
+        raise ValueError("Unsupported reduction method.")
--- a/backends/gaudi/server/text_generation_server/layers/layernorm.py
+++ b/backends/gaudi/server/text_generation_server/layers/layernorm.py
@ -0,0 +1,62 @@
+import torch
+from torch import nn
+from accelerate import init_empty_weights
+
+
+# Monkey patching: the two loaders below are attached to torch.nn.LayerNorm
+# further down in this module.
+@classmethod
+def load_layer_norm(cls, prefix, weights, eps):
+    """Build a `cls` layer norm from the `{prefix}.weight` and `{prefix}.bias` tensors.
+
+    Args:
+        prefix: Name prefix of the tensors to fetch from `weights`.
+        weights: Object exposing `get_tensor(name)`.
+        eps: Epsilon forwarded to the `cls` constructor.
+
+    Returns:
+        The constructed module, with the loaded weight and bias as parameters.
+    """
+    weight = weights.get_tensor(f"{prefix}.weight")
+    bias = weights.get_tensor(f"{prefix}.bias")
+    # Construct the module inside accelerate's init_empty_weights context
+    # (presumably to avoid allocating real parameter storage), then replace
+    # its parameters with the loaded tensors.
+    with init_empty_weights():
+        ln = cls(weight.shape, eps=eps)
+
+    ln.weight = torch.nn.Parameter(weight)
+    ln.bias = torch.nn.Parameter(bias)
+    return ln
+
+
+@classmethod
+def load_layer_norm_no_bias(cls, prefix, weights, eps):
+    """Build a `cls` layer norm from the `{prefix}.weight` tensor only.
+
+    Same as the bias-carrying loader, except that no `{prefix}.bias` tensor is
+    fetched and the module's `bias` is set to None.
+
+    Args:
+        prefix: Name prefix of the tensor to fetch from `weights`.
+        weights: Object exposing `get_tensor(name)`.
+        eps: Epsilon forwarded to the `cls` constructor.
+
+    Returns:
+        The constructed module, with the loaded weight and no bias.
+    """
+    weight = weights.get_tensor(f"{prefix}.weight")
+    # Construct inside accelerate's init_empty_weights context, then replace
+    # the weight with the loaded tensor.
+    with init_empty_weights():
+        ln = cls(weight.shape, eps=eps)
+
+    ln.weight = torch.nn.Parameter(weight)
+    ln.bias = None
+    return ln
+
+
+# Attach the loaders to torch.nn.LayerNorm itself, so LayerNorm and its
+# subclasses (e.g. FastLayerNorm below) expose `.load` / `.load_no_bias`.
+torch.nn.LayerNorm.load = load_layer_norm
+torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
+
+
+class FastLayerNorm(nn.LayerNorm):
+    """LayerNorm that optionally adds a residual before normalizing."""
+
+    def forward(self, hidden_states, residual=None):
+        """Return `(normalized, residual)`.
+
+        `residual`, when given, is added to `hidden_states` first. The second
+        element of the returned tuple is that pre-normalization tensor.
+        """
+        if residual is not None:
+            # In-place add: this also mutates the caller's `hidden_states`.
+            hidden_states += residual
+        # The (possibly summed) input is handed back as the new residual.
+        residual = hidden_states
+
+        return super().forward(hidden_states), residual
+
+
+class FastRMSNorm(nn.Module):
+    """RMS normalization with a learned scale and an optional residual add."""
+
+    def __init__(self, weight: torch.Tensor, eps: float):
+        """Store `weight` as the scale parameter and `eps` as the variance epsilon."""
+        super().__init__()
+
+        self.weight = nn.Parameter(weight)
+        self.variance_epsilon = eps
+
+    @classmethod
+    def load(cls, prefix, weights, eps=1e-6):
+        """Build the module from the `{prefix}.weight` tensor in `weights`."""
+        weight = weights.get_tensor(f"{prefix}.weight")
+        return cls(weight, eps)
+
+    def forward(self, hidden_states, residual=None):
+        """Return `(normalized, residual)`.
+
+        `residual`, when given, is added to `hidden_states` first. The second
+        element of the returned tuple is that pre-normalization tensor.
+        """
+        if residual is not None:
+            # In-place add: this also mutates the caller's `hidden_states`.
+            hidden_states += residual
+        # Captured before the float32 cast below.
+        residual = hidden_states
+        # Normalize in float32: divide by sqrt(mean(x^2) over the last dim + eps).
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # Cast back to the weight's dtype before applying the learned scale.
+        return self.weight * hidden_states.to(self.weight.dtype), residual
--- a/backends/gaudi/server/text_generation_server/layers/linear.py
+++ b/backends/gaudi/server/text_generation_server/layers/linear.py
@ -0,0 +1,38 @@
+import torch
+from torch.nn import functional as F
+
+
+class FastLinear(torch.nn.Module):
+    """Linear layer wrapping pre-loaded weight/bias tensors as frozen parameters."""
+
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        """Store `weight` (and `bias`, if not None) with `requires_grad=False`."""
+        super().__init__()
+        self.weight = torch.nn.Parameter(weight, requires_grad=False)
+        if bias is not None:
+            self.bias = torch.nn.Parameter(bias, requires_grad=False)
+        else:
+            self.bias = None
+
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        """Build the layer from `{prefix}.weight` and, when `bias` is true, `{prefix}.bias`.
+
+        `config` is accepted but not used by this loader.
+        """
+        weight = weights.get_tensor(f"{prefix}.weight")
+        # `bias` arrives as a flag and is rebound to the tensor (or None).
+        if bias:
+            bias = weights.get_tensor(f"{prefix}.bias")
+        else:
+            bias = None
+        return cls(weight, bias)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        """Apply `F.linear` with the stored weight and bias."""
+        return F.linear(input, self.weight, self.bias)
+
+
+def get_linear(weight, bias):
+    """Return a linear layer for `weight`.
+
+    A bare `torch.Tensor` is wrapped in `FastLinear`; any other weight object
+    is asked to build its own layer through `weight.get_linear(bias)`.
+    """
+    # Weights that are loaded through methods that are not
+    # quantization-aware are still bare tensors. We may want
+    # to change this in the future.
+    if isinstance(weight, torch.Tensor):
+        return FastLinear(weight, bias)
+
+    return weight.get_linear(bias)
--- a/backends/gaudi/server/text_generation_server/layers/lora.py
+++ b/backends/gaudi/server/text_generation_server/layers/lora.py
@ -0,0 +1,279 @@
+from typing import TYPE_CHECKING, Optional, List
+
+import torch
+import torch.distributed
+from torch import nn
+from torch.distributed import ProcessGroup
+
+from text_generation_server.utils.sgmv import (
+    add_lora_a_bgmv,
+    add_lora_b_bgmv,
+    has_sgmv,
+    lora_a_sgmv_cutlass,
+    lora_b_sgmv_cutlass,
+    orient_for_rank,
+)
+
+if TYPE_CHECKING:
+    from text_generation_server.adapters import AdapterBatchData
+    from text_generation_server.adapters.lora import BatchLoraWeights
+
+
+class LoraLinear(nn.Module):
+    """Base class that adds LoRA adapter contributions to a base layer's output.
+
+    Subclasses implement `collect_lora_a`, which combines the intermediate
+    `input @ A` result across the ranks of `process_group`.
+    """
+
+    def __init__(
+        self, base_layer: nn.Module, layer_id: int, process_group: ProcessGroup
+    ):
+        """Store the wrapped layer, its layer index and the process group."""
+        super().__init__()
+        self.base_layer = base_layer
+        self.layer_id = layer_id
+        self.process_group = process_group
+
+    def forward_layer_type(
+        self,
+        result: torch.Tensor,
+        input: torch.Tensor,
+        adapter_data: "AdapterBatchData",
+        layer_type: str,
+        start_idx: int,
+        end_idx: int,
+    ) -> torch.Tensor:
+        """Add the LoRA contribution for `layer_type` into `result` and return it.
+
+        The contribution targets the column range `result[:, start_idx:end_idx]`.
+        `result` is returned untouched when `adapter_data` is None. When the
+        SGMV kernels are available and the batch data can be vectorized, the
+        SGMV (prefill) or BGMV (decode) kernels are used; otherwise each
+        adapter in the batch is applied through `forward_lora`.
+
+        Raises:
+            ValueError: If a rank segment lacks its LoRA A or B pointer.
+        """
+        if adapter_data is None:
+            return result
+        data: Optional["BatchLoraWeights"] = adapter_data.data.get(layer_type)
+
+        if has_sgmv() and data is not None and data.can_vectorize(self.process_group):
+            # In tensor-parallel configurations, each GPU processes a specific segment of the output.
+            # The 'result' tensor represents the full output, which can vary in size based on
+            # the layer type (e.g., attention vs. feed-forward layers). We define the current
+            # segment using start_idx and end_idx. If the segment size doesn't match this GPU's
+            # slice of 'result', we create a zero tensor of the correct size for LoRA computation.
+            # This approach ensures accurate LoRA application across various layer sizes and
+            # configurations, adapting to different model architectures and parallelization strategies.
+            #
+            # Example scenarios where this is necessary:
+            # 1. The adapter's size doesn't evenly divide across GPUs.
+            # 2. We're processing the last segment which might be smaller.
+            # 3. Different projection layers (q, k, v) have different sizes.
+            if end_idx - start_idx != result.shape[1]:
+                proj = torch.zeros_like(result[:, start_idx:end_idx])
+            else:
+                # The segment spans all of `result`'s columns: the kernels
+                # below are handed `result` itself as their output tensor.
+                proj = result
+
+            for r, rank_segments in data.rank_data.items():
+                lora_a_ptr = rank_segments.lora_a_ptr
+                lora_b_ptr = rank_segments.lora_b_ptr
+
+                if lora_a_ptr is None or lora_b_ptr is None:
+                    raise ValueError("LoRA data is missing")
+
+                if data.use_sgmv:
+                    # Use SGMV for prefill
+                    v = lora_a_sgmv_cutlass(
+                        input,
+                        rank_segments.tmp_shrink,
+                        lora_a_ptr,
+                        rank_segments.segment_starts,
+                        rank_segments.segment_ends,
+                        self.layer_id,
+                        r,
+                    )
+
+                    if self.process_group.size() > 1:
+                        v = self.collect_lora_a(v)
+
+                    # NOTE(review): the return value is unused, so this kernel
+                    # presumably accumulates into `proj` in place — confirm.
+                    lora_b_sgmv_cutlass(
+                        proj,
+                        v,
+                        rank_segments.tmp_expand,
+                        lora_b_ptr,
+                        rank_segments.segment_starts,
+                        rank_segments.segment_ends,
+                        self.layer_id,
+                    )
+                else:
+                    # Use BGMV for decode
+                    v = torch.zeros(
+                        (input.size(0), r), dtype=input.dtype, device=input.device
+                    )
+                    # TODO: error with [-1, 0], but not [0, -1]
+                    add_lora_a_bgmv(
+                        v,
+                        input,
+                        lora_a_ptr,
+                        rank_segments.indices,
+                        self.layer_id,
+                    )
+
+                    if self.process_group.size() > 1:
+                        v = self.collect_lora_a(v)
+
+                    add_lora_b_bgmv(
+                        proj,
+                        v,
+                        lora_b_ptr,
+                        rank_segments.indices,
+                        self.layer_id,
+                    )
+
+            # A separate `proj` buffer was used: fold it into the target slice.
+            if end_idx - start_idx != result.shape[1]:
+                result[:, start_idx:end_idx] += proj
+        else:
+            # Fallback path: apply each adapter present in the batch one by one.
+            for adapter_index in adapter_data.meta.adapter_set:
+                if data is not None and data.has_adapter(adapter_index):
+                    # Column mask (via view(-1, 1)) that is 1 for the batch
+                    # entries using this adapter and 0 elsewhere.
+                    adapter_mask = (
+                        (adapter_data.meta.adapter_indices == adapter_index)
+                        .to(input.dtype)
+                        .view(-1, 1)
+                    )
+                    layer_result = self.forward_lora(
+                        input, data, adapter_index, adapter_mask
+                    )
+                    result[:, start_idx:end_idx] += layer_result
+
+        return result
+
+    def forward_lora(
+        self,
+        input: torch.Tensor,
+        data: "BatchLoraWeights",
+        adapter_index: int,
+        adapter_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute `((input @ A) @ B) * adapter_mask` for one adapter.
+
+        A and B are this layer's slices (`self.layer_id`) of the adapter's
+        LoRA weights. When the process group has more than one rank, the
+        intermediate `input @ A` is combined through `collect_lora_a`.
+        """
+        lora_a = data.lora_a[adapter_index][self.layer_id, :, :]
+        lora_b = data.lora_b[adapter_index][self.layer_id, :, :]
+
+        lora_a = orient_for_rank(lora_a, lora_b.size(0))
+
+        a_out = input @ lora_a
+        if self.process_group.size() > 1:
+            a_out = self.collect_lora_a(a_out)
+
+        result = (a_out @ lora_b) * adapter_mask
+        return result
+
+    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
+        """Combine `a_out` across ranks; must be provided by subclasses."""
+        raise NotImplementedError("Implemented in subclasses")
+
+
+class TensorParallelMultiAdapterLinear(LoraLinear):
+    """LoRA wrapper for a base layer whose output concatenates several projections.
+
+    `layer_names` lists the adapter layer types (one per projection) and
+    `sizes` their output widths, which are divided by the process group size
+    to locate each projection's column range in the output.
+    """
+
+    def __init__(
+        self,
+        base_layer: nn.Module,
+        layer_id: int,
+        layer_names: List[str],
+        sizes: List[int],
+        process_group: ProcessGroup,
+    ):
+        """Store the projection names and sizes on top of the base LoRA state."""
+        super().__init__(base_layer, layer_id, process_group)
+        self.layer_names = layer_names
+        self.sizes = sizes
+
+    @classmethod
+    def load(
+        cls,
+        base_layer: nn.Module,
+        layer_id: int,
+        layer_names: List[str],
+        sizes: List[int],
+        process_group: ProcessGroup,
+    ):
+        """Construct an instance of `cls` from the given arguments."""
+        # Build through `cls` (not the hard-coded class name) so subclasses
+        # calling `load` get an instance of their own type.
+        return cls(base_layer, layer_id, layer_names, sizes, process_group)
+
+    def forward(
+        self, input: torch.Tensor, adapter_data: "AdapterBatchData"
+    ) -> torch.Tensor:
+        """Run the base layer, then add the LoRA contribution of every projection.
+
+        Returns the base layer's output unchanged when `layer_names` is None.
+        """
+        result = self.base_layer(input)
+
+        # noop if no layer names are provided (e.g. for models without adapters)
+        if self.layer_names is None:
+            return result
+
+        # handle models like Bloom that have inputs of shape
+        # (batch_size, sequence_length, hidden_size)
+        # we need to reshape them to (batch_size * sequence_length, hidden_size)
+        # for the LoRA computation, then reshape back
+        prev_shape = result.shape
+        is_3d = len(input.shape) >= 3
+        if is_3d:
+            input = input.reshape(-1, input.shape[-1])
+            result = result.reshape(-1, result.shape[-1])
+
+        offset = 0
+        for i, layer_name in enumerate(self.layer_names):
+            start_idx = offset // self.process_group.size()
+            # The 'sizes' parameter is essential in tensor-parallel setups for handling multiple
+            # projection layers (q_proj, k_proj, v_proj) by defining their output dimensions. It
+            # ensures correct slicing of the result tensor, accommodating variations like grouped-query
+            # attention where k_proj and v_proj differ from q_proj. This allows precise application of
+            # LoRA adapters to each sub-component of the multi-head attention mechanism, managing the
+            # different projection sizes across layers and model architectures.
+            if self.sizes is not None:
+                offset += self.sizes[i]
+                end_idx = offset // self.process_group.size()
+            else:
+                end_idx = result.shape[1]
+
+            result = self.forward_layer_type(
+                result, input, adapter_data, layer_name, start_idx, end_idx
+            )
+
+        if is_3d:
+            result = result.reshape(prev_shape)
+
+        return result
+
+    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
+        """All-gather `a_out` over the process group and concatenate along dim 1."""
+        # Tensor parallel implementation of X @ A@B, where A and B are sharded column-wise.
+        # We use an all-gather between X@A and (X@A)@B to ensure alignment across ranks.
+        #
+        # TODO(travis): this is not very efficient as we do an all-gather for every adapter,
+        #   instead we could pre-allocate a (B, a, r) tensor for all adapters with the same
+        #   rank, compute `a_out` on each, and then slice them into the buffer as shown here:
+        #   https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609
+        gathered_tensors = [
+            torch.empty_like(a_out) for _ in range(self.process_group.size())
+        ]
+        # Gather over the same group the buffers were sized for; without
+        # `group=` the default group is used, which only matches when
+        # `process_group` is the default group.
+        torch.distributed.all_gather(
+            gathered_tensors, a_out, group=self.process_group
+        )
+        return torch.cat(gathered_tensors, dim=1)
+
+
+class TensorParallelAdapterRowLinear(LoraLinear):
+    """LoRA wrapper for a single-projection base layer identified by `layer_name`."""
+
+    def __init__(self, base_layer, layer_id, layer_name, process_group):
+        """Store the adapter layer name on top of the base LoRA state."""
+        super().__init__(base_layer, layer_id, process_group)
+        self.layer_name = layer_name
+
+    @classmethod
+    def load(cls, base_layer, layer_id, layer_name, process_group):
+        """Construct an instance of `cls` from the given arguments."""
+        return cls(base_layer, layer_id, layer_name, process_group)
+
+    def forward(
+        self, input: torch.Tensor, adapter_data: "AdapterBatchData"
+    ) -> torch.Tensor:
+        """Run the base layer, then add this rank's slice of the LoRA contribution.
+
+        Returns the base layer's output unchanged when `layer_name` is None.
+        """
+        result = self.base_layer(input)
+
+        if self.layer_name is None:
+            return result
+
+        # Fused all-gather + all-reduce from S-LoRA paper: https://arxiv.org/abs/2311.03285
+        # Each rank targets an equal-width column stripe of the output.
+        stride = result.shape[-1] // self.process_group.size()
+        start_idx = self.process_group.rank() * stride
+        end_idx = (self.process_group.rank() + 1) * stride
+
+        # The return value is not used: `result` is relied on to be updated
+        # in place by forward_layer_type.
+        self.forward_layer_type(
+            result, input, adapter_data, self.layer_name, start_idx, end_idx
+        )
+
+        return result
+
+    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
+        """All-reduce `a_out` in place over the process group and return it."""
+        # Tensor parallel implementation of X @ A@B, where A and B are sharded row-wise.
+        # We use an all-reduce between X@A and (X@A)@B to ensure alignment across ranks.
+        #
+        # TODO(travis): this is not very efficient as we do an all-reduce for every adapter,
+        #   instead we could pre-allocate a (B, a, r) tensor for all adapters with the same
+        #   rank, compute `a_out` on each, and then slice them into the buffer as shown here:
+        #   https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609
+        torch.distributed.all_reduce(a_out, group=self.process_group)
+        return a_out
--- a/backends/gaudi/server/text_generation_server/layers/medusa.py
+++ b/backends/gaudi/server/text_generation_server/layers/medusa.py
@ -0,0 +1,191 @@
+import torch
+from torch import nn
+from typing import Tuple, Optional
+from text_generation_server.utils.speculate import get_speculate
+from text_generation_server.layers.linear import FastLinear
+from text_generation_server.layers.tensor_parallel import (
+    TensorParallelHead,
+    TensorParallelColumnLinear,
+)
+
+
+class ResBlock(torch.nn.Module):
+    """Residual block: `x + SiLU(linear(x))`."""
+
+    def __init__(self, config, prefix, weights):
+        """Load the biased linear layer stored under `{prefix}.linear`."""
+        super().__init__()
+        self.linear = FastLinear.load(
+            config, prefix=f"{prefix}.linear", weights=weights, bias=True
+        )
+        self.act = torch.nn.SiLU()
+
+    def forward(self, x):
+        """Return the input plus the activated linear projection of the input."""
+        return x + self.act(self.linear(x))
+
+
+class MedusaModel(torch.nn.Module):
+    """Collection of Medusa heads, one per speculated token."""
+
+    def __init__(self, config, medusa_config, weights):
+        """Create `get_speculate()` heads; head `i` loads weights under prefix `"{i}"`."""
+        super().__init__()
+        self.heads = torch.nn.ModuleList(
+            [
+                MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights)
+                for i in range(get_speculate())
+            ]
+        )
+
+    def forward(self, x):
+        """Return the heads' outputs stacked along dim 1, or None when there are no heads."""
+        if not self.heads:
+            return None
+        speculative_logits = torch.stack([head(x) for head in self.heads], dim=1)
+        return speculative_logits
+
+
+class MedusaHead(torch.nn.Module):
+    """One Medusa head: a stack of ResBlocks followed by a bias-free output linear."""
+
+    def __init__(self, config, medusa_config, prefix, weights):
+        """Load `medusa_config["medusa_num_layers"]` blocks plus the output layer.
+
+        Block `i` is loaded from `{prefix}.{i}`; the output layer uses the next
+        index, `{prefix}.{n}`, where `n` is the number of blocks.
+        """
+        super().__init__()
+        self.blocks = torch.nn.ModuleList(
+            [
+                ResBlock(config, prefix=f"{prefix}.{i}", weights=weights)
+                for i in range(medusa_config["medusa_num_layers"])
+            ]
+        )
+        n = len(self.blocks)
+        self.out = FastLinear.load(
+            config, prefix=f"{prefix}.{n}", weights=weights, bias=False
+        )
+
+    def forward(self, x):
+        """Apply every block in order, then the output layer."""
+        for block in self.blocks:
+            x = block(x)
+        x = self.out(x)
+        return x
+
+
+class MedusaHeadV1(nn.Module):
+    """LM head paired with a `MedusaModel` that produces speculative logits."""
+
+    def __init__(self, lm_head, medusa):
+        """Store the language-model head and the Medusa model."""
+        super().__init__()
+        self.lm_head = lm_head
+        self.medusa = medusa
+
+    @staticmethod
+    def load(config, prefix: str, weights):
+        """Build the head from `config.speculator` and the shared `weights`.
+
+        Reads `config.json` from the speculator path, registers every tensor
+        key of each file in `speculator["model_paths"]` in `weights.routing`,
+        then constructs the Medusa model and the LM head.
+
+        Raises:
+            RuntimeError: If a tensor key is already routed to a different file.
+        """
+        from pathlib import Path
+        from safetensors import safe_open
+        import json
+
+        speculator = config.speculator
+
+        path = speculator["path"]
+        medusa_config_path = str(Path(path) / "config.json")
+
+        # Parse the config once, before the loop. Doing it inside the loop
+        # rebinds the path variable to the parsed dict, so a second model file
+        # would try to `open()` a dict and fail.
+        with open(medusa_config_path, "r") as f:
+            medusa_config = json.load(f)
+
+        routing = weights.routing
+        for fname in speculator["model_paths"]:
+            filename = str(Path(path) / fname)
+
+            # Make the speculator tensors reachable through `weights`.
+            with safe_open(filename, framework="pytorch") as f:
+                for k in f.keys():
+                    if k in routing and routing[k] != filename:
+                        raise RuntimeError(
+                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
+                        )
+                    routing[k] = filename
+
+        medusa = MedusaModel(config, medusa_config, weights)
+        lm_head = TensorParallelHead.load(config, prefix, weights)
+        return MedusaHeadV1(lm_head, medusa)
+
+    def forward(
+        self, input: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Return `(logits, speculative_logits)`.
+
+        `speculative_logits` is None when `input` has more than 128 rows.
+        """
+        logits = self.lm_head(input)
+        # If we have too many tokens, we skip speculative logits
+        if input.shape[0] > 128:
+            return logits, None
+
+        speculative_logits = self.medusa(input)
+        return logits, speculative_logits
+
+
+class MedusaHeadV2(nn.Module):
+    """Medusa head that evaluates all heads with one fused, tensor-parallel linear.
+
+    Only single-layer Medusa configurations are supported (asserted in
+    `__init__`).
+    """
+
+    def __init__(self, config, prefix, weights):
+        """Load the fused Medusa linear and the LM head.
+
+        Reads `config.json` and `medusa_lm_head.safetensors` from
+        `config.speculator["path"]` and registers the tensor keys of the
+        latter in `weights.routing`.
+
+        Raises:
+            RuntimeError: If a tensor key is already routed to a different file.
+        """
+        super().__init__()
+        from pathlib import Path
+        from safetensors import safe_open
+        import json
+
+        speculator_path = config.speculator["path"]
+
+        medusa_config = str(Path(speculator_path) / "config.json")
+        filename = str(Path(speculator_path) / "medusa_lm_head.safetensors")
+
+        # `medusa_config` is rebound from the file path to the parsed dict.
+        with open(medusa_config, "r") as f:
+            medusa_config = json.load(f)
+        routing = weights.routing
+        # Make the speculator tensors reachable through `weights`.
+        with safe_open(filename, framework="pytorch") as f:
+            for k in f.keys():
+                if k in routing and routing[k] != filename:
+                    raise RuntimeError(
+                        f"Key {k} was found in multiple files: {filename} and {routing[k]}"
+                    )
+                routing[k] = filename
+
+        self.n_medusa_heads = get_speculate()
+
+        assert medusa_config["medusa_num_layers"] == 1
+        # One fused layer built from every head's `{i}.0.linear` tensors.
+        self.linear = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{i}.0.linear" for i in range(self.n_medusa_heads)],
+            dim=0,
+            weights=weights,
+            bias=True,
+        )
+        self.process_group = weights.process_group
+        self.world_size = self.process_group.size()
+        self.rank = self.process_group.rank()
+
+        self.act = torch.nn.SiLU()
+
+        self.lm_head = TensorParallelHead.load(config, prefix, weights)
+
+    def forward(self, x):
+        """Return `(logits, speculative_logits)`.
+
+        `speculative_logits` is None when `x` has more than 128 rows.
+        NOTE(review): the `x[:, start:stop]` slicing assumes `x` is 2-D with
+        the hidden dimension last — confirm against callers.
+        """
+        # If we have too many tokens, we skip speculative logits
+        if x.shape[0] > 128:
+            logits = self.lm_head(x)
+            return logits, None
+
+        # This rank's slice of the last dimension (ceil-divided across ranks).
+        size = x.shape[-1]
+        block_size = (size + self.world_size - 1) // self.world_size
+        start = self.rank * block_size
+        stop = (self.rank + 1) * block_size
+
+        x_block = x[:, start:stop]
+
+        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
+        medusa_res = self.act(self.linear(x)).reshape(
+            *x_block.shape[:-1], self.n_medusa_heads, x_block.shape[-1]
+        )
+
+        # Apply all residual medusa heads
+        output = x[:, start:stop].unsqueeze(-2) + medusa_res
+
+        # Gather medusa heads
+        world_output = [
+            torch.empty_like(output) for _ in range(self.process_group.size())
+        ]
+        torch.distributed.all_gather(world_output, output, group=self.process_group)
+        world_output = torch.cat(world_output, dim=-1)
+
+        # Stack x and medusa residual x
+        stacked_x = torch.cat([x.unsqueeze(-2), world_output], dim=-2)
+
+        # Compute lm head on x + medusa residual x
+        logits = self.lm_head(stacked_x)
+
+        # Finally, split logits from speculative logits
+        logits, speculative_logits = torch.split(
+            logits, [1, self.n_medusa_heads], dim=-2
+        )
+        # Squeeze added dimension
+        logits = logits.squeeze(-2)
+
+        return logits, speculative_logits
--- a/Show More
+++ b/Show More