Merge branch 'huggingface:main' into main

2025-09-12 12:54:52 +00:00 · 2024-07-08 14:45:13 +08:00 · 2024-07-08 14:45:13 +08:00 · 28cecb663a
commit 28cecb663a
parent 0c7559b7fb 05c094fcfa
439 changed files with 136327 additions and 9649 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -5,14 +5,14 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: | 
+      description: |
        Please share your system info with us (`text-generation-launcher --env` if installed locally).
-        The full command line used that causes issues: 
+        The full command line used that causes issues:
        OS version:
        Rust version (if self-compiling, `cargo version`):
        Model being used (`curl 127.0.0.1:8080/info | jq`):
          If local model please explicit the kind of model and/or equivalents.
-        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`): 
+        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
        Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
        The current version being used:

@ -52,11 +52,11 @@ body:

      placeholder: |
        Steps to reproduce the behavior:
-          
+
          1.
          2.
          3.
-          
+

  - type: textarea
    id: expected-behavior
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@ -19,7 +19,7 @@ body:
      label: Motivation
      description: |
        Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
-        
+

  - type: textarea
    id: contribution
--- a/.github/workflows/autodocs.yaml
+++ b/.github/workflows/autodocs.yaml
@ -0,0 +1,40 @@
+name: Automatic Documentation for Launcher
+
+on:
+  pull_request:
+
+jobs:
+  update_docs:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v2
+
+    - name: Set up Rust
+      uses: actions-rs/toolchain@v1
+      with:
+        profile: minimal
+        toolchain: stable
+
+    - name: Install Protocol Buffers compiler
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y protobuf-compiler libprotobuf-dev
+
+    - name: Install Launcher
+      id: install-launcher
+      run: cargo install --path launcher/
+
+    - name: Install router
+      id: install-router
+      run: cargo install --path router/
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+
+    - name: Check that documentation is up-to-date
+      run: |
+        python update_doc.py --check
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@ -1,68 +1,34 @@
 name: Build and push docker image to internal registry

 on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'main'
-    tags:
-      - 'v*'
-  pull_request:
-    paths:
-      - ".github/workflows/build.yaml"
-      - "integration-tests/**"
-      - "server/**"
-      - "proto/**"
-      - "router/**"
-      - "launcher/**"
-      - "Cargo.lock"
-      - "rust-toolchain.toml"
-      - "Dockerfile"
-    branches:
-      - 'main'
+  workflow_call:
+    inputs:
+      hardware:
+        type: string
+        description: Hardware
+          # options:
+          # - cuda
+          # - rocm
+          # - intel
+        required: true
+      release-tests:
+        description: "Run release integration tests"
+        required: true
+        default: false
+        type: boolean

 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-      EC2_AMI_ID: ami-03cfed9ea28f4b002
-      EC2_INSTANCE_TYPE: g5.12xlarge
-      EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
-      EC2_SECURITY_GROUP: sg-030175c435ac141d6
+  build-and-push:
    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-
-  build-and-push-image:
+      docker_image: ${{ steps.final.outputs.docker_image }}
+      docker_devices: ${{ steps.final.outputs.docker_devices }}
+      runs_on: ${{ steps.final.outputs.runs_on }}
+      label: ${{ steps.final.outputs.label }}
    concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    # TODO see with @Glegendre to get CPU runner here instead
+    runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
    permissions:
      contents: write
      packages: write
@ -72,38 +38,60 @@ jobs:
      security-events: write
    steps:
      - name: Checkout repository
-        uses: actions/checkout@v3
-      - name: Initialize Docker Buildx
-        uses: docker/setup-buildx-action@v2.0.0
-        with:
-          install: true
+        uses: actions/checkout@v4
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
-      - name: Install cosign
-        if: github.event_name != 'pull_request'
-        uses: sigstore/cosign-installer@f3c664df7af409cb4873aa5068053ba9d61a57b6 #v2.6.0
+      - name: Construct harware variables
+        shell: bash
+        run: |
+          case ${{ inputs.hardware }} in
+            cuda)
+                export dockerfile="Dockerfile"
+                export label_extension=""
+                export docker_devices=""
+                export runs_on="nvidia-gpu"
+                ;;
+            rocm)
+                export dockerfile="Dockerfile_amd"
+                export label_extension="-rocm"
+                export docker_devices="/dev/kfd,/dev/dri"
+                # TODO Re-enable when they pass.
+                # export runs_on="amd-gpu-tgi"
+                export runs_on="ubuntu-latest"
+                ;;
+            intel)
+                export dockerfile="Dockerfile_intel"
+                export label_extension="-intel"
+                export docker_devices=""
+                export runs_on="ubuntu-latest"
+                ;;
+          esac
+          echo $dockerfile
+          echo "Dockerfile=${dockerfile}"
+          echo $label_extension
+          echo $docker_devices
+          echo $runs_on
+          echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
+          echo "LABEL=${label_extension}" >> $GITHUB_ENV
+          echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
+          echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
+      - name: Initialize Docker Buildx
+        uses: docker/setup-buildx-action@v3
        with:
-          cosign-release: 'v1.13.1'
-      - name: Tailscale
-        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-        with:
-          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+          install: true
+          config-inline: |
+            [registry."docker.io"]
+              mirrors = ["registry.github-runners.huggingface.tech"]
      - name: Login to GitHub Container Registry
        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Login to internal Container Registry
-        uses: docker/login-action@v2.1.0
-        with:
-          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
-          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
-          registry: registry.internal.huggingface.tech
      - name: Login to Azure Container Registry
        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2.1.0
+        uses: docker/login-action@v3
        with:
          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
@ -112,12 +100,12 @@ jobs:
      - name: Extract metadata (tags, labels) for Docker
        if: ${{ github.event_name == 'pull_request' }}
        id: meta-pr
-        uses: docker/metadata-action@v4.3.0
+        uses: docker/metadata-action@v5
        with:
          images: |
-            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+            registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
          tags: |
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
      # If main, release or tag
      - name: Extract metadata (tags, labels) for Docker
        if: ${{ github.event_name != 'pull_request' }}
@ -127,120 +115,62 @@ jobs:
          flavor: |
            latest=auto
          images: |
-            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+            registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
            ghcr.io/huggingface/text-generation-inference
            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
          tags: |
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+            type=semver,pattern={{version}}${{ env.LABEL }}
+            type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
+            type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: Dockerfile
+          file: ${{ env.DOCKERFILE }}
          push: true
          platforms: 'linux/amd64'
          build-args: |
            GIT_SHA=${{ env.GITHUB_SHA }}
-            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
-      # Sign the resulting Docker image digest except on PRs.
-      # This will only write to the public Rekor transparency log when the Docker
-      # repository is public to avoid leaking data.
-      - name: Sign the published Docker image
-        if: ${{ github.event_name != 'pull_request' }}
-        env:
-          COSIGN_EXPERIMENTAL: "true"
-        # This step uses the identity token to provision an ephemeral certificate
-        # against the sigstore community Fulcio instance.
-        run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign {}@${{ steps.build-and-push.outputs.digest }}
-      - name: Run Trivy in GitHub SBOM mode and submit results to Dependency Graph
-        uses: aquasecurity/trivy-action@master
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          image-ref: 'ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}'
-          format: 'github'
-          output: 'dependency-results.sbom.json'
-          github-pat: ${{ secrets.GITHUB_TOKEN }}
-          scanners: 'vuln'
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@master
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          image-ref: 'ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}'
-          format: 'sarif'
-          output: 'trivy-results.sarif'
-          severity: 'CRITICAL'
-          scanners: 'vuln'
-      - name: Upload Trivy scan results to GitHub Security tab
-        uses: github/codeql-action/upload-sarif@v2
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          sarif_file: 'trivy-results.sarif'
-
-  integration-tests:
+          cache-from: type=registry,ref=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:cache${{ env.LABEL }},mode=min
+          cache-to: type=registry,ref=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:cache${{ env.LABEL }},mode=min
+      - name: Final
+        id: final
+        run: |
+          echo "docker_image=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
+          echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
+          echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
+          echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
+  integration_tests:
    concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
-    needs:
-      - start-runner
-      - build-and-push-image # Wait for the docker image to be built
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    needs: build-and-push
+    runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
+    if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
    env:
-      DOCKER_VOLUME: /cache
+      PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '' }}
    steps:
-      - uses: actions/checkout@v2
+      - name: Checkout repository
+        uses: actions/checkout@v4
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
-          python-version: 3.9
-      - name: Tailscale
-        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-        with:
-          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
+          python-version: "3.10"
      - name: Install
        run: |
          make install-integration-tests
      - name: Run tests
        run: |
-          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-          pytest -s -vv integration-tests
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - build-and-push-image
-      - integration-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+          export DOCKER_VOLUME=/mnt/cache
+          export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
+          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
+          echo $DOCKER_IMAGE
+          pytest -s -vv integration-tests ${PYTEST_FLAGS}
--- a/.github/workflows/build_documentation.yaml
+++ b/.github/workflows/build_documentation.yaml
@ -17,5 +17,4 @@ jobs:
      package: text-generation-inference
      additional_args: --not_python_module
    secrets:
-      token: ${{ secrets.HUGGINGFACE_PUSH }}
-      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yaml
+++ b/.github/workflows/build_pr_documentation.yaml
@ -11,9 +11,9 @@ concurrency:

 jobs:
  build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main
    with:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: text-generation-inference
-      additional_args: --not_python_module 
+      additional_args: --not_python_module
--- a/.github/workflows/ci_build.yaml
+++ b/.github/workflows/ci_build.yaml
@ -0,0 +1,45 @@
+name: CI build
+
+on:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - 'v*'
+  pull_request:
+    paths:
+      - ".github/workflows/build.yaml"
+      - "integration-tests/**"
+      - "server/**"
+      - "proto/**"
+      - "router/**"
+      - "launcher/**"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+      - "Dockerfile"
+      - "Dockerfile_amd"
+      - "Dockerfile_intel"
+    branches:
+      - "main"
+  workflow_dispatch:
+    inputs:
+      release-tests:
+        description: "Run release integration tests"
+        required: true
+        default: false
+        type: boolean
+
+jobs:
+  build:
+    strategy:
+      # super important if you want to see all results, even if one fails
+      # fail-fast is true by default
+      fail-fast: false
+      matrix:
+        hardware: ["cuda", "rocm", "intel"]
+    uses: ./.github/workflows/build.yaml # calls the one above ^
+    with:
+      hardware: ${{ matrix.hardware }}
+      # https://github.com/actions/runner/issues/2206
+      release-tests: ${{ inputs.release-tests == true }}
+    secrets: inherit
--- a/.github/workflows/client-tests.yaml
+++ b/.github/workflows/client-tests.yaml
@ -22,4 +22,5 @@ jobs:
      - name: Run tests
        run: |
          pip install pytest pytest-asyncio
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          make python-client-tests
--- a/.github/workflows/delete_doc_comment.yml
+++ b/.github/workflows/delete_doc_comment.yml
@ -1,12 +0,0 @@
-name: Delete doc comment
-
-on:
-  pull_request:
-    types: [ closed ]
-
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
-    with:
-      pr_number: ${{ github.event.number }}
--- a/.github/workflows/integration_tests.yaml
+++ b/.github/workflows/integration_tests.yaml
@ -0,0 +1,41 @@
+name: Integration tests
+
+on:
+  workflow_call:
+    inputs:
+      docker_image:
+        type: string
+        description: Hardware
+        required: true
+      docker_devices:
+        type: string
+        description: Hardware
+      runs_on:
+        type: string
+        required: true
+        description: Hardware to run integration tests
+jobs:
+  integration_tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on: ${{ inputs.runs_on }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Install
+        run: |
+          make install-integration-tests
+      - name: Run tests
+        run: |
+          export DOCKER_VOLUME=/mnt/cache
+          export DOCKER_IMAGE=${{ inputs.docker_image }}
+          export DOCKER_DEVICES=${{ inputs.docker_devices }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
+          pytest -s -vv integration-tests
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@ -11,66 +11,24 @@ on:
      - 'main'

 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-      EC2_AMI_ID: ami-0ab09c07cfd194259
-      EC2_INSTANCE_TYPE: g5.12xlarge
-      EC2_SUBNET_ID: subnet-988fd9f2,subnet-6f56db13,subnet-6a039326
-      EC2_SECURITY_GROUP: sg-072f92ae3082936c6
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-
  load-tests:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
    env:
      DOCKER_VOLUME: /cache
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
-
      - name: Install k6
        run: |
          curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1

      - name: Start starcoder
        run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
          sleep 10
          wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health

@ -82,27 +40,3 @@ jobs:
        if: ${{ always() }}
        run: |
          docker stop tgi-starcoder || true
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - load-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
--- a/.github/workflows/stale.yaml
+++ b/.github/workflows/stale.yaml
@ -0,0 +1,14 @@
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v8
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
+          days-before-stale: 30
+          days-before-close: 5
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@ -33,11 +33,17 @@ jobs:
      - name: Install Rust
        uses: actions-rs/toolchain@v1
        with:
-          toolchain: 1.71.0
+          # Released on: 02 May, 2024
+          # https://releases.rs/docs/1.78.0/
+          toolchain: 1.79.0
          override: true
          components: rustfmt, clippy
      - name: Install Protoc
        uses: arduino/setup-protoc@v1
+      - name: Clean unused files
+        run: |
+          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
+          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
      - name: Install sccache
        run: |
          curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache
@ -62,18 +68,17 @@ jobs:
            ~/.cargo/git
      - name: Install
        run: |
-          make install
+          make install-cpu
      - name: Run server tests
        run: |
          pip install pytest
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
          pytest -s -vv server/tests
-      - name: Run Rust fmt
+      - name: Pre-commit checks
        run: |
-          cargo fmt --check
-      - name: Run Rust clippy
-        run: |
-          cargo clippy
+          pip install pre-commit
+          pre-commit install
+          pre-commit run --all-files
      - name: Run Rust tests
        run: |
          cargo test
--- a/.github/workflows/trufflehog.yaml
+++ b/.github/workflows/trufflehog.yaml
@ -0,0 +1,18 @@
+on:
+  push:
+
+name: Secret Leaks
+
+permissions:
+  contents: read
+
+jobs:
+  trufflehog:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+    - name: Secret Scanning
+      uses: trufflesecurity/trufflehog@main
--- a/.github/workflows/upload_pr_documentation.yaml
+++ b/.github/workflows/upload_pr_documentation.yaml
@ -13,4 +13,4 @@ jobs:
      package_name: text-generation-inference
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
-      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
+      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,15 @@
 target
 router/tokenizer.json
 *__pycache__*
+
+# ROCm auto-generated files
+*.hip
+server/exllamav2_kernels/exllamav2_kernels/hip/
+server/exllama_kernels/exllama_kernels/hip/
+server/exllama_kernels/exllama_kernels/hip_func/
+*_hip.cuh
+server/exllama_kernels/exllama_kernels/hip_buffers.cuh
+server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
+
+data/
+load_tests/*.json
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,18 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+    -   id: check-yaml
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+        exclude: docs/source/basic_tutorials/launcher.md
+-   repo: https://github.com/psf/black
+    rev: 24.2.0
+    hooks:
+    -   id: black
+-   repo: https://github.com/doublify/pre-commit-rust
+    rev: v1.0
+    hooks:
+    -   id: fmt
+    -   id: cargo-check
+    -   id: clippy
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -0,0 +1,133 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+feedback@huggingface.co.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,120 @@
+<!---
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Contribute to text-generation-inference
+
+Everyone is welcome to contribute, and we value everybody's contribution. Code
+contributions are not the only way to help the community. Answering questions, helping
+others, and improving the documentation are also immensely valuable.
+
+It also helps us if you spread the word! Reference the library in blog posts
+about the awesome projects it made possible, shout out on Twitter every time it has
+helped you, or simply ⭐️ the repository to say thank you.
+
+However you choose to contribute, please be mindful and respect our
+[code of conduct](https://github.com/huggingface/text-generation-inference/blob/main/CODE_OF_CONDUCT.md).
+
+**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
+
+## Ways to contribute
+
+There are several ways you can contribute to text-generation-inference.
+
+* Fix outstanding issues with the existing code.
+* Submit issues related to bugs or desired new features.
+* Contribute to the examples or to the documentation.
+
+> All contributions are equally valuable to the community. 🥰
+
+## Fixing outstanding issues
+
+If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) and open
+a Pull Request!
+
+## Submitting a bug-related issue or feature request
+
+Do your best to follow these guidelines when submitting a bug-related issue or a feature
+request. It will make it easier for us to come back to you quickly and with good
+feedback.
+
+### Did you find a bug?
+
+The text-generation-inference library is robust and reliable thanks to users who report the problems they encounter.
+
+Before you report an issue, we would really appreciate it if you could **make sure the bug was not
+already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the
+library itself, and not your code.
+
+Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so
+we can quickly resolve it:
+
+* Your **OS type and version**, as well as your environment versions (versions of rust, python, and dependencies).
+* A short, self-contained, code snippet that allows us to reproduce the bug.
+* The *full* traceback if an exception is raised.
+* Attach any other additional information, like screenshots, you think may help.
+
+To get the OS and software versions automatically, you can re-run the launcher with the `--env` flag:
+
+```bash
+text-generation-launcher --env
+```
+
+This will precede the launch of the model with the information relative to your environment. We recommend pasting
+that in your issue report.
+
+### Do you want a new feature?
+
+If there is a new feature you'd like to see in text-generation-inference, please open an issue and describe:
+
+1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it
+   a feature related to something you need for a project? Is it something you worked on and think it could benefit
+   the community?
+
+   Whatever it is, we'd love to hear about it!
+
+2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better
+   we'll be able to help you.
+3. Provide a *code snippet* that demonstrates the feature's usage.
+4. If the feature is related to a paper, please include a link.
+
+If your issue is well written we're already 80% of the way there by the time you create it.
+
+We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE)
+to help you get started with your issue.
+
+## Do you want to implement a new model?
+
+New models are constantly released and if you want to implement a new model, please provide the following information:
+
+* A short description of the model and a link to the paper.
+* Link to the implementation if it is open-sourced.
+* Link to the model weights if they are available.
+
+If you are willing to contribute the model yourself, let us know so we can help you add it to text-generation-inference!
+
+## Do you want to add documentation?
+
+We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know
+how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be
+happy to make the changes or help you make a contribution if you're interested!
+
+## I want to become a maintainer of the project. How do I get there?
+
+TGI is a project led and managed by Hugging Face as it powers our internal services. However, we are happy to have
+motivated individuals from other organizations join us as maintainers with the goal of making TGI the best inference
+service.
+
+If you are such an individual (or organization), please reach out to us and let's collaborate.
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -6,15 +6,32 @@ members = [
    "router/grpc-metadata",
    "launcher"
 ]
+resolver = "2"

 [workspace.package]
-version = "1.0.3"
+version = "2.1.2-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"

+[workspace.dependencies]
+base64 = "0.22.0"
+tokenizers = { version = "0.19.1", features = ["http"] }
+hf-hub = { version = "0.3.1", features = ["tokio"] }
+
 [profile.release]
+incremental = true
+
+[profile.release-binary]
+inherits = "release"
 debug = 1
 incremental = true
-lto = "off"
 panic = "abort"
+
+[profile.release-opt]
+inherits = "release"
+debug = 0
+incremental = false
+lto = "fat"
+opt-level = 3
+codegen-units = 1
--- a/161
+++ b/161
@ -1,10 +1,11 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src

 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

-FROM chef as planner
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
@ -15,9 +16,6 @@ RUN cargo chef prepare --recipe-path recipe.json

 FROM chef AS builder

-ARG GIT_SHA
-ARG DOCKER_LABEL
-
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@ -25,7 +23,10 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    rm -f $PROTOC_ZIP

 COPY --from=planner /usr/src/recipe.json recipe.json
-RUN cargo chef cook --release --recipe-path recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL

 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
@ -33,17 +34,17 @@ COPY proto proto
 COPY benchmark benchmark
 COPY router router
 COPY launcher launcher
-RUN cargo build --release
+RUN cargo build --profile release-opt

 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM debian:bullseye-slim as pytorch-install
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install

-ARG PYTORCH_VERSION=2.0.1
-ARG PYTHON_VERSION=3.9
+ARG PYTORCH_VERSION=2.3.0
+ARG PYTHON_VERSION=3.10
 # Keep in sync with `server/pyproject.toml
-ARG CUDA_VERSION=11.8
-ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_VERSION=12.1
+ARG MAMBA_VERSION=24.3.0-0
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch
 # Automatically set by buildx
@ -75,22 +76,21 @@ RUN chmod +x ~/mambaforge.sh && \
 RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  exit 1 ;; \
         *)              /opt/conda/bin/conda update -y conda &&  \
-                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
+                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
    esac && \
    /opt/conda/bin/conda clean -ya

 # CUDA kernels builder image
-FROM pytorch-install as kernel-builder
+FROM pytorch-install AS kernel-builder
+
+ARG MAX_JOBS=8

 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-        ninja-build \
+        ninja-build cmake \
        && rm -rf /var/lib/apt/lists/*

-RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0"  cuda==11.8 && \
-    /opt/conda/bin/conda clean -ya
-
 # Build Flash Attention CUDA kernels
-FROM kernel-builder as flash-att-builder
+FROM kernel-builder AS flash-att-builder

 WORKDIR /usr/src

@ -100,48 +100,85 @@ COPY server/Makefile-flash-att Makefile
 RUN make build-flash-attention

 # Build Flash Attention v2 CUDA kernels
-FROM kernel-builder as flash-att-v2-builder
+FROM kernel-builder AS flash-att-v2-builder

 WORKDIR /usr/src

 COPY server/Makefile-flash-att-v2 Makefile

 # Build specific version of flash attention v2
-RUN make build-flash-attention-v2
+RUN make build-flash-attention-v2-cuda

 # Build Transformers exllama kernels
-FROM kernel-builder as exllama-kernels-builder
-
+FROM kernel-builder AS exllama-kernels-builder
 WORKDIR /usr/src
-
 COPY server/exllama_kernels/ .

+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
+# Build Transformers exllama kernels
+FROM kernel-builder AS exllamav2-kernels-builder
+WORKDIR /usr/src
+COPY server/exllamav2_kernels/ .

 # Build specific version of transformers
 RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

-# Build Transformers CUDA kernels
-FROM kernel-builder as custom-kernels-builder
-
+# Build Transformers awq kernels
+FROM kernel-builder AS awq-kernels-builder
 WORKDIR /usr/src
+COPY server/Makefile-awq Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq

+# Build eetq kernels
+FROM kernel-builder AS eetq-kernels-builder
+WORKDIR /usr/src
+COPY server/Makefile-eetq Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+
+# Build marlin kernels
+FROM kernel-builder AS marlin-kernels-builder
+WORKDIR /usr/src
+COPY server/marlin/ .
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
+# Build Lorax Punica kernels
+FROM kernel-builder AS lorax-punica-builder
+WORKDIR /usr/src
+COPY server/Makefile-lorax-punica Makefile
+# Build specific version of transformers
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
+
+# Build Transformers CUDA kernels
+FROM kernel-builder AS custom-kernels-builder
+WORKDIR /usr/src
 COPY server/custom_kernels/ .
-
 # Build specific version of transformers
 RUN python setup.py build

 # Build vllm CUDA kernels
-FROM kernel-builder as vllm-builder
+FROM kernel-builder AS vllm-builder

 WORKDIR /usr/src

+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
 COPY server/Makefile-vllm Makefile

 # Build specific version of vllm
-RUN make build-vllm
+RUN make build-vllm-cuda
+
+# Build mamba kernels
+FROM kernel-builder AS mamba-builder
+WORKDIR /usr/src
+COPY server/Makefile-selective-scan Makefile
+RUN make build-all

 # Text Generation Inference base image
-FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
+FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base

 # Conda env
 ENV PATH=/opt/conda/bin:$PATH \
@ -158,26 +195,41 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
        libssl-dev \
        ca-certificates \
        make \
+        curl \
+        git \
        && rm -rf /var/lib/apt/lists/*

 # Copy conda with PyTorch installed
 COPY --from=pytorch-install /opt/conda /opt/conda

 # Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages

 # Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from eetq kernels builder
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from marlin kernels builder
+COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

 # Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from mamba builder
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages

 # Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
@ -188,23 +240,27 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
    make gen-server && \
-    pip install -r requirements.txt && \
-    pip install ".[bnb, accelerate, quantize]" --no-cache-dir
-
-# Install benchmarker
-COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
-# Install router
-COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
-# Install launcher
-COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+    pip install -r requirements_cuda.txt && \
+    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir

+# Deps before the binaries
+# The binaries change on every build given we burn the SHA into them
+# The deps change less often.
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
        && rm -rf /var/lib/apt/lists/*

-# AWS Sagemaker compatbile image
-FROM base as sagemaker
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# AWS Sagemaker compatible image
+FROM base AS sagemaker

 COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
@ -214,5 +270,8 @@ ENTRYPOINT ["./entrypoint.sh"]
 # Final image
 FROM base

-ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+# CMD ["--json-output"]
--- a/218
+++ b/218
@ -0,0 +1,218 @@
+# Rust builder
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo build --profile release-opt
+
+# Text Generation Inference base image for RoCm
+FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update AS base
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    ccache \
+    curl \
+    git \
+    make \
+    libssl-dev \
+    g++ \
+    # Needed to build VLLM & flash.
+    rocthrust-dev \
+    hipsparse-dev \
+    hipblas-dev \
+    hipblaslt-dev \
+    rocblas-dev \
+    hiprand-dev \
+    rocrand-dev \
+    miopen-hip-dev \
+    hipfft-dev \
+    hipcub-dev \
+    hipsolver-dev \
+    rccl-dev \
+    cmake \
+    python3-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Keep in sync with `server/pyproject.toml
+ARG MAMBA_VERSION=23.1.0-1
+ARG PYTORCH_VERSION='2.3.0'
+ARG ROCM_VERSION='6.0.2'
+ARG PYTHON_VERSION='3.10.10'
+# Automatically set by buildx
+ARG TARGETPLATFORM
+ENV PATH /opt/conda/bin:$PATH
+
+# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
+# Install mamba
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
+         *)              MAMBA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O  "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    mamba init && \
+    rm ~/mambaforge.sh
+
+# Install flash-attention, torch dependencies
+RUN pip install numpy einops ninja --no-cache-dir
+
+RUN conda install intel::mkl-static intel::mkl-include
+RUN pip uninstall -y triton && \
+    git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
+    cd triton/python && \
+    pip install .
+
+RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir
+
+ARG _GLIBCXX_USE_CXX11_ABI="1"
+ARG CMAKE_PREFIX_PATH="/opt/conda"
+ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+ARG BUILD_CAFFE2="0" \
+    BUILD_CAFFE2_OPS="0" \
+    USE_CUDA="0" \
+    USE_ROCM="1" \
+    BUILD_TEST="0" \
+    USE_FBGEMM="0" \
+    USE_NNPACK="0" \
+    USE_QNNPACK="0" \
+    USE_XNNPACK="0" \
+    USE_FLASH_ATTENTION="1" \
+    USE_MEM_EFF_ATTENTION="0"
+
+RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
+
+# Set AS recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
+ENV HIP_FORCE_DEV_KERNARG=1
+
+# On MI250 and MI300, performances for flash with Triton FA are slightly better than CK.
+# However, Triton requires a tunning for each prompt length, which is prohibitive.
+ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
+
+FROM base AS kernel-builder
+
+# # Build vllm kernels
+FROM kernel-builder AS vllm-builder
+WORKDIR /usr/src
+
+COPY server/Makefile-vllm Makefile
+
+# Build specific version of vllm
+RUN make build-vllm-rocm
+
+# Build Flash Attention v2 kernels
+FROM kernel-builder AS flash-att-v2-builder
+WORKDIR /usr/src
+
+COPY server/Makefile-flash-att-v2 Makefile
+
+# Build specific version of flash attention v2
+RUN make build-flash-attention-v2-rocm
+
+# Build Transformers CUDA kernels (gpt-neox and bloom)
+FROM kernel-builder AS custom-kernels-builder
+WORKDIR /usr/src
+COPY server/custom_kernels/ .
+RUN python setup.py build
+
+# Build exllama kernels
+FROM kernel-builder AS exllama-kernels-builder
+WORKDIR /usr/src
+COPY server/exllama_kernels/ .
+
+RUN python setup.py build
+
+# Build exllama v2 kernels
+FROM kernel-builder AS exllamav2-kernels-builder
+WORKDIR /usr/src
+COPY server/exllamav2_kernels/ .
+
+RUN python setup.py build
+
+FROM base AS base-copy
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+# Copy builds artifacts from vllm builder
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from flash attention v2 builder
+COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from custom kernels builder
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements_rocm.txt && \
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+# AWS Sagemaker compatible image
+FROM base AS sagemaker
+
+COPY sagemaker-entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]
+
+# Final image
+FROM base-copy
+
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+CMD ["--json-output"]
--- a/177
+++ b/177
@ -0,0 +1,177 @@
+ARG PLATFORM=xpu
+
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo build --profile release-opt
+
+
+# Text Generation Inference base image for Intel
+
+FROM intel/intel-extension-for-pytorch:2.1.30-xpu AS xpu
+
+USER root
+# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
+RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
+    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
+
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+
+WORKDIR /usr/src
+RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl && pip install torch-2.1.0.post1+cxx11.abi-cp310-cp310-linux_x86_64.whl
+RUN pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b distributed origin/dev/distributed
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements_intel.txt && \
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+
+ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
+ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
+ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
+ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
+ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
+ENV PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV CCL_ZE_IPC_EXCHANGE=sockets
+ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
+ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
+
+RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# Text Generation Inference base image for Intel-cpu
+FROM ubuntu:22.04 AS cpu
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    curl \
+    ca-certificates \
+    make \
+    g++ \
+    git \
+    wget \
+    cmake
+
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+ARG MAMBA_VERSION=23.1.0-1
+ARG PYTHON_VERSION='3.10.10'
+# Automatically set by buildx
+ARG TARGETPLATFORM
+ENV PATH /opt/conda/bin:$PATH
+
+# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
+# Install mamba
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
+         *)              MAMBA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O  "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+RUN conda install -c conda-forge gperftools mkl
+
+RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
+RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
+RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
+RUN pip install triton
+
+WORKDIR /usr/src
+
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout eda7a7c42df6f9a64e0de9c2b69304ee02f2c32a
+
+RUN git clone https://github.com/intel/torch-ccl.git && cd torch-ccl && git checkout ccl_torch_dev_0131
+
+RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update --init --recursive && python setup.py install
+
+RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install .
+
+ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so:/opt/conda/lib/libiomp5.so
+ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
+ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
+ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
+ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib
+ENV KMP_BLOCKTIME=1
+ENV KMP_TPAUSE=0
+ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist
+ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist
+ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements_intel.txt && \
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+FROM ${PLATFORM} AS final
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
--- a/318
+++ b/318
@ -1,181 +1,201 @@
-Hugging Face Optimized Inference License 1.0 (HFOILv1.0)
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/

+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

-This License Agreement governs the use of the Software and its Modifications. It is a
-binding agreement between the Licensor and You.
+   1. Definitions.

-This License Agreement shall be referred to as Hugging Face Optimized Inference License
-1.0 or HFOILv1.0. We may publish revised versions of this License Agreement from time to
-time. Each version will be given a distinguished number.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.

-By downloading, accessing, modifying, distributing or otherwise using the Software, You
-consent to all of the terms and conditions below. So, if You do not agree with those,
-please do not download, access, modify, distribute, or use the Software.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.

+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.

-1. PERMISSIONS
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.

-You may use, modify and distribute the Software pursuant to the following terms and
-conditions:
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.

-Copyright License. Subject to the terms and conditions of this License Agreement and where
-and as applicable, each Contributor hereby grants You a perpetual, worldwide,
-non-exclusive, royalty-free, copyright license to reproduce, prepare, publicly display,
-publicly perform, sublicense under the terms herein, and distribute the Software and
-Modifications of the Software.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.

-Patent License. Subject to the terms and conditions of this License Agreement and where
-and as applicable, each Contributor hereby grants You a perpetual, worldwide,
-non-exclusive, royalty-free patent license to make, have made, Use, offer to sell, sell,
-import, and otherwise transfer the Software, where such license applies only to those
-patent claims licensable by such Contributor that are necessarily infringed by their
-Contribution(s) alone or by combination of their Contribution(s) with the Software to
-which such Contribution(s) was submitted. If You institute patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Software
-or a Contribution incorporated within the Software constitutes direct or contributory
-patent infringement, then any rights granted to You under this License Agreement for the
-Software shall terminate as of the date such litigation is filed.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).

-No other rights. All rights not expressly granted herein are retained.
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.

+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."

-2. RESTRICTIONS
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.

-You may not distribute the Software as a hosted or managed, and paid service, where the
-service grants users access to any substantial set of the features or functionality of the
-Software. If you wish to do so, You will need to be granted additional rights from the
-Licensor which will be subject to a separate mutually agreed agreement.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.

-You may not sublicense the Software under any other terms than those listed in this
-License.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.

+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:

-3. OBLIGATIONS
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and

-When You modify the Software, You agree to: - attach a notice stating the Modifications of
-the Software You made; and - attach a notice stating that the Modifications of the
-Software are released under this License Agreement.
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and

-When You distribute the Software or Modifications of the Software, You agree to: - give
-any recipients of the Software a copy of this License Agreement; - retain all Explanatory
-Documentation; and if sharing the Modifications of the Software, add Explanatory
-Documentation documenting the changes made to create the Modifications of the Software; -
-retain all copyright, patent, trademark and attribution notices.
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and

+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.

-4. MISCELLANEOUS
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.

-Termination. Licensor reserves the right to restrict Use of the Software in violation of
-this License Agreement, upon which Your licenses will automatically terminate.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.

-Contributions. Unless You explicitly state otherwise, any Contribution intentionally
-submitted for inclusion in the Software by You to the Licensor shall be under the terms
-and conditions of this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify the terms of any
-separate license agreement you may have executed with Licensor regarding such
-Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.

-Trademarks and related. Nothing in this License Agreement permits You (i) to make Use of
-Licensors’ trademarks, trade names, or logos, (ii) otherwise suggest endorsement by
-Licensor, or (iii) misrepresent the relationship between the parties; and any rights not
-expressly granted herein are reserved by the Licensors.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.

-Output You generate. Licensor claims no rights in the Output. You agree not to contravene
-any provision as stated in the License Agreement with your Use of the Output.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.

-Disclaimer of Warranty. Except as expressly provided otherwise herein, and to the fullest
-extent permitted by law, Licensor provides the Software (and each Contributor provides its
-Contributions) AS IS, and Licensor disclaims all warranties or guarantees of any kind,
-express or implied, whether arising under any law or from any usage in trade, or otherwise
-including but not limited to the implied warranties of merchantability, non-infringement,
-quiet enjoyment, fitness for a particular purpose, or otherwise. You are solely
-responsible for determining the appropriateness of the Software and Modifications of the
-Software for your purposes (including your use or distribution of the Software and
-Modifications of the Software), and assume any risks associated with Your exercise of
-permissions under this License Agreement.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.

-Limitation of Liability. In no event and under no legal theory, whether in tort (including
-negligence), contract, or otherwise, unless required by applicable law (such as deliberate
-and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to
-You for damages, including any direct, indirect, special, incidental, or consequential
-damages of any character arising as a result of this License Agreement or out of the Use
-or inability to Use the Software (including but not limited to damages for loss of
-goodwill, work stoppage, computer failure or malfunction, model failure or malfunction, or
-any and all other commercial damages or losses), even if such Contributor has been advised
-of the possibility of such damages.
+   END OF TERMS AND CONDITIONS

-Accepting Warranty or Additional Liability. While sharing the Software or Modifications of
-the Software thereof, You may choose to offer and charge a fee for, acceptance of support,
-warranty, indemnity, or other liability obligations and/or rights consistent with this
-License Agreement. However, in accepting such obligations, You may act only on Your own
-behalf and on Your sole responsibility, not on behalf of Licensor or any other
-Contributor, and you hereby agree to indemnify, defend, and hold Licensor and each other
-Contributor (and their successors or assigns) harmless for any liability incurred by, or
-claims asserted against, such Licensor or Contributor (and their successors or assigns) by
-reason of your accepting any such warranty or additional liability.
+   APPENDIX: How to apply the Apache License to your work.

-Severability. This License Agreement is a license of copyright and patent rights and an
-agreement in contract between You and the Licensor. If any provision of this License
-Agreement is held to be invalid, illegal or unenforceable, the remaining provisions shall
-be unaffected thereby and remain valid as if such provision had not been set forth herein.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.

+   Copyright 2022 Hugging Face

-5. DEFINITIONS
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at

-“Contribution” refers to any work of authorship, including the original version of the
-Software and any Modifications of the Software that is intentionally submitted to Licensor
-for inclusion in the Software by the copyright owner or by an individual or entity
-authorized to submit on behalf of the copyright owner. For the purposes of this
-definition, “submitted” means any form of electronic, verbal, or written communication
-sent to the Licensor or its representatives, including but not limited to communication on
-electronic mailing lists, source code control systems, and issue tracking systems that are
-managed by, or on behalf of, the Licensor for the purpose of discussing and improving the
-Software, but excluding communication that is conspicuously marked or otherwise designated
-in writing by the copyright owner as “Not a Contribution.”
+       http://www.apache.org/licenses/LICENSE-2.0

-“Contributor” refers to Licensor and any individual or entity on behalf of whom a
-Contribution has been received by Licensor and subsequently incorporated within the
-Software.
-
-“Data” refers to a collection of information extracted from the dataset used with the
-Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not
-licensed under this License Agreement.
-
-“Explanatory Documentation” refers to any documentation or related information including
-but not limited to model cards or data cards dedicated to inform the public about the
-characteristics of the Software. Explanatory documentation is not licensed under this
-License.
-
-"License Agreement" refers to these terms and conditions.
-
-“Licensor” refers to the rights owners or entity authorized by the rights owners that are
-granting the terms and conditions of this License Agreement.
-
-“Model” refers to machine-learning based assemblies (including checkpoints), consisting of
-learnt weights and parameters (including optimizer states), corresponding to a model
-architecture as embodied in Software source code. Source code is not licensed under this
-License Agreement.
-
-“Modifications of the Software” refers to all changes to the Software, including without
-limitation derivative works of the Software.
-
-“Output” refers to the results of operating the Software.
-
-“Share” refers to any transmission, reproduction, publication or other sharing of the
-Software or Modifications of the Software to a third party, including providing the
-Softwaire as a hosted service made available by electronic or other remote means,
-including - but not limited to - API-based or web access.
-
-“Software” refers to the software and Model (or parts of either) that Licensor makes
-available under this License Agreement.
-
-“Third Parties” refers to individuals or legal entities that are not under common control
-with Licensor or You.
-
-“Use” refers to anything You or your representatives do with the Software, including but
-not limited to generating any Output, fine tuning, updating, running, training, evaluating
-and/or reparametrizing the Model.
-
-"You" (or "Your")  refers to an individual or Legal Entity exercising permissions granted
-by this License Agreement and/or making Use of the Software for whichever purpose and in
-any field of Use.
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/17
+++ b/17
@ -1,12 +1,8 @@
 install-server:
 	cd server && make install

-install-custom-kernels:
-	if [ "$$BUILD_EXTENSIONS" = "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need to set the BUILD_EXTENSIONS environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi
-
-install-integration-tests:
-	cd integration-tests && pip install -r requirements.txt
-	cd clients/python && pip install .
+install-server-cpu:
+	cd server && make install-server

 install-router:
 	cd router && cargo install --path .
@ -17,7 +13,10 @@ install-launcher:
 install-benchmark:
 	cd benchmark && cargo install --path .

-install: install-server install-router install-launcher install-custom-kernels
+install: install-server install-router install-launcher
+
+
+install-cpu: install-server-cpu install-router install-launcher

 server-dev:
 	cd server && make run-dev
@ -28,6 +27,10 @@ router-dev:
 rust-tests: install-router install-launcher
 	cargo test

+install-integration-tests:
+	cd integration-tests && pip install -r requirements.txt
+	cd clients/python && pip install .
+
 integration-tests: install-integration-tests
 	pytest -s -vv -m "not private" integration-tests

--- a/README.md
+++ b/README.md
@ -1,6 +1,8 @@
 <div align="center">

-![image](https://github.com/huggingface/text-generation-inference/assets/3841370/38ba1531-ea0d-4851-b31a-a6d4ddc944b0)
+<a href="https://www.youtube.com/watch?v=jlMAX2Oaht0">
+  <img width=560 width=315 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
+</a>

 # Text Generation Inference

@ -18,116 +20,84 @@ to power Hugging Chat, the Inference API and Inference Endpoint.

 ## Table of contents

- [Features](#features)
- [Optimized Architectures](#optimized-architectures)
 - [Get Started](#get-started)
-  - [Docker](#docker)
  - [API Documentation](#api-documentation)
  - [Using a private or gated model](#using-a-private-or-gated-model)
  - [A note on Shared Memory](#a-note-on-shared-memory-shm)
  - [Distributed Tracing](#distributed-tracing)
  - [Local Install](#local-install)
  - [CUDA Kernels](#cuda-kernels)
- [Run Falcon](#run-falcon)
+- [Optimized architectures](#optimized-architectures)
+- [Run Mistral](#run-a-model)
  - [Run](#run)
  - [Quantization](#quantization)
 - [Develop](#develop)
 - [Testing](#testing)
- [Other supported hardware](#other-supported-hardware)

-## Features
+Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and [more](https://huggingface.co/docs/text-generation-inference/supported_models). TGI implements many features, such as:

- Serve the most popular Large Language Models with a simple launcher
+- Simple launcher to serve most popular LLMs
+- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)
 - Tensor Parallelism for faster inference on multiple GPUs
 - Token streaming using Server-Sent Events (SSE)
- [Continuous batching of incoming requests](https://github.com/huggingface/text-generation-inference/tree/main/router) for increased total throughput
- Optimized transformers code for inference using [flash-attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323)
+- Continuous batching of incoming requests for increased total throughput
+- Optimized transformers code for inference using [Flash Attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
+- Quantization with :
+  - [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
+  - [GPT-Q](https://arxiv.org/abs/2210.17323)
+  - [EETQ](https://github.com/NetEase-FuXi/EETQ)
+  - [AWQ](https://github.com/casper-hansen/AutoAWQ)
 - [Safetensors](https://github.com/huggingface/safetensors) weight loading
 - Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
 - Logits warper (temperature scaling, top-p, top-k, repetition penalty, more details see [transformers.LogitsProcessor](https://huggingface.co/docs/transformers/internal/generation_utils#transformers.LogitsProcessor))
 - Stop sequences
 - Log probabilities
- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)
- Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output.
- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance.
+- [Speculation](https://huggingface.co/docs/text-generation-inference/conceptual/speculation) ~2x latency
+- [Guidance/JSON](https://huggingface.co/docs/text-generation-inference/conceptual/guidance). Specify output format to speed up inference and make sure the output is valid according to some specs..
+- Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output
+- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance
+
+### Hardware support
+
+- [Nvidia](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference)
+- [AMD](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference) (-rocm)
+- [Inferentia](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference)
+- [Intel GPU](https://github.com/huggingface/text-generation-inference/pull/1475)
+- [Gaudi](https://github.com/huggingface/tgi-gaudi)
+- [Google TPU](https://huggingface.co/docs/optimum-tpu/howto/serving)


-## Optimized architectures
-
- [BLOOM](https://huggingface.co/bigscience/bloom)
- [FLAN-T5](https://huggingface.co/google/flan-t5-xxl)
- [Galactica](https://huggingface.co/facebook/galactica-120b)
- [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
- [Llama](https://github.com/facebookresearch/llama)
- [OPT](https://huggingface.co/facebook/opt-66b)
- [SantaCoder](https://huggingface.co/bigcode/santacoder)
- [Starcoder](https://huggingface.co/bigcode/starcoder)
- [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b)
- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
- [MPT](https://huggingface.co/mosaicml/mpt-30b)
- [Llama V2](https://huggingface.co/meta-llama)
-
-Other architectures are supported on a best effort basis using:
-
-`AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
-
-or
-
-`AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`
-
-## Get started
+## Get Started

 ### Docker

-The easiest way of getting started is using the official Docker container:
+For a detailed starting guide, please see the [Quick Tour](https://huggingface.co/docs/text-generation-inference/quicktour). The easiest way of getting started is using the official Docker container:

 ```shell
-model=tiiuae/falcon-7b-instruct
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+model=HuggingFaceH4/zephyr-7b-beta
+# share a volume with the Docker container to avoid downloading weights every run
+volume=$PWD/data

-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
-```
-**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
-
-To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
-```
-text-generation-launcher --help
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.1.1 --model-id $model
 ```

-You can then query the model using either the `/generate` or `/generate_stream` routes:
+And then you can make requests like

-```shell
-curl 127.0.0.1:8080/generate \
-    -X POST \
-    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-    -H 'Content-Type: application/json'
-```
-
-```shell
+```bash
 curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
 ```

-or from Python:
+**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.

-```shell
-pip install text-generation
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.1.1-rocm --model-id $model` instead of the command above.
+
+To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
-
-```python
-from text_generation import Client
-
-client = Client("http://127.0.0.1:8080")
-print(client.generate("What is Deep Learning?", max_new_tokens=20).generated_text)
-
-text = ""
-for response in client.generate_stream("What is Deep Learning?", max_new_tokens=20):
-    if not response.token.special:
-        text += response.token.text
-print(text)
+text-generation-launcher --help
 ```

 ### API documentation
@ -137,14 +107,14 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat

 ### Using a private or gated model

-You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
+You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
 `text-generation-inference`. This allows you to gain access to protected resources.

 For example, if you want to serve the gated Llama V2 model variants:

 1. Go to https://huggingface.co/settings/tokens
 2. Copy your cli READ token
-3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
+3. Export `HF_TOKEN=<your cli READ token>`

 or with Docker:

@ -153,7 +123,7 @@ model=meta-llama/Llama-2-7b-chat-hf
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>

-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
 ```

 ### A note on Shared Memory (shm)
@ -185,7 +155,12 @@ this will impact performance.
 ### Distributed Tracing

 `text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
-by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
+by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
+overridden with the `--otlp-service-name` argument
+
+### Architecture
+
+![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)

 ### Local install

@ -197,7 +172,7 @@ Python 3.9, e.g. using `conda`:
 ```shell
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

-conda create -n text-generation-inference python=3.9
+conda create -n text-generation-inference python=3.11
 conda activate text-generation-inference
 ```

@ -223,7 +198,7 @@ Then run:

 ```shell
 BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork with CUDA kernels
-make run-falcon-7b-instruct
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
 ```

 **Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
@ -232,19 +207,26 @@ make run-falcon-7b-instruct
 sudo apt-get install libssl-dev gcc -y
 ```

-### CUDA Kernels
+## Optimized architectures

-The custom CUDA kernels are only tested on NVIDIA A100s. If you have any installation or runtime issues, you can remove
-the kernels by using the `DISABLE_CUSTOM_KERNELS=True` environment variable.
+TGI works out of the box to serve optimized models for all modern models. They can be found in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).

-Be aware that the official Docker image has them enabled by default.
+Other architectures are supported on a best-effort basis using:

-## Run Falcon
+`AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
+
+or
+
+`AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`
+
+
+
+## Run locally

 ### Run

 ```shell
-make run-falcon-7b-instruct
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
 ```

 ### Quantization
@ -252,7 +234,7 @@ make run-falcon-7b-instruct
 You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:

 ```shell
-make run-falcon-7b-instruct-quantize
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
 ```

 4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.
@ -277,10 +259,3 @@ make rust-tests
 # integration tests
 make integration-tests
 ```
-
-
-## Other supported hardware
-
-TGI is also supported on the following AI hardware accelerators:
- *Habana first-gen Gaudi and Gaudi2:* checkout [here](https://github.com/huggingface/optimum-habana/tree/main/text-generation-inference) how to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
-
--- a/assets/architecture.jpg
+++ b/assets/architecture.jpg
--- a/assets/architecture.png
+++ b/assets/architecture.png
--- a/assets/tgi_grafana.json
+++ b/assets/tgi_grafana.json
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@ -14,18 +14,18 @@ name = "text-generation-benchmark"
 path = "src/main.rs"

 [dependencies]
-average = "0.13"
-clap = { version = "4.1.4", features = ["derive", "env"] }
-crossterm = "0.26"
+average = "0.14"
+clap = { version = "4.4.5", features = ["derive", "env"] }
+crossterm = "0.27"
 float-ord = "0.3.2"
-serde = {version = "1.0.142", features = ["derive"]}
+serde = {version = "1.0.188", features = ["derive"]}
 serde_json = "1.0"
-tabled = "0.12.0"
+tabled = "0.14.0"
 text-generation-client = { path = "../router/client" }
-thiserror = "1.0.38"
-tokenizers = "0.13.3"
-tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
-tui = {package = "ratatui", version = "0.20", default-features = false, features = ["crossterm"]}
+thiserror = "1.0.48"
+tokenizers = { workspace = true }
+tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
+tui = {package = "ratatui", version = "0.23", default-features = false, features = ["crossterm"]}
 tracing = "0.1.37"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
-
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
+hf-hub = { workspace = true }
--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -6,12 +6,12 @@

 </div>

-A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha) 
+A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha)
 and powered by [tui](https://github.com/tui-rs-revival/ratatui).

-## Install 
+## Install

-```shell 
+```shell
 make install-benchmark
 ```

@ -27,4 +27,4 @@ Then run the benchmarking tool:

 ```shell
 text-generation-benchmark --tokenizer-name bigscience/bloom-560m
-```
+```
--- a/benchmark/src/app.rs
+++ b/benchmark/src/app.rs
@ -6,7 +6,7 @@ use tokio::sync::mpsc;
 use tui::backend::Backend;
 use tui::layout::{Alignment, Constraint, Direction, Layout};
 use tui::style::{Color, Modifier, Style};
-use tui::text::{Span, Spans};
+use tui::text::{Line, Span};
 use tui::widgets::{
    Axis, BarChart, Block, Borders, Chart, Dataset, Gauge, GraphType, Paragraph, Tabs,
 };
@ -244,7 +244,7 @@ impl App {
            .batch_size
            .iter()
            .map(|b| {
-                Spans::from(vec![Span::styled(
+                Line::from(vec![Span::styled(
                    format!("Batch: {b}"),
                    Style::default().fg(Color::White),
                )])
@ -444,7 +444,7 @@ fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Ga
 }

 /// Throughput paragraph
-fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragraph<'a> {
+fn throughput_paragraph<'a>(throughput: &[f64], name: &'static str) -> Paragraph<'a> {
    // Throughput average/high/low texts
    let throughput_texts = statis_spans(throughput, "tokens/secs");

@ -457,7 +457,7 @@ fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragr
 }

 /// Latency paragraph
-fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
+fn latency_paragraph<'a>(latency: &mut [f64], name: &'static str) -> Paragraph<'a> {
    // Latency average/high/low texts
    let mut latency_texts = statis_spans(latency, "ms");

@ -466,9 +466,9 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
    let latency_percentiles = crate::utils::percentiles(latency, &[50, 90, 99]);

    // Latency p50/p90/p99 texts
-    let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
+    let colors = [Color::LightGreen, Color::LightYellow, Color::LightRed];
    for (i, (name, value)) in latency_percentiles.iter().enumerate() {
-        let span = Spans::from(vec![Span::styled(
+        let span = Line::from(vec![Span::styled(
            format!("{name}:     {value:.2} ms"),
            Style::default().fg(colors[i]),
        )]);
@ -483,30 +483,30 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
 }

 /// Average/High/Low spans
-fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
+fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
    vec![
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
            format!(
                "Average: {:.2} {unit}",
                data.iter().sum::<f64>() / data.len() as f64
            ),
            Style::default().fg(Color::LightBlue),
        )]),
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
            format!(
                "Lowest:  {:.2} {unit}",
                data.iter()
                    .min_by(|a, b| a.total_cmp(b))
-                    .unwrap_or(&std::f64::NAN)
+                    .unwrap_or(&f64::NAN)
            ),
            Style::default().fg(Color::Reset),
        )]),
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
            format!(
                "Highest: {:.2} {unit}",
                data.iter()
                    .max_by(|a, b| a.total_cmp(b))
-                    .unwrap_or(&std::f64::NAN)
+                    .unwrap_or(&f64::NAN)
            ),
            Style::default().fg(Color::Reset),
        )]),
@ -543,7 +543,7 @@ fn latency_histogram<'a>(

 /// Latency/Throughput chart
 fn latency_throughput_chart<'a>(
-    latency_throughput: &'a Vec<(f64, f64)>,
+    latency_throughput: &'a [(f64, f64)],
    batch_sizes: &'a [u32],
    zoom: bool,
    name: &'static str,
@ -555,17 +555,17 @@ fn latency_throughput_chart<'a>(
    let min_latency: f64 = *latency_iter
        .clone()
        .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    let max_latency: f64 = *latency_iter
        .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    let min_throughput: f64 = *throughput_iter
        .clone()
        .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    let max_throughput: f64 = *throughput_iter
        .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);

    // Char min max values
    let min_x = if zoom {
--- a/benchmark/src/event.rs
+++ b/benchmark/src/event.rs
@ -11,7 +11,7 @@ pub(crate) enum Event {
    /// Key press.
    Key(event::KeyEvent),
    /// Terminal resize.
-    Resize(u16, u16),
+    Resize,
 }

 pub(crate) async fn terminal_event_task(
@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {
        if event::poll(Duration::from_secs(0)).expect("no events available") {
            match event::read().expect("unable to read event") {
                event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()),
-                event::Event::Resize(w, h) => {
-                    event_sender.send(Event::Resize(w, h)).await.unwrap_or(())
+                event::Event::Resize(_w, _h) => {
+                    event_sender.send(Event::Resize).await.unwrap_or(())
                }
                _ => (),
            }
--- a/benchmark/src/generation.rs
+++ b/benchmark/src/generation.rs
@ -1,8 +1,9 @@
 use std::time::{Duration, Instant};
-use text_generation_client::{
-    Batch, CachedBatch, ClientError, NextTokenChooserParameters, Request, ShardedClient,
+use text_generation_client::v3::{
+    Batch, CachedBatch, NextTokenChooserParameters, Request, ShardedClient,
    StoppingCriteriaParameters,
 };
+use text_generation_client::{Chunk, ClientError, Input};
 use tokenizers::{Tokenizer, TruncationDirection};
 use tokio::sync::{broadcast, mpsc};

@ -142,6 +143,9 @@ async fn prefill(
        .map(|id| Request {
            id: id.into(),
            prefill_logprobs: false,
+            input_chunks: Some(Input {
+                chunks: vec![Chunk::Text(sequence.clone()).into()],
+            }),
            inputs: sequence.clone(),
            truncate: sequence_length,
            parameters: Some(parameters.clone()),
@ -151,6 +155,9 @@ async fn prefill(
                ignore_eos_token: true, // Will not stop even if a eos token is generated
            }),
            top_n_tokens: top_n_tokens.unwrap_or(0),
+            blocks: vec![],
+            slots: vec![],
+            adapter_id: None,
        })
        .collect();

@ -159,11 +166,12 @@ async fn prefill(
        requests,
        size: batch_size,
        max_tokens: batch_size * (sequence_length + decode_length),
+        max_blocks: 0,
    };

    // Run prefill
    let start_time = Instant::now();
-    let (_, decode_batch) = client.prefill(batch.clone()).await?;
+    let (_, decode_batch, _) = client.prefill(batch.clone()).await?;

    // Get latency
    let latency = start_time.elapsed();
--- a/benchmark/src/lib.rs
+++ b/benchmark/src/lib.rs
@ -8,7 +8,7 @@ use crate::app::App;
 use crate::event::Event;
 use crossterm::ExecutableCommand;
 use std::io;
-use text_generation_client::{NextTokenChooserParameters, ShardedClient};
+use text_generation_client::v3::{GrammarType, NextTokenChooserParameters, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::sync::{broadcast, mpsc};
 use tui::backend::CrosstermBackend;
@ -30,10 +30,11 @@ pub async fn run(
    top_p: Option<f32>,
    typical_p: Option<f32>,
    repetition_penalty: Option<f32>,
+    frequency_penalty: Option<f32>,
    watermark: bool,
    do_sample: bool,
    client: ShardedClient,
-) -> Result<(), crossterm::ErrorKind> {
+) -> Result<(), std::io::Error> {
    let parameters = NextTokenChooserParameters {
        temperature: temperature.unwrap_or(1.0),
        top_k: top_k.unwrap_or(0),
@ -42,7 +43,10 @@ pub async fn run(
        do_sample,
        seed: 0,
        repetition_penalty: repetition_penalty.unwrap_or(1.0),
+        frequency_penalty: frequency_penalty.unwrap_or(0.0),
        watermark,
+        grammar: String::new(),
+        grammar_type: GrammarType::None as i32,
    };

    // Initialize terminal properties
@ -140,6 +144,7 @@ pub async fn run(
        top_p,
        typical_p,
        repetition_penalty,
+        frequency_penalty,
        watermark,
        do_sample,
    );
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@ -4,7 +4,7 @@
 /// and: https://github.com/orhun/rust-tui-template
 use clap::Parser;
 use std::path::Path;
-use text_generation_client::ShardedClient;
+use text_generation_client::v3::ShardedClient;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
@ -84,6 +84,11 @@ struct Args {
    #[clap(long, env)]
    repetition_penalty: Option<f32>,

+    /// Generation parameter in case you want to specifically test/debug particular
+    /// decoding strategies, for full doc refer to the `text-generation-server`
+    #[clap(long, env)]
+    frequency_penalty: Option<f32>,
+
    /// Generation parameter in case you want to specifically test/debug particular
    /// decoding strategies, for full doc refer to the `text-generation-server`
    #[clap(long, env)]
@ -119,6 +124,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
        top_p,
        typical_p,
        repetition_penalty,
+        frequency_penalty,
        watermark,
        do_sample,
        master_shard_uds_path,
@ -141,7 +147,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
            tracing::info!("Downloading tokenizer");

            // Parse Huggingface hub token
-            let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+            let auth_token = std::env::var("HF_TOKEN")
+                .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+                .ok();

            // Download and instantiate tokenizer
            // We need to download it outside of the Tokio runtime
@ -187,6 +195,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                top_p,
                typical_p,
                repetition_penalty,
+                frequency_penalty,
                watermark,
                do_sample,
                sharded_client,
--- a/benchmark/src/table.rs
+++ b/benchmark/src/table.rs
@ -15,6 +15,7 @@ pub(crate) fn parameters_table(
    top_p: Option<f32>,
    typical_p: Option<f32>,
    repetition_penalty: Option<f32>,
+    frequency_penalty: Option<f32>,
    watermark: bool,
    do_sample: bool,
 ) -> Table {
@ -33,6 +34,7 @@ pub(crate) fn parameters_table(
    builder.push_record(["Top P", &format!("{top_p:?}")]);
    builder.push_record(["Typical P", &format!("{typical_p:?}")]);
    builder.push_record(["Repetition Penalty", &format!("{repetition_penalty:?}")]);
+    builder.push_record(["Frequency Penalty", &format!("{frequency_penalty:?}")]);
    builder.push_record(["Watermark", &watermark.to_string()]);
    builder.push_record(["Do Sample", &do_sample.to_string()]);

@ -149,22 +151,22 @@ fn add_throuhgputs(
    }
 }

-fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) {
+fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
    let average = data.iter().sum::<f64>() / data.len() as f64;
    let min = data
        .iter()
        .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    let max = data
        .iter()
        .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    (average, *min, *max)
 }

-fn px(data: &Vec<f64>, p: u32) -> f64 {
+fn px(data: &[f64], p: u32) -> f64 {
    let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
-    *data.get(i).unwrap_or(&std::f64::NAN)
+    *data.get(i).unwrap_or(&f64::NAN)
 }

 fn format_value(value: f64, unit: &'static str) -> String {
--- a/benchmark/src/utils.rs
+++ b/benchmark/src/utils.rs
@ -37,7 +37,7 @@ pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<String, f
        .iter()
        .map(|&p| {
            let i = (f64::from(p) / 100.0 * values.len() as f64) as usize;
-            (format!("p{p}"), *values.get(i).unwrap_or(&std::f64::NAN))
+            (format!("p{p}"), *values.get(i).unwrap_or(&f64::NAN))
        })
        .collect()
 }
--- a/clients/python/.gitignore
+++ b/clients/python/.gitignore
@ -155,4 +155,4 @@ dmypy.json
 cython_debug/

 transformers
-safetensors
+safetensors
--- a/clients/python/Makefile
+++ b/clients/python/Makefile
@ -3,4 +3,4 @@ unit-tests:

 install:
 	pip install pip --upgrade
-	pip install -e .
+	pip install -e .
--- a/clients/python/README.md
+++ b/clients/python/README.md
@ -107,7 +107,19 @@ print(text)
 ### Types

 ```python
-# Request Parameters
+# enum for grammar type
+class GrammarType(Enum):
+    Json = "json"
+    Regex = "regex"
+
+
+# Grammar type and value
+class Grammar:
+    # Grammar type
+    type: GrammarType
+    # Grammar value
+    value: Union[str, dict]
+
 class Parameters:
    # Activate logits sampling
    do_sample: bool
@ -116,6 +128,10 @@ class Parameters:
    # The parameter for repetition penalty. 1.0 means no penalty.
    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    repetition_penalty: Optional[float]
+    # The parameter for frequency penalty. 1.0 means no penalty
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float]
    # Whether to prepend the prompt to the generated text
    return_full_text: bool
    # Stop generating tokens if a member of `stop_sequences` is generated
@ -138,8 +154,22 @@ class Parameters:
    best_of: Optional[int]
    # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
    watermark: bool
+    # Get generation details
+    details: bool
    # Get decoder input token logprobs and ids
    decoder_input_details: bool
+    # Return the N most likely tokens at each step
+    top_n_tokens: Optional[int]
+    # grammar to use for generation
+    grammar: Optional[Grammar]
+
+class Request:
+    # Prompt
+    inputs: str
+    # Generation parameters
+    parameters: Optional[Parameters]
+    # Whether to stream output tokens
+    stream: bool

 # Decoder input tokens
 class InputToken:
@ -159,7 +189,7 @@ class Token:
    # Token text
    text: str
    # Logprob
-    logprob: float
+    logprob: Optional[float]
    # Is the token a special token
    # Can be used to ignore tokens when concatenating
    special: bool
@ -189,6 +219,8 @@ class BestOfSequence:
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
+    # Most likely tokens
+    top_tokens: Optional[List[List[Token]]]


 # `generate` details
@ -203,6 +235,8 @@ class Details:
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
+    # Most likely tokens
+    top_tokens: Optional[List[List[Token]]]
    # Additional sequences when using the `best_of` parameter
    best_of_sequences: Optional[List[BestOfSequence]]

@ -229,6 +263,8 @@ class StreamDetails:
 class StreamResponse:
    # Generated token
    token: Token
+    # Most likely tokens
+    top_tokens: Optional[List[Token]]
    # Complete generated text
    # Only available when the generation is finished
    generated_text: Optional[str]
@ -240,4 +276,4 @@ class StreamResponse:
 class DeployedModel:
    model_id: str
    sha: str
-```
+```
--- a/clients/python/poetry.lock
+++ b/clients/python/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@ -124,6 +124,20 @@ files = [
 [package.dependencies]
 frozenlist = ">=1.1.0"

+[[package]]
+name = "annotated-types"
+version = "0.5.0"
+description = "Reusable constraint types to use with typing.Annotated"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "annotated_types-0.5.0-py3-none-any.whl", hash = "sha256:58da39888f92c276ad970249761ebea80ba544b77acddaa1a4d6cf78287d45fd"},
+    {file = "annotated_types-0.5.0.tar.gz", hash = "sha256:47cdc3490d9ac1506ce92c7aaa76c579dc3509ff11e098fc867e5130ab7be802"},
+]
+
+[package.dependencies]
+typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""}
+
 [[package]]
 name = "async-timeout"
 version = "4.0.3"
@ -693,55 +707,140 @@ files = [

 [[package]]
 name = "pydantic"
-version = "1.10.12"
-description = "Data validation and settings management using python type hints"
+version = "2.5.3"
+description = "Data validation using Python type hints"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pydantic-1.10.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1fcb59f2f355ec350073af41d927bf83a63b50e640f4dbaa01053a28b7a7718"},
-    {file = "pydantic-1.10.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7ccf02d7eb340b216ec33e53a3a629856afe1c6e0ef91d84a4e6f2fb2ca70fe"},
-    {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fb2aa3ab3728d950bcc885a2e9eff6c8fc40bc0b7bb434e555c215491bcf48b"},
-    {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:771735dc43cf8383959dc9b90aa281f0b6092321ca98677c5fb6125a6f56d58d"},
-    {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca48477862372ac3770969b9d75f1bf66131d386dba79506c46d75e6b48c1e09"},
-    {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a5e7add47a5b5a40c49b3036d464e3c7802f8ae0d1e66035ea16aa5b7a3923ed"},
-    {file = "pydantic-1.10.12-cp310-cp310-win_amd64.whl", hash = "sha256:e4129b528c6baa99a429f97ce733fff478ec955513630e61b49804b6cf9b224a"},
-    {file = "pydantic-1.10.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0d191db0f92dfcb1dec210ca244fdae5cbe918c6050b342d619c09d31eea0cc"},
-    {file = "pydantic-1.10.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:795e34e6cc065f8f498c89b894a3c6da294a936ee71e644e4bd44de048af1405"},
-    {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69328e15cfda2c392da4e713443c7dbffa1505bc9d566e71e55abe14c97ddc62"},
-    {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2031de0967c279df0d8a1c72b4ffc411ecd06bac607a212892757db7462fc494"},
-    {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ba5b2e6fe6ca2b7e013398bc7d7b170e21cce322d266ffcd57cca313e54fb246"},
-    {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a7bac939fa326db1ab741c9d7f44c565a1d1e80908b3797f7f81a4f86bc8d33"},
-    {file = "pydantic-1.10.12-cp311-cp311-win_amd64.whl", hash = "sha256:87afda5539d5140cb8ba9e8b8c8865cb5b1463924d38490d73d3ccfd80896b3f"},
-    {file = "pydantic-1.10.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:549a8e3d81df0a85226963611950b12d2d334f214436a19537b2efed61b7639a"},
-    {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:598da88dfa127b666852bef6d0d796573a8cf5009ffd62104094a4fe39599565"},
-    {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba5c4a8552bff16c61882db58544116d021d0b31ee7c66958d14cf386a5b5350"},
-    {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c79e6a11a07da7374f46970410b41d5e266f7f38f6a17a9c4823db80dadf4303"},
-    {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab26038b8375581dc832a63c948f261ae0aa21f1d34c1293469f135fa92972a5"},
-    {file = "pydantic-1.10.12-cp37-cp37m-win_amd64.whl", hash = "sha256:e0a16d274b588767602b7646fa05af2782576a6cf1022f4ba74cbb4db66f6ca8"},
-    {file = "pydantic-1.10.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a9dfa722316f4acf4460afdf5d41d5246a80e249c7ff475c43a3a1e9d75cf62"},
-    {file = "pydantic-1.10.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a73f489aebd0c2121ed974054cb2759af8a9f747de120acd2c3394cf84176ccb"},
-    {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b30bcb8cbfccfcf02acb8f1a261143fab622831d9c0989707e0e659f77a18e0"},
-    {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcfb5296d7877af406ba1547dfde9943b1256d8928732267e2653c26938cd9c"},
-    {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2f9a6fab5f82ada41d56b0602606a5506aab165ca54e52bc4545028382ef1c5d"},
-    {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dea7adcc33d5d105896401a1f37d56b47d443a2b2605ff8a969a0ed5543f7e33"},
-    {file = "pydantic-1.10.12-cp38-cp38-win_amd64.whl", hash = "sha256:1eb2085c13bce1612da8537b2d90f549c8cbb05c67e8f22854e201bde5d98a47"},
-    {file = "pydantic-1.10.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef6c96b2baa2100ec91a4b428f80d8f28a3c9e53568219b6c298c1125572ebc6"},
-    {file = "pydantic-1.10.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c076be61cd0177a8433c0adcb03475baf4ee91edf5a4e550161ad57fc90f523"},
-    {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d5a58feb9a39f481eda4d5ca220aa8b9d4f21a41274760b9bc66bfd72595b86"},
-    {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5f805d2d5d0a41633651a73fa4ecdd0b3d7a49de4ec3fadf062fe16501ddbf1"},
-    {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1289c180abd4bd4555bb927c42ee42abc3aee02b0fb2d1223fb7c6e5bef87dbe"},
-    {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d1197e462e0364906cbc19681605cb7c036f2475c899b6f296104ad42b9f5fb"},
-    {file = "pydantic-1.10.12-cp39-cp39-win_amd64.whl", hash = "sha256:fdbdd1d630195689f325c9ef1a12900524dceb503b00a987663ff4f58669b93d"},
-    {file = "pydantic-1.10.12-py3-none-any.whl", hash = "sha256:b749a43aa51e32839c9d71dc67eb1e4221bb04af1033a32e3923d46f9effa942"},
-    {file = "pydantic-1.10.12.tar.gz", hash = "sha256:0fe8a415cea8f340e7a9af9c54fc71a649b43e8ca3cc732986116b3cb135d303"},
+    {file = "pydantic-2.5.3-py3-none-any.whl", hash = "sha256:d0caf5954bee831b6bfe7e338c32b9e30c85dfe080c843680783ac2b631673b4"},
+    {file = "pydantic-2.5.3.tar.gz", hash = "sha256:b3ef57c62535b0941697cce638c08900d87fcb67e29cfa99e8a68f747f393f7a"},
 ]

 [package.dependencies]
-typing-extensions = ">=4.2.0"
+annotated-types = ">=0.4.0"
+importlib-metadata = {version = "*", markers = "python_version == \"3.7\""}
+pydantic-core = "2.14.6"
+typing-extensions = ">=4.6.1"

 [package.extras]
-dotenv = ["python-dotenv (>=0.10.4)"]
-email = ["email-validator (>=1.0.3)"]
+email = ["email-validator (>=2.0.0)"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.14.6"
+description = ""
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pydantic_core-2.14.6-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:72f9a942d739f09cd42fffe5dc759928217649f070056f03c70df14f5770acf9"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6a31d98c0d69776c2576dda4b77b8e0c69ad08e8b539c25c7d0ca0dc19a50d6c"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5aa90562bc079c6c290f0512b21768967f9968e4cfea84ea4ff5af5d917016e4"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:370ffecb5316ed23b667d99ce4debe53ea664b99cc37bfa2af47bc769056d534"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f85f3843bdb1fe80e8c206fe6eed7a1caeae897e496542cee499c374a85c6e08"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9862bf828112e19685b76ca499b379338fd4c5c269d897e218b2ae8fcb80139d"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:036137b5ad0cb0004c75b579445a1efccd072387a36c7f217bb8efd1afbe5245"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:92879bce89f91f4b2416eba4429c7b5ca22c45ef4a499c39f0c5c69257522c7c"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0c08de15d50fa190d577e8591f0329a643eeaed696d7771760295998aca6bc66"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:36099c69f6b14fc2c49d7996cbf4f87ec4f0e66d1c74aa05228583225a07b590"},
+    {file = "pydantic_core-2.14.6-cp310-none-win32.whl", hash = "sha256:7be719e4d2ae6c314f72844ba9d69e38dff342bc360379f7c8537c48e23034b7"},
+    {file = "pydantic_core-2.14.6-cp310-none-win_amd64.whl", hash = "sha256:36fa402dcdc8ea7f1b0ddcf0df4254cc6b2e08f8cd80e7010d4c4ae6e86b2a87"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:dea7fcd62915fb150cdc373212141a30037e11b761fbced340e9db3379b892d4"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffff855100bc066ff2cd3aa4a60bc9534661816b110f0243e59503ec2df38421"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b027c86c66b8627eb90e57aee1f526df77dc6d8b354ec498be9a757d513b92b"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:00b1087dabcee0b0ffd104f9f53d7d3eaddfaa314cdd6726143af6bc713aa27e"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:75ec284328b60a4e91010c1acade0c30584f28a1f345bc8f72fe8b9e46ec6a96"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e1f4744eea1501404b20b0ac059ff7e3f96a97d3e3f48ce27a139e053bb370b"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2602177668f89b38b9f84b7b3435d0a72511ddef45dc14446811759b82235a1"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6c8edaea3089bf908dd27da8f5d9e395c5b4dc092dbcce9b65e7156099b4b937"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:478e9e7b360dfec451daafe286998d4a1eeaecf6d69c427b834ae771cad4b622"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b6ca36c12a5120bad343eef193cc0122928c5c7466121da7c20f41160ba00ba2"},
+    {file = "pydantic_core-2.14.6-cp311-none-win32.whl", hash = "sha256:2b8719037e570639e6b665a4050add43134d80b687288ba3ade18b22bbb29dd2"},
+    {file = "pydantic_core-2.14.6-cp311-none-win_amd64.whl", hash = "sha256:78ee52ecc088c61cce32b2d30a826f929e1708f7b9247dc3b921aec367dc1b23"},
+    {file = "pydantic_core-2.14.6-cp311-none-win_arm64.whl", hash = "sha256:a19b794f8fe6569472ff77602437ec4430f9b2b9ec7a1105cfd2232f9ba355e6"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:667aa2eac9cd0700af1ddb38b7b1ef246d8cf94c85637cbb03d7757ca4c3fdec"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cdee837710ef6b56ebd20245b83799fce40b265b3b406e51e8ccc5b85b9099b7"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c5bcf3414367e29f83fd66f7de64509a8fd2368b1edf4351e862910727d3e51"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:26a92ae76f75d1915806b77cf459811e772d8f71fd1e4339c99750f0e7f6324f"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a983cca5ed1dd9a35e9e42ebf9f278d344603bfcb174ff99a5815f953925140a"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cb92f9061657287eded380d7dc455bbf115430b3aa4741bdc662d02977e7d0af"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4ace1e220b078c8e48e82c081e35002038657e4b37d403ce940fa679e57113b"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef633add81832f4b56d3b4c9408b43d530dfca29e68fb1b797dcb861a2c734cd"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e90d6cc4aad2cc1f5e16ed56e46cebf4877c62403a311af20459c15da76fd91"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e8a5ac97ea521d7bde7621d86c30e86b798cdecd985723c4ed737a2aa9e77d0c"},
+    {file = "pydantic_core-2.14.6-cp312-none-win32.whl", hash = "sha256:f27207e8ca3e5e021e2402ba942e5b4c629718e665c81b8b306f3c8b1ddbb786"},
+    {file = "pydantic_core-2.14.6-cp312-none-win_amd64.whl", hash = "sha256:b3e5fe4538001bb82e2295b8d2a39356a84694c97cb73a566dc36328b9f83b40"},
+    {file = "pydantic_core-2.14.6-cp312-none-win_arm64.whl", hash = "sha256:64634ccf9d671c6be242a664a33c4acf12882670b09b3f163cd00a24cffbd74e"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:24368e31be2c88bd69340fbfe741b405302993242ccb476c5c3ff48aeee1afe0"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e33b0834f1cf779aa839975f9d8755a7c2420510c0fa1e9fa0497de77cd35d2c"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6af4b3f52cc65f8a0bc8b1cd9676f8c21ef3e9132f21fed250f6958bd7223bed"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d15687d7d7f40333bd8266f3814c591c2e2cd263fa2116e314f60d82086e353a"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:095b707bb287bfd534044166ab767bec70a9bba3175dcdc3371782175c14e43c"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94fc0e6621e07d1e91c44e016cc0b189b48db053061cc22d6298a611de8071bb"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ce830e480f6774608dedfd4a90c42aac4a7af0a711f1b52f807130c2e434c06"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a306cdd2ad3a7d795d8e617a58c3a2ed0f76c8496fb7621b6cd514eb1532cae8"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:2f5fa187bde8524b1e37ba894db13aadd64faa884657473b03a019f625cee9a8"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:438027a975cc213a47c5d70672e0d29776082155cfae540c4e225716586be75e"},
+    {file = "pydantic_core-2.14.6-cp37-none-win32.whl", hash = "sha256:f96ae96a060a8072ceff4cfde89d261837b4294a4f28b84a28765470d502ccc6"},
+    {file = "pydantic_core-2.14.6-cp37-none-win_amd64.whl", hash = "sha256:e646c0e282e960345314f42f2cea5e0b5f56938c093541ea6dbf11aec2862391"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:db453f2da3f59a348f514cfbfeb042393b68720787bbef2b4c6068ea362c8149"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3860c62057acd95cc84044e758e47b18dcd8871a328ebc8ccdefd18b0d26a21b"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36026d8f99c58d7044413e1b819a67ca0e0b8ebe0f25e775e6c3d1fabb3c38fb"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8ed1af8692bd8d2a29d702f1a2e6065416d76897d726e45a1775b1444f5928a7"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:314ccc4264ce7d854941231cf71b592e30d8d368a71e50197c905874feacc8a8"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:982487f8931067a32e72d40ab6b47b1628a9c5d344be7f1a4e668fb462d2da42"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dbe357bc4ddda078f79d2a36fc1dd0494a7f2fad83a0a684465b6f24b46fe80"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2f6ffc6701a0eb28648c845f4945a194dc7ab3c651f535b81793251e1185ac3d"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7f5025db12fc6de7bc1104d826d5aee1d172f9ba6ca936bf6474c2148ac336c1"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dab03ed811ed1c71d700ed08bde8431cf429bbe59e423394f0f4055f1ca0ea60"},
+    {file = "pydantic_core-2.14.6-cp38-none-win32.whl", hash = "sha256:dfcbebdb3c4b6f739a91769aea5ed615023f3c88cb70df812849aef634c25fbe"},
+    {file = "pydantic_core-2.14.6-cp38-none-win_amd64.whl", hash = "sha256:99b14dbea2fdb563d8b5a57c9badfcd72083f6006caf8e126b491519c7d64ca8"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:4ce8299b481bcb68e5c82002b96e411796b844d72b3e92a3fbedfe8e19813eab"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b9a9d92f10772d2a181b5ca339dee066ab7d1c9a34ae2421b2a52556e719756f"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd9e98b408384989ea4ab60206b8e100d8687da18b5c813c11e92fd8212a98e0"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4f86f1f318e56f5cbb282fe61eb84767aee743ebe32c7c0834690ebea50c0a6b"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86ce5fcfc3accf3a07a729779d0b86c5d0309a4764c897d86c11089be61da160"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dcf1978be02153c6a31692d4fbcc2a3f1db9da36039ead23173bc256ee3b91b"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eedf97be7bc3dbc8addcef4142f4b4164066df0c6f36397ae4aaed3eb187d8ab"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5f916acf8afbcab6bacbb376ba7dc61f845367901ecd5e328fc4d4aef2fcab0"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8a14c192c1d724c3acbfb3f10a958c55a2638391319ce8078cb36c02283959b9"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0348b1dc6b76041516e8a854ff95b21c55f5a411c3297d2ca52f5528e49d8411"},
+    {file = "pydantic_core-2.14.6-cp39-none-win32.whl", hash = "sha256:de2a0645a923ba57c5527497daf8ec5df69c6eadf869e9cd46e86349146e5975"},
+    {file = "pydantic_core-2.14.6-cp39-none-win_amd64.whl", hash = "sha256:aca48506a9c20f68ee61c87f2008f81f8ee99f8d7f0104bff3c47e2d148f89d9"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d5c28525c19f5bb1e09511669bb57353d22b94cf8b65f3a8d141c389a55dec95"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:78d0768ee59baa3de0f4adac9e3748b4b1fffc52143caebddfd5ea2961595277"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b93785eadaef932e4fe9c6e12ba67beb1b3f1e5495631419c784ab87e975670"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a874f21f87c485310944b2b2734cd6d318765bcbb7515eead33af9641816506e"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89f4477d915ea43b4ceea6756f63f0288941b6443a2b28c69004fe07fde0d0d"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:172de779e2a153d36ee690dbc49c6db568d7b33b18dc56b69a7514aecbcf380d"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:dfcebb950aa7e667ec226a442722134539e77c575f6cfaa423f24371bb8d2e94"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:55a23dcd98c858c0db44fc5c04fc7ed81c4b4d33c653a7c45ddaebf6563a2f66"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:4241204e4b36ab5ae466ecec5c4c16527a054c69f99bba20f6f75232a6a534e2"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e574de99d735b3fc8364cba9912c2bec2da78775eba95cbb225ef7dda6acea24"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1302a54f87b5cd8528e4d6d1bf2133b6aa7c6122ff8e9dc5220fbc1e07bffebd"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8e81e4b55930e5ffab4a68db1af431629cf2e4066dbdbfef65348b8ab804ea8"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c99462ffc538717b3e60151dfaf91125f637e801f5ab008f81c402f1dff0cd0f"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e4cf2d5829f6963a5483ec01578ee76d329eb5caf330ecd05b3edd697e7d768a"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:cf10b7d58ae4a1f07fccbf4a0a956d705356fea05fb4c70608bb6fa81d103cda"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:399ac0891c284fa8eb998bcfa323f2234858f5d2efca3950ae58c8f88830f145"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c6a5c79b28003543db3ba67d1df336f253a87d3112dac3a51b94f7d48e4c0e1"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:599c87d79cab2a6a2a9df4aefe0455e61e7d2aeede2f8577c1b7c0aec643ee8e"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43e166ad47ba900f2542a80d83f9fc65fe99eb63ceec4debec160ae729824052"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3a0b5db001b98e1c649dd55afa928e75aa4087e587b9524a4992316fa23c9fba"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:747265448cb57a9f37572a488a57d873fd96bf51e5bb7edb52cfb37124516da4"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:7ebe3416785f65c28f4f9441e916bfc8a54179c8dea73c23023f7086fa601c5d"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:86c963186ca5e50d5c8287b1d1c9d3f8f024cbe343d048c5bd282aec2d8641f2"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:e0641b506486f0b4cd1500a2a65740243e8670a2549bb02bc4556a83af84ae03"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71d72ca5eaaa8d38c8df16b7deb1a2da4f650c41b58bb142f3fb75d5ad4a611f"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27e524624eace5c59af499cd97dc18bb201dc6a7a2da24bfc66ef151c69a5f2a"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3dde6cac75e0b0902778978d3b1646ca9f438654395a362cb21d9ad34b24acf"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:00646784f6cd993b1e1c0e7b0fdcbccc375d539db95555477771c27555e3c556"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:23598acb8ccaa3d1d875ef3b35cb6376535095e9405d91a3d57a8c7db5d29341"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7f41533d7e3cf9520065f610b41ac1c76bc2161415955fbcead4981b22c7611e"},
+    {file = "pydantic_core-2.14.6.tar.gz", hash = "sha256:1fd0c1d395372843fba13a51c28e3bb9d59bd7aebfeb17358ffaaa1e4dbbe948"},
+]
+
+[package.dependencies]
+typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"

 [[package]]
 name = "pytest"
@ -816,6 +915,7 @@ files = [
    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
    {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+    {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
    {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
    {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
    {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
@ -823,8 +923,16 @@ files = [
    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
    {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+    {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
    {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
    {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
    {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
@ -841,6 +949,7 @@ files = [
    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
    {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+    {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
    {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
    {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
    {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
@ -848,6 +957,7 @@ files = [
    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
    {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
    {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
    {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
    {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
@ -929,13 +1039,13 @@ files = [

 [[package]]
 name = "urllib3"
-version = "2.0.4"
+version = "2.0.5"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "urllib3-2.0.4-py3-none-any.whl", hash = "sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4"},
-    {file = "urllib3-2.0.4.tar.gz", hash = "sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11"},
+    {file = "urllib3-2.0.5-py3-none-any.whl", hash = "sha256:ef16afa8ba34a1f989db38e1dbbe0c302e4289a47856990d0682e374563ce35e"},
+    {file = "urllib3-2.0.5.tar.gz", hash = "sha256:13abf37382ea2ce6fb744d4dad67838eec857c9f4f57009891805e0b5e123594"},
 ]

 [package.extras]
@ -1050,4 +1160,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.7"
-content-hash = "0db2f97d52c557dd7f90c55b4ad5bbe308c957c5f7f99fec53c57e0a13822cb4"
+content-hash = "b7fab8703967f2616ea59a98a437cd30f97f0c8d2a06e399d688814a2a2c64f8"
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation"
-version = "0.6.0"
+version = "0.7.0"
 description = "Hugging Face Text Generation Python Client"
 license = "Apache-2.0"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
@ -12,7 +12,7 @@ repository = "https://github.com/huggingface/text-generation-inference"

 [tool.poetry.dependencies]
 python = "^3.7"
-pydantic = "^1.10"
+pydantic = "> 2, < 3"
 aiohttp = "^3.8"
 huggingface-hub = ">= 0.12, < 1.0"

--- a/clients/python/tests/conftest.py
+++ b/clients/python/tests/conftest.py
@ -9,6 +9,11 @@ def flan_t5_xxl():
    return "google/flan-t5-xxl"


+@pytest.fixture
+def llama_7b():
+    return "meta-llama/Llama-2-7b-chat-hf"
+
+
@pytest.fixture
 def fake_model():
    return "fake/model"
@ -34,6 +39,11 @@ def flan_t5_xxl_url(base_url, flan_t5_xxl):
    return f"{base_url}/{flan_t5_xxl}"


+@pytest.fixture
+def llama_7b_url(base_url, llama_7b):
+    return f"{base_url}/{llama_7b}"
+
+
@pytest.fixture
 def fake_url(base_url, fake_model):
    return f"{base_url}/{fake_model}"
--- a/clients/python/tests/test_client.py
+++ b/clients/python/tests/test_client.py
@ -5,24 +5,24 @@ from text_generation.errors import NotFoundError, ValidationError
 from text_generation.types import FinishReason, InputToken


-def test_generate(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
    response = client.generate("test", max_new_tokens=1, decoder_input_details=True)

-    assert response.generated_text == ""
+    assert response.generated_text == "_"
    assert response.details.finish_reason == FinishReason.Length
    assert response.details.generated_tokens == 1
    assert response.details.seed is None
-    assert len(response.details.prefill) == 1
-    assert response.details.prefill[0] == InputToken(id=0, text="<pad>", logprob=None)
+    assert len(response.details.prefill) == 2
+    assert response.details.prefill[0] == InputToken(id=1, text="<s>", logprob=None)
    assert len(response.details.tokens) == 1
-    assert response.details.tokens[0].id == 3
-    assert response.details.tokens[0].text == " "
+    assert response.details.tokens[0].id == 29918
+    assert response.details.tokens[0].text == "_"
    assert not response.details.tokens[0].special


-def test_generate_best_of(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_best_of(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
    response = client.generate(
        "test", max_new_tokens=1, best_of=2, do_sample=True, decoder_input_details=True
    )
@ -39,14 +39,14 @@ def test_generate_not_found(fake_url, hf_headers):
        client.generate("test")


-def test_generate_validation_error(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_validation_error(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
    with pytest.raises(ValidationError):
        client.generate("test", max_new_tokens=10_000)


-def test_generate_stream(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_stream(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
    responses = [
        response for response in client.generate_stream("test", max_new_tokens=1)
    ]
@ -54,7 +54,7 @@ def test_generate_stream(flan_t5_xxl_url, hf_headers):
    assert len(responses) == 1
    response = responses[0]

-    assert response.generated_text == ""
+    assert response.generated_text == "_"
    assert response.details.finish_reason == FinishReason.Length
    assert response.details.generated_tokens == 1
    assert response.details.seed is None
@ -66,34 +66,37 @@ def test_generate_stream_not_found(fake_url, hf_headers):
        list(client.generate_stream("test"))


-def test_generate_stream_validation_error(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_stream_validation_error(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
    with pytest.raises(ValidationError):
        list(client.generate_stream("test", max_new_tokens=10_000))


@pytest.mark.asyncio
-async def test_generate_async(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
    response = await client.generate(
        "test", max_new_tokens=1, decoder_input_details=True
    )

-    assert response.generated_text == ""
+    assert response.generated_text == "_"
    assert response.details.finish_reason == FinishReason.Length
    assert response.details.generated_tokens == 1
    assert response.details.seed is None
-    assert len(response.details.prefill) == 1
-    assert response.details.prefill[0] == InputToken(id=0, text="<pad>", logprob=None)
+    assert len(response.details.prefill) == 2
+    assert response.details.prefill[0] == InputToken(id=1, text="<s>", logprob=None)
+    assert response.details.prefill[1] == InputToken(
+        id=1243, text="test", logprob=-10.96875
+    )
    assert len(response.details.tokens) == 1
-    assert response.details.tokens[0].id == 3
-    assert response.details.tokens[0].text == " "
+    assert response.details.tokens[0].id == 29918
+    assert response.details.tokens[0].text == "_"
    assert not response.details.tokens[0].special


@pytest.mark.asyncio
-async def test_generate_async_best_of(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async_best_of(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
    response = await client.generate(
        "test", max_new_tokens=1, best_of=2, do_sample=True, decoder_input_details=True
    )
@ -112,15 +115,15 @@ async def test_generate_async_not_found(fake_url, hf_headers):


@pytest.mark.asyncio
-async def test_generate_async_validation_error(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async_validation_error(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
    with pytest.raises(ValidationError):
        await client.generate("test", max_new_tokens=10_000)


@pytest.mark.asyncio
-async def test_generate_stream_async(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_stream_async(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
    responses = [
        response async for response in client.generate_stream("test", max_new_tokens=1)
    ]
@ -128,7 +131,7 @@ async def test_generate_stream_async(flan_t5_xxl_url, hf_headers):
    assert len(responses) == 1
    response = responses[0]

-    assert response.generated_text == ""
+    assert response.generated_text == "_"
    assert response.details.finish_reason == FinishReason.Length
    assert response.details.generated_tokens == 1
    assert response.details.seed is None
@ -143,8 +146,8 @@ async def test_generate_stream_async_not_found(fake_url, hf_headers):


@pytest.mark.asyncio
-async def test_generate_stream_async_validation_error(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_stream_async_validation_error(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
    with pytest.raises(ValidationError):
        async for _ in client.generate_stream("test", max_new_tokens=10_000):
            pass
--- a/clients/python/text_generation/init.py
+++ b/clients/python/text_generation/init.py
@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "0.3.0"
+__version__ = "0.7.0"
+
+DEPRECATION_WARNING = (
+    "`text_generation` clients are deprecated and will be removed in the near future. "
+    "Please use the `InferenceClient` from the `huggingface_hub` package instead."
+)

 from text_generation.client import Client, AsyncClient
 from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient
--- a/clients/python/text_generation/client.py
+++ b/clients/python/text_generation/client.py
@ -1,18 +1,32 @@
 import json
 import requests
+import warnings

 from aiohttp import ClientSession, ClientTimeout
 from pydantic import ValidationError
-from typing import Dict, Optional, List, AsyncIterator, Iterator
+from typing import Dict, Optional, List, AsyncIterator, Iterator, Union

+from text_generation import DEPRECATION_WARNING
 from text_generation.types import (
    StreamResponse,
    Response,
    Request,
    Parameters,
+    Grammar,
+    CompletionRequest,
+    Completion,
+    CompletionComplete,
+    ChatRequest,
+    ChatCompletionChunk,
+    ChatComplete,
+    Message,
+    Tool,
 )
 from text_generation.errors import parse_error

+# emit deprecation warnings
+warnings.simplefilter("always", DeprecationWarning)
+

 class Client:
    """Client to make calls to a text-generation-inference instance
@ -53,11 +67,222 @@ class Client:
            timeout (`int`):
                Timeout in seconds
        """
+        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
        self.base_url = base_url
        self.headers = headers
        self.cookies = cookies
        self.timeout = timeout

+    def completion(
+        self,
+        prompt: str,
+        frequency_penalty: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        stop: Optional[List[str]] = None,
+    ):
+        """
+        Given a prompt, generate a response synchronously
+
+        Args:
+            prompt (`str`):
+                Prompt
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            repetition_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            seed (`int`):
+                Random sampling seed
+            stream (`bool`):
+                Stream the response
+            temperature (`float`):
+                The value used to module the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+        """
+        request = CompletionRequest(
+            model="tgi",
+            prompt=prompt,
+            frequency_penalty=frequency_penalty,
+            max_tokens=max_tokens,
+            repetition_penalty=repetition_penalty,
+            seed=seed,
+            stream=stream,
+            temperature=temperature,
+            top_p=top_p,
+            stop=stop,
+        )
+        if not stream:
+            resp = requests.post(
+                f"{self.base_url}/v1/completions",
+                json=request.dict(),
+                headers=self.headers,
+                cookies=self.cookies,
+                timeout=self.timeout,
+            )
+            payload = resp.json()
+            if resp.status_code != 200:
+                raise parse_error(resp.status_code, payload)
+            return Completion(**payload)
+        else:
+            return self._completion_stream_response(request)
+
+    def _completion_stream_response(self, request):
+        resp = requests.post(
+            f"{self.base_url}/v1/completions",
+            json=request.dict(),
+            headers=self.headers,
+            cookies=self.cookies,
+            timeout=self.timeout,
+            stream=True,
+        )
+        # iterate and print stream
+        for byte_payload in resp.iter_lines():
+            if byte_payload == b"\n":
+                continue
+            payload = byte_payload.decode("utf-8")
+            if payload.startswith("data:"):
+                json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+                try:
+                    response = CompletionComplete(**json_payload)
+                    yield response
+                except ValidationError:
+                    raise parse_error(resp.status, json_payload)
+
+    def chat(
+        self,
+        messages: List[Message],
+        repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        stream: bool = False,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        tools: Optional[List[Tool]] = None,
+        tool_prompt: Optional[str] = None,
+        tool_choice: Optional[str] = None,
+        stop: Optional[List[str]] = None,
+    ):
+        """
+        Given a list of messages, generate a response asynchronously
+
+        Args:
+            messages (`List[Message]`):
+                List of messages
+            repetition_penalty (`float`):
+                The parameter for repetition penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            logit_bias (`List[float]`):
+                Adjust the likelihood of specified tokens
+            logprobs (`bool`):
+                Include log probabilities in the response
+            top_logprobs (`int`):
+                Include the `n` most likely tokens at each step
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            n (`int`):
+                Generate `n` completions
+            presence_penalty (`float`):
+                The parameter for presence penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            stream (`bool`):
+                Stream the response
+            seed (`int`):
+                Random sampling seed
+            temperature (`float`):
+                The value used to module the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            tools (`List[Tool]`):
+                List of tools to use
+            tool_prompt (`str`):
+                A prompt to be appended before the tools
+            tool_choice (`str`):
+                The tool to use
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+
+        """
+        request = ChatRequest(
+            model="tgi",
+            messages=messages,
+            repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            stream=stream,
+            seed=seed,
+            temperature=temperature,
+            top_p=top_p,
+            tools=tools,
+            tool_prompt=tool_prompt,
+            tool_choice=tool_choice,
+            stop=stop,
+        )
+        if not stream:
+            resp = requests.post(
+                f"{self.base_url}/v1/chat/completions",
+                json=request.dict(),
+                headers=self.headers,
+                cookies=self.cookies,
+                timeout=self.timeout,
+            )
+            payload = resp.json()
+            if resp.status_code != 200:
+                raise parse_error(resp.status_code, payload)
+            return ChatComplete(**payload)
+        else:
+            return self._chat_stream_response(request)
+
+    def _chat_stream_response(self, request):
+        resp = requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            json=request.dict(),
+            headers=self.headers,
+            cookies=self.cookies,
+            timeout=self.timeout,
+            stream=True,
+        )
+        # iterate and print stream
+        for byte_payload in resp.iter_lines():
+            if byte_payload == b"\n":
+                continue
+            payload = byte_payload.decode("utf-8")
+            if payload.startswith("data:"):
+                json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+                try:
+                    response = ChatCompletionChunk(**json_payload)
+                    yield response
+                except ValidationError:
+                    raise parse_error(resp.status, json_payload)
+
    def generate(
        self,
        prompt: str,
@ -65,6 +290,7 @@ class Client:
        max_new_tokens: int = 20,
        best_of: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
@ -76,6 +302,7 @@ class Client:
        watermark: bool = False,
        decoder_input_details: bool = False,
        top_n_tokens: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
    ) -> Response:
        """
        Given a prompt, generate the following text
@ -92,6 +319,10 @@ class Client:
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 1.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
@ -116,6 +347,9 @@ class Client:
                Return the decoder input token logprobs and ids
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step
+            grammar (`Grammar`):
+                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
+                of the text to match a regular expression or JSON schema.

        Returns:
            Response: generated response
@ -127,6 +361,7 @@ class Client:
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
@ -137,7 +372,8 @@ class Client:
            typical_p=typical_p,
            watermark=watermark,
            decoder_input_details=decoder_input_details,
-            top_n_tokens=top_n_tokens
+            top_n_tokens=top_n_tokens,
+            grammar=grammar,
        )
        request = Request(inputs=prompt, stream=False, parameters=parameters)

@ -159,6 +395,7 @@ class Client:
        do_sample: bool = False,
        max_new_tokens: int = 20,
        repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
@ -169,6 +406,7 @@ class Client:
        typical_p: Optional[float] = None,
        watermark: bool = False,
        top_n_tokens: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
    ) -> Iterator[StreamResponse]:
        """
        Given a prompt, generate the following stream of tokens
@ -183,6 +421,10 @@ class Client:
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 1.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
@ -205,6 +447,9 @@ class Client:
                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step
+            grammar (`Grammar`):
+                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
+                of the text to match a regular expression or JSON schema.

        Returns:
            Iterator[StreamResponse]: stream of generated tokens
@ -217,6 +462,7 @@ class Client:
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
@ -227,6 +473,7 @@ class Client:
            typical_p=typical_p,
            watermark=watermark,
            top_n_tokens=top_n_tokens,
+            grammar=grammar,
        )
        request = Request(inputs=prompt, stream=True, parameters=parameters)

@ -303,10 +550,219 @@ class AsyncClient:
            timeout (`int`):
                Timeout in seconds
        """
+        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
        self.base_url = base_url
        self.headers = headers
        self.cookies = cookies
-        self.timeout = ClientTimeout(timeout * 60)
+        self.timeout = ClientTimeout(timeout)
+
+    async def completion(
+        self,
+        prompt: str,
+        frequency_penalty: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        stop: Optional[List[str]] = None,
+    ) -> Union[Completion, AsyncIterator[CompletionComplete]]:
+        """
+        Given a prompt, generate a response asynchronously
+
+        Args:
+            prompt (`str`):
+                Prompt
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            repetition_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            seed (`int`):
+                Random sampling seed
+            stream (`bool`):
+                Stream the response
+            temperature (`float`):
+                The value used to module the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+        """
+        request = CompletionRequest(
+            model="tgi",
+            prompt=prompt,
+            frequency_penalty=frequency_penalty,
+            max_tokens=max_tokens,
+            repetition_penalty=repetition_penalty,
+            seed=seed,
+            stream=stream,
+            temperature=temperature,
+            top_p=top_p,
+            stop=stop,
+        )
+        if not stream:
+            return await self._completion_single_response(request)
+        else:
+            return self._completion_stream_response(request)
+
+    async def _completion_single_response(self, request):
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(
+                f"{self.base_url}/v1/completions", json=request.dict()
+            ) as resp:
+                payload = await resp.json()
+                if resp.status != 200:
+                    raise parse_error(resp.status, payload)
+                return Completion(**payload)
+
+    async def _completion_stream_response(self, request):
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(
+                f"{self.base_url}/v1/completions", json=request.dict()
+            ) as resp:
+                async for byte_payload in resp.content:
+                    if byte_payload == b"\n":
+                        continue
+                    payload = byte_payload.decode("utf-8")
+                    if payload.startswith("data:"):
+                        json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+                        try:
+                            response = CompletionComplete(**json_payload)
+                            yield response
+                        except ValidationError:
+                            raise parse_error(resp.status, json_payload)
+
+    async def chat(
+        self,
+        messages: List[Message],
+        repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        stream: bool = False,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        tools: Optional[List[Tool]] = None,
+        tool_prompt: Optional[str] = None,
+        tool_choice: Optional[str] = None,
+        stop: Optional[List[str]] = None,
+    ) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
+        """
+        Given a list of messages, generate a response asynchronously
+
+        Args:
+            messages (`List[Message]`):
+                List of messages
+            repetition_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            logit_bias (`List[float]`):
+                Adjust the likelihood of specified tokens
+            logprobs (`bool`):
+                Include log probabilities in the response
+            top_logprobs (`int`):
+                Include the `n` most likely tokens at each step
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            n (`int`):
+                Generate `n` completions
+            presence_penalty (`float`):
+                The parameter for presence penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            stream (`bool`):
+                Stream the response
+            seed (`int`):
+                Random sampling seed
+            temperature (`float`):
+                The value used to module the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            tools (`List[Tool]`):
+                List of tools to use
+            tool_prompt (`str`):
+                A prompt to be appended before the tools
+            tool_choice (`str`):
+                The tool to use
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+
+        """
+        request = ChatRequest(
+            model="tgi",
+            messages=messages,
+            repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            stream=stream,
+            seed=seed,
+            temperature=temperature,
+            top_p=top_p,
+            tools=tools,
+            tool_prompt=tool_prompt,
+            tool_choice=tool_choice,
+            stop=stop,
+        )
+        if not stream:
+            return await self._chat_single_response(request)
+        else:
+            return self._chat_stream_response(request)
+
+    async def _chat_single_response(self, request):
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(
+                f"{self.base_url}/v1/chat/completions", json=request.dict()
+            ) as resp:
+                payload = await resp.json()
+                if resp.status != 200:
+                    raise parse_error(resp.status, payload)
+                return ChatComplete(**payload)
+
+    async def _chat_stream_response(self, request):
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(
+                f"{self.base_url}/v1/chat/completions", json=request.dict()
+            ) as resp:
+                async for byte_payload in resp.content:
+                    if byte_payload == b"\n":
+                        continue
+                    payload = byte_payload.decode("utf-8")
+                    if payload.startswith("data:"):
+                        json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
+                        try:
+                            response = ChatCompletionChunk(**json_payload)
+                            yield response
+                        except ValidationError:
+                            raise parse_error(resp.status, json_payload)

    async def generate(
        self,
@ -315,6 +771,7 @@ class AsyncClient:
        max_new_tokens: int = 20,
        best_of: Optional[int] = None,
        repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
@ -326,6 +783,7 @@ class AsyncClient:
        watermark: bool = False,
        decoder_input_details: bool = False,
        top_n_tokens: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
    ) -> Response:
        """
        Given a prompt, generate the following text asynchronously
@ -342,6 +800,10 @@ class AsyncClient:
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 1.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
@ -366,10 +828,14 @@ class AsyncClient:
                Return the decoder input token logprobs and ids
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step
+            grammar (`Grammar`):
+                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
+                of the text to match a regular expression or JSON schema.

        Returns:
            Response: generated response
        """
+
        # Validate parameters
        parameters = Parameters(
            best_of=best_of,
@ -378,6 +844,7 @@ class AsyncClient:
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
@ -388,6 +855,7 @@ class AsyncClient:
            typical_p=typical_p,
            watermark=watermark,
            top_n_tokens=top_n_tokens,
+            grammar=grammar,
        )
        request = Request(inputs=prompt, stream=False, parameters=parameters)

@ -407,6 +875,7 @@ class AsyncClient:
        do_sample: bool = False,
        max_new_tokens: int = 20,
        repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
        return_full_text: bool = False,
        seed: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
@ -417,6 +886,7 @@ class AsyncClient:
        typical_p: Optional[float] = None,
        watermark: bool = False,
        top_n_tokens: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
    ) -> AsyncIterator[StreamResponse]:
        """
        Given a prompt, generate the following stream of tokens asynchronously
@ -431,6 +901,10 @@ class AsyncClient:
            repetition_penalty (`float`):
                The parameter for repetition penalty. 1.0 means no penalty. See [this
                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 1.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
            return_full_text (`bool`):
                Whether to prepend the prompt to the generated text
            seed (`int`):
@ -453,6 +927,9 @@ class AsyncClient:
                Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
            top_n_tokens (`int`):
                Return the `n` most likely tokens at each step
+            grammar (`Grammar`):
+                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
+                of the text to match a regular expression or JSON schema.

        Returns:
            AsyncIterator[StreamResponse]: stream of generated tokens
@ -465,6 +942,7 @@ class AsyncClient:
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
            return_full_text=return_full_text,
            seed=seed,
            stop=stop_sequences if stop_sequences is not None else [],
@ -475,6 +953,7 @@ class AsyncClient:
            typical_p=typical_p,
            watermark=watermark,
            top_n_tokens=top_n_tokens,
+            grammar=grammar,
        )
        request = Request(inputs=prompt, stream=True, parameters=parameters)

@ -482,7 +961,6 @@ class AsyncClient:
            headers=self.headers, cookies=self.cookies, timeout=self.timeout
        ) as session:
            async with session.post(self.base_url, json=request.dict()) as resp:
-
                if resp.status != 200:
                    raise parse_error(resp.status, await resp.json())

--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@ -1,10 +1,198 @@
 from enum import Enum
-from pydantic import BaseModel, validator
-from typing import Optional, List
+from pydantic import BaseModel, field_validator, ConfigDict
+from typing import Optional, List, Union, Any

 from text_generation.errors import ValidationError


+# enum for grammar type
+class GrammarType(str, Enum):
+    Json = "json"
+    Regex = "regex"
+
+
+# Grammar type and value
+class Grammar(BaseModel):
+    # Grammar type
+    type: GrammarType
+    # Grammar value
+    value: Union[str, dict]
+
+
+class ToolCall(BaseModel):
+    # Id of the tool call
+    id: int
+    # Type of the tool call
+    type: str
+    # Function details of the tool call
+    function: dict
+
+
+class Message(BaseModel):
+    # Role of the message sender
+    role: str
+    # Content of the message
+    content: Optional[str] = None
+    # Optional name of the message sender
+    name: Optional[str] = None
+    # Tool calls associated with the chat completion
+    tool_calls: Optional[Any] = None
+
+
+class Tool(BaseModel):
+    # Type of the tool
+    type: str
+    # Function details of the tool
+    function: dict
+
+
+class Function(BaseModel):
+    name: Optional[str]
+    arguments: str
+
+
+class ChoiceDeltaToolCall(BaseModel):
+    index: int
+    id: str
+    type: str
+    function: Function
+
+
+class ChoiceDelta(BaseModel):
+    role: str
+    content: Optional[str] = None
+    tool_calls: Optional[ChoiceDeltaToolCall]
+
+
+class Choice(BaseModel):
+    index: int
+    delta: ChoiceDelta
+    logprobs: Optional[dict] = None
+    finish_reason: Optional[str] = None
+
+
+class CompletionRequest(BaseModel):
+    # Model identifier
+    model: str
+    # Prompt
+    prompt: str
+    # The parameter for repetition penalty. 1.0 means no penalty.
+    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+    repetition_penalty: Optional[float] = None
+    # The parameter for frequency penalty. 1.0 means no penalty
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float] = None
+    # Maximum number of tokens to generate
+    max_tokens: Optional[int] = None
+    # Flag to indicate streaming response
+    stream: bool = False
+    # Random sampling seed
+    seed: Optional[int] = None
+    # Sampling temperature
+    temperature: Optional[float] = None
+    # Top-p value for nucleus sampling
+    top_p: Optional[float] = None
+    # Stop generating tokens if a member of `stop` is generated
+    stop: Optional[List[str]] = None
+
+
+class CompletionComplete(BaseModel):
+    # Index of the chat completion
+    index: int
+    # Message associated with the chat completion
+    text: str
+    # Log probabilities for the chat completion
+    logprobs: Optional[Any]
+    # Reason for completion
+    finish_reason: str
+
+
+class Completion(BaseModel):
+    # Completion details
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[CompletionComplete]
+
+
+class ChatRequest(BaseModel):
+    # Model identifier
+    model: str
+    # List of messages in the conversation
+    messages: List[Message]
+    # The parameter for repetition penalty. 1.0 means no penalty.
+    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+    repetition_penalty: Optional[float] = None
+    # The parameter for frequency penalty. 1.0 means no penalty
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float] = None
+    # Bias values for token selection
+    logit_bias: Optional[List[float]] = None
+    # Whether to return log probabilities
+    logprobs: Optional[bool] = None
+    # Number of most likely tokens to return at each position
+    top_logprobs: Optional[int] = None
+    # Maximum number of tokens to generate
+    max_tokens: Optional[int] = None
+    # Number of chat completion choices to generate
+    n: Optional[int] = None
+    # Penalty for presence of new tokens
+    presence_penalty: Optional[float] = None
+    # Flag to indicate streaming response
+    stream: bool = False
+    # Random sampling seed
+    seed: Optional[int] = None
+    # Sampling temperature
+    temperature: Optional[float] = None
+    # Top-p value for nucleus sampling
+    top_p: Optional[float] = None
+    # List of tools to be used
+    tools: Optional[List[Tool]] = None
+    # A prompt to be appended before the tools
+    tool_prompt: Optional[str] = None
+    # Choice of tool to be used
+    tool_choice: Optional[str] = None
+    # Stop generating tokens if a member of `stop` is generated
+    stop: Optional[List[str]] = None
+
+
+class ChatCompletionComplete(BaseModel):
+    # Index of the chat completion
+    index: int
+    # Message associated with the chat completion
+    message: Message
+    # Log probabilities for the chat completion
+    logprobs: Optional[Any]
+    # Reason for completion
+    finish_reason: str
+    # Usage details of the chat completion
+    usage: Optional[Any] = None
+
+
+class ChatComplete(BaseModel):
+    # Chat completion details
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[ChatCompletionComplete]
+    usage: Any
+
+
+class ChatCompletionChunk(BaseModel):
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[Choice]
+
+
 class Parameters(BaseModel):
    # Activate logits sampling
    do_sample: bool = False
@ -13,26 +201,30 @@ class Parameters(BaseModel):
    # The parameter for repetition penalty. 1.0 means no penalty.
    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
    repetition_penalty: Optional[float] = None
+    # The parameter for frequency penalty. 1.0 means no penalty
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float] = None
    # Whether to prepend the prompt to the generated text
    return_full_text: bool = False
    # Stop generating tokens if a member of `stop_sequences` is generated
    stop: List[str] = []
    # Random sampling seed
-    seed: Optional[int]
+    seed: Optional[int] = None
    # The value used to module the logits distribution.
-    temperature: Optional[float]
+    temperature: Optional[float] = None
    # The number of highest probability vocabulary tokens to keep for top-k-filtering.
-    top_k: Optional[int]
+    top_k: Optional[int] = None
    # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
    # higher are kept for generation.
-    top_p: Optional[float]
+    top_p: Optional[float] = None
    # truncate inputs tokens to the given size
-    truncate: Optional[int]
+    truncate: Optional[int] = None
    # Typical Decoding mass
    # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
-    typical_p: Optional[float]
+    typical_p: Optional[float] = None
    # Generate best_of sequences and return the one if the highest token logprobs
-    best_of: Optional[int]
+    best_of: Optional[int] = None
    # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
    watermark: bool = False
    # Get generation details
@ -40,100 +232,119 @@ class Parameters(BaseModel):
    # Get decoder input token logprobs and ids
    decoder_input_details: bool = False
    # Return the N most likely tokens at each step
-    top_n_tokens: Optional[int]
+    top_n_tokens: Optional[int] = None
+    # grammar to use for generation
+    grammar: Optional[Grammar] = None

-    @validator("best_of")
+    @field_validator("best_of")
    def valid_best_of(cls, field_value, values):
        if field_value is not None:
            if field_value <= 0:
                raise ValidationError("`best_of` must be strictly positive")
-            if field_value > 1 and values["seed"] is not None:
+            if field_value > 1 and values.data["seed"] is not None:
                raise ValidationError("`seed` must not be set when `best_of` is > 1")
            sampling = (
-                values["do_sample"]
-                | (values["temperature"] is not None)
-                | (values["top_k"] is not None)
-                | (values["top_p"] is not None)
-                | (values["typical_p"] is not None)
+                values.data["do_sample"]
+                | (values.data["temperature"] is not None)
+                | (values.data["top_k"] is not None)
+                | (values.data["top_p"] is not None)
+                | (values.data["typical_p"] is not None)
            )
            if field_value > 1 and not sampling:
                raise ValidationError("you must use sampling when `best_of` is > 1")

        return field_value

-    @validator("repetition_penalty")
+    @field_validator("repetition_penalty")
    def valid_repetition_penalty(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`repetition_penalty` must be strictly positive")
        return v

-    @validator("seed")
+    @field_validator("frequency_penalty")
+    def valid_frequency_penalty(cls, v):
+        if v is not None and v <= 0:
+            raise ValidationError("`frequency_penalty` must be strictly positive")
+        return v
+
+    @field_validator("seed")
    def valid_seed(cls, v):
        if v is not None and v < 0:
            raise ValidationError("`seed` must be positive")
        return v

-    @validator("temperature")
+    @field_validator("temperature")
    def valid_temp(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`temperature` must be strictly positive")
        return v

-    @validator("top_k")
+    @field_validator("top_k")
    def valid_top_k(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`top_k` must be strictly positive")
        return v

-    @validator("top_p")
+    @field_validator("top_p")
    def valid_top_p(cls, v):
        if v is not None and (v <= 0 or v >= 1.0):
            raise ValidationError("`top_p` must be > 0.0 and < 1.0")
        return v

-    @validator("truncate")
+    @field_validator("truncate")
    def valid_truncate(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`truncate` must be strictly positive")
        return v

-    @validator("typical_p")
+    @field_validator("typical_p")
    def valid_typical_p(cls, v):
        if v is not None and (v <= 0 or v >= 1.0):
            raise ValidationError("`typical_p` must be > 0.0 and < 1.0")
        return v

-    @validator("top_n_tokens")
+    @field_validator("top_n_tokens")
    def valid_top_n_tokens(cls, v):
        if v is not None and v <= 0:
            raise ValidationError("`top_n_tokens` must be strictly positive")
        return v

+    @field_validator("grammar")
+    def valid_grammar(cls, v):
+        if v is not None:
+            if v.type == GrammarType.Regex and not v.value:
+                raise ValidationError("`value` cannot be empty for `regex` grammar")
+            if v.type == GrammarType.Json and not v.value:
+                raise ValidationError("`value` cannot be empty for `json` grammar")
+        return v
+

 class Request(BaseModel):
    # Prompt
    inputs: str
    # Generation parameters
-    parameters: Optional[Parameters]
+    parameters: Optional[Parameters] = None
    # Whether to stream output tokens
    stream: bool = False

-    @validator("inputs")
+    @field_validator("inputs")
    def valid_input(cls, v):
        if not v:
            raise ValidationError("`inputs` cannot be empty")
        return v

-    @validator("stream")
+    @field_validator("stream")
    def valid_best_of_stream(cls, field_value, values):
-        parameters = values["parameters"]
+        parameters = values.data["parameters"]
        if (
            parameters is not None
            and parameters.best_of is not None
            and parameters.best_of > 1
            and field_value
        ):
-            raise ValidationError("`best_of` != 1 is not supported when `stream` == True")
+            raise ValidationError(
+                "`best_of` != 1 is not supported when `stream` == True"
+            )
        return field_value


@ -145,7 +356,7 @@ class InputToken(BaseModel):
    text: str
    # Logprob
    # Optional since the logprob of the first token cannot be computed
-    logprob: Optional[float]
+    logprob: Optional[float] = None


 # Generated tokens
@ -155,7 +366,7 @@ class Token(BaseModel):
    # Token text
    text: str
    # Logprob
-    logprob: float
+    logprob: Optional[float] = None
    # Is the token a special token
    # Can be used to ignore tokens when concatenating
    special: bool
@ -180,13 +391,13 @@ class BestOfSequence(BaseModel):
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None
    # Decoder input tokens, empty if decoder_input_details is False
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
-    top_tokens: Optional[List[List[Token]]]
+    top_tokens: Optional[List[List[Token]]] = None


 # `generate` details
@ -196,15 +407,15 @@ class Details(BaseModel):
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None
    # Decoder input tokens, empty if decoder_input_details is False
    prefill: List[InputToken]
    # Generated tokens
    tokens: List[Token]
    # Most likely tokens
-    top_tokens: Optional[List[List[Token]]]
+    top_tokens: Optional[List[List[Token]]] = None
    # Additional sequences when using the `best_of` parameter
-    best_of_sequences: Optional[List[BestOfSequence]]
+    best_of_sequences: Optional[List[BestOfSequence]] = None


 # `generate` return value
@ -222,7 +433,7 @@ class StreamDetails(BaseModel):
    # Number of generated tokens
    generated_tokens: int
    # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None


 # `generate_stream` return value
@ -230,16 +441,20 @@ class StreamResponse(BaseModel):
    # Generated token
    token: Token
    # Most likely tokens
-    top_tokens: Optional[List[Token]]
+    top_tokens: Optional[List[Token]] = None
    # Complete generated text
    # Only available when the generation is finished
-    generated_text: Optional[str]
+    generated_text: Optional[str] = None
    # Generation details
    # Only available when the generation is finished
-    details: Optional[StreamDetails]
+    details: Optional[StreamDetails] = None


 # Inference API currently deployed model
 class DeployedModel(BaseModel):
+    # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
+    # with model_ prefixes, since this disables guardrails for colliding fields:
+    # https://github.com/pydantic/pydantic/issues/9177
+    model_config = ConfigDict(protected_namespaces=())
    model_id: str
    sha: str
--- a/docs/README.md
+++ b/docs/README.md
@ -0,0 +1,10 @@
+Documentation available at: https://huggingface.co/docs/text-generation-inference
+
+## Release
+
+When making a release, please update the latest version in the documentation with:
+```
+export OLD_VERSION="2\.0\.3"
+export NEW_VERSION="2\.0\.4"
+find . -name '*.md' -exec sed -i -e "s/$OLD_VERSION/$NEW_VERSION/g" {} \;
+```
--- a/docs/index.html
+++ b/docs/index.html
@ -27,4 +27,4 @@
            }
        </script>
    </body>
-</html>
+</html>
--- a/docs/openapi.json
+++ b/docs/openapi.json
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@ -3,10 +3,22 @@
    title: Text Generation Inference
  - local: quicktour
    title: Quick Tour
+  - local: installation_nvidia
+    title: Using TGI with Nvidia GPUs
+  - local: installation_amd
+    title: Using TGI with AMD GPUs
+  - local: installation_gaudi
+    title: Using TGI with Intel Gaudi
+  - local: installation_inferentia
+    title: Using TGI with AWS Inferentia
  - local: installation
-    title: Installation
+    title: Installation from source
  - local: supported_models
    title: Supported Models and Hardware
+  - local: messages_api
+    title: Messages API
+  - local: architecture
+    title: Internal Architecture
  title: Getting started
 - sections:
  - local: basic_tutorials/consuming_tgi
@ -17,8 +29,40 @@
    title: Serving Private & Gated Models
  - local: basic_tutorials/using_cli
    title: Using TGI CLI
+  - local: basic_tutorials/launcher
+    title: All TGI CLI options
+  - local: basic_tutorials/non_core_models
+    title: Non-core Model Serving
+  - local: basic_tutorials/safety
+    title: Safety
+  - local: basic_tutorials/using_guidance
+    title: Using Guidance, JSON, tools
+  - local: basic_tutorials/visual_language_models
+    title: Visual Language Models
+  - local: basic_tutorials/monitoring
+    title: Monitoring TGI with Prometheus and Grafana
+  - local: basic_tutorials/train_medusa
+    title: Train Medusa
  title: Tutorials
 - sections:
  - local: conceptual/streaming
    title: Streaming
+  - local: conceptual/quantization
+    title: Quantization
+  - local: conceptual/tensor_parallelism
+    title: Tensor Parallelism
+  - local: conceptual/paged_attention
+    title: PagedAttention
+  - local: conceptual/safetensors
+    title: Safetensors
+  - local: conceptual/flash_attention
+    title: Flash Attention
+  - local: conceptual/speculation
+    title: Speculation (Medusa, ngram)
+  - local: conceptual/guidance
+    title: How Guidance Works (via outlines
+  - local: conceptual/lora
+    title: LoRA (Low-Rank Adaptation)
+
+
  title: Conceptual Guides
--- a/docs/source/architecture.md
+++ b/docs/source/architecture.md
@ -0,0 +1,231 @@
+# Text Generation Inference Architecture
+
+This document aims at describing the architecture of Text Generation Inference (TGI), by describing the call flow between the separate components.
+
+A high-level architecture diagram can be seen here:
+
+![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)
+
+This diagram shows well there are these separate components:
+
+- **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.
+- **The model server**, responsible of receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
+- **The launcher** is a helper thar will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments.
+
+The router and the model server can be two different machines, they do not need to be deployed together.
+
+## The Router
+
+This component is a rust web server binary that accepts HTTP requests using the custom [HTTP API](https://huggingface.github.io/text-generation-inference/), as well as OpenAI's [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api).
+The router receives the API calls and handles the "baches" logic (and introduction to batching can be found [here](https://github.com/huggingface/text-generation-inference/blob/main/router/README.md)).
+It uses different strategies to reduce latency between requests and responses, especially oriented to decoding latency. It will use queues, schedulers, and block allocators to achieve that and produce batched requests that it will then be sent to the model server.
+
+### Router's command line
+
+The router command line will be the way to pass parameters to it (it does not rely on configuration file):
+
+```
+Text Generation Webserver
+
+Usage: text-generation-router [OPTIONS]
+
+Options:
+      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
+          [env: MAX_CONCURRENT_REQUESTS=] [default: 128]
+      --max-best-of <MAX_BEST_OF>
+          [env: MAX_BEST_OF=] [default: 2]
+      --max-stop-sequences <MAX_STOP_SEQUENCES>
+          [env: MAX_STOP_SEQUENCES=] [default: 4]
+      --max-top-n-tokens <MAX_TOP_N_TOKENS>
+          [env: MAX_TOP_N_TOKENS=] [default: 5]
+      --max-input-tokens <MAX_INPUT_TOKENS>
+          [env: MAX_INPUT_TOKENS=] [default: 1024]
+      --max-total-tokens <MAX_TOTAL_TOKENS>
+          [env: MAX_TOTAL_TOKENS=] [default: 2048]
+      --waiting-served-ratio <WAITING_SERVED_RATIO>
+          [env: WAITING_SERVED_RATIO=] [default: 1.2]
+      --max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
+          [env: MAX_BATCH_PREFILL_TOKENS=] [default: 4096]
+      --max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
+          [env: MAX_BATCH_TOTAL_TOKENS=]
+      --max-waiting-tokens <MAX_WAITING_TOKENS>
+          [env: MAX_WAITING_TOKENS=] [default: 20]
+      --max-batch-size <MAX_BATCH_SIZE>
+          [env: MAX_BATCH_SIZE=]
+      --hostname <HOSTNAME>
+          [env: HOSTNAME=] [default: 0.0.0.0]
+  -p, --port <PORT>
+          [env: PORT=] [default: 3000]
+      --master-shard-uds-path <MASTER_SHARD_UDS_PATH>
+          [env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0]
+      --tokenizer-name <TOKENIZER_NAME>
+          [env: TOKENIZER_NAME=] [default: bigscience/bloom]
+      --tokenizer-config-path <TOKENIZER_CONFIG_PATH>
+          [env: TOKENIZER_CONFIG_PATH=]
+      --revision <REVISION>
+          [env: REVISION=]
+      --validation-workers <VALIDATION_WORKERS>
+          [env: VALIDATION_WORKERS=] [default: 2]
+      --json-output
+          [env: JSON_OUTPUT=]
+      --otlp-endpoint <OTLP_ENDPOINT>
+          [env: OTLP_ENDPOINT=]
+      --otlp-service-name <OTLP_SERVICE_NAME>
+          [env: OTLP_SERVICE_NAME=]
+      --cors-allow-origin <CORS_ALLOW_ORIGIN>
+          [env: CORS_ALLOW_ORIGIN=]
+      --ngrok
+          [env: NGROK=]
+      --ngrok-authtoken <NGROK_AUTHTOKEN>
+          [env: NGROK_AUTHTOKEN=]
+      --ngrok-edge <NGROK_EDGE>
+          [env: NGROK_EDGE=]
+      --messages-api-enabled
+          [env: MESSAGES_API_ENABLED=]
+      --disable-grammar-support
+          [env: DISABLE_GRAMMAR_SUPPORT=]
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          [env: MAX_CLIENT_BATCH_SIZE=] [default: 4]
+  -h, --help
+          Print help
+  -V, --version
+          Print version
+```
+
+## The Model Server
+
+The model server is a python server, capable of starting a server waiting for gRPC requests, loads a given model, perform sharding to provide [tensor parallelism](https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism), and stays alive while waiting for new requests.
+The model server supports models instantiated using Pytorch and optimized for inference mainly on CUDA/ROCM.
+
+### Model Server Variants
+
+Several variants of the model server exist that are actively supported by Hugging Face:
+
+- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
+- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
+- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
+- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
+- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).
+
+Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.
+
+### Command Line Interface
+
+The official command line interface (CLI) for the server supports three subcommands, `download-weights`, `quantize` and `serve`:
+
+- `download-weights` will download weights from the hub and, in some variants it will convert weights to a format that is adapted to the given implementation;
+- `quantize` will allow to quantize a model using the `qptq` package. This feature is not available nor supported on all variants;
+- `serve` will start the server that load a model (or a model shard), receives gRPC calls from the router, performs an inference and provides a formatted response to the given request.
+
+Serve's command line parameters on the TGI repository are these:
+
+```
+ Usage: cli.py serve [OPTIONS] MODEL_ID
+
+╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────╮
+│ *    model_id      TEXT  [default: None] [required]                                                      │
+╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ --revision                                       TEXT                        [default: None]             │
+│ --sharded              --no-sharded                                          [default: no-sharded]       │
+│ --quantize                                       [bitsandbytes|bitsandbytes  [default: None]             │
+│                                                  -nf4|bitsandbytes-fp4|gptq                              │
+│                                                  |awq|eetq|exl2|fp8]                                     │
+│ --speculate                                      INTEGER                     [default: None]             │
+│ --dtype                                          [float16|bfloat16]          [default: None]             │
+│ --trust-remote-code    --no-trust-remote-code                                [default:                   │
+│                                                                              no-trust-remote-code]       │
+│ --uds-path                                       PATH                        [default:                   │
+│                                                                              /tmp/text-generation-serve… │
+│ --logger-level                                   TEXT                        [default: INFO]             │
+│ --json-output          --no-json-output                                      [default: no-json-output]   │
+│ --otlp-endpoint                                  TEXT                        [default: None]             │
+│ --otlp-service-name                              TEXT                        [default:                   │
+│                                                                              text-generation-inference...│
+│ --help                                                                       Show this message and exit. │
+╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+```
+
+Note that some variants might support different parameters, and they could possibly accept more options that can be passed on using environment variables.
+
+## Call Flow
+
+Once both components are initialized, weights downloaded and model server is up and running, router and model server exchange data and info through the gRPC call. There are currently two supported schemas, [v2](https://github.com/huggingface/text-generation-inference/blob/main/proto/generate.proto) and [v3](https://github.com/huggingface/text-generation-inference/blob/main/proto/v3/generate.proto). These two versions are almost identical, except for:
+
+- input chunks support, for text and image data,
+- paged attention support
+
+Here's a diagram that displays the exchanges that follow the router and model server startup.
+
+```mermaid
+sequenceDiagram
+
+    Router->>Model Server: service discovery
+    Model Server-->>Router: urls for other shards
+
+    Router->>Model Server: get model info
+    Model Server-->>Router: shard info
+
+    Router->>Model Server: health check
+    Model Server-->>Router: health OK
+
+    Router->>Model Server: warmup(max_input_tokens, max_batch_prefill_tokens, max_total_tokens, max_batch_size)
+    Model Server-->>Router: warmup result
+```
+
+After these are done, the router is ready to receive generate calls from multiple clients. Here's an example.
+
+```mermaid
+sequenceDiagram
+    participant Client 1
+    participant Client 2
+    participant Client 3
+    participant Router
+    participant Model Server
+
+    Client 1->>Router: generate_stream
+    Router->>Model Server: prefill(batch1)
+    Model Server-->>Router: generations, cached_batch1, timings
+    Router-->>Client 1: token 1
+
+    Router->>Model Server: decode(cached_batch1)
+    Model Server-->>Router: generations, cached_batch1, timings
+    Router-->>Client 1: token 2
+
+    Router->>Model Server: decode(cached_batch1)
+    Model Server-->>Router: generations, cached_batch1, timings
+    Router-->>Client 1: token 3
+
+    Client 2->>Router: generate_stream
+    Router->>Model Server: prefill(batch2)
+    Note right of Model Server: This stops previous batch, that is restarted
+    Model Server-->>Router: generations, cached_batch2, timings
+    Router-->>Client 2: token 1'
+
+    Router->>Model Server: decode(cached_batch1, cached_batch2)
+    Model Server-->>Router: generations, cached_batch1, timings
+    Router-->>Client 1: token 4
+    Router-->>Client 2: token 2'
+
+    Note left of Client 1: Client 1 leaves
+    Router->>Model Server: filter_batch(cached_batch1, request_ids_to_keep=batch2)
+    Model Server-->>Router: filtered batch
+
+    Router->>Model Server: decode(cached_batch2)
+    Model Server-->>Router: generations, cached_batch2, timings
+    Router-->>Client 2: token 3'
+
+    Client 3->>Router: generate_stream
+    Note right of Model Server: This stops previous batch, that is restarted
+    Router->>Model Server: prefill(batch3)
+    Note left of Client 1: Client 3 leaves without receiving any batch
+    Router->>Model Server: clear_cache(batch3)
+    Note right of Model Server: This stops previous batch, that is restarted
+
+    Router->>Model Server: decode(cached_batch3)
+    Note right of Model Server: Last token (stopping criteria)
+    Model Server-->>Router: generations, cached_batch3, timings
+    Router-->>Client 2: token 4'
+
+
+```
--- a/docs/source/basic_tutorials/consuming_tgi.md
+++ b/docs/source/basic_tutorials/consuming_tgi.md
@ -23,7 +23,7 @@ You can simply install `huggingface-hub` package with pip.
 pip install huggingface-hub
 ```

-Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python. 
+Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.

 ```python
 from huggingface_hub import InferenceClient
@ -83,8 +83,8 @@ Gradio is a Python library that helps you build web applications for your machin
 pip install huggingface-hub gradio
 ```

-Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client). 
- 
+Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
+
 ```python
 import gradio as gr
 from huggingface_hub import InferenceClient
@ -110,30 +110,30 @@ gr.ChatInterface(
 ).queue().launch()
 ```

-The UI looks like this 👇 
+The UI looks like this 👇

 <div class="flex justify-center">
-    <img 
-        class="block dark:hidden" 
+    <img
+        class="block dark:hidden"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi.png"
    />
-    <img 
-        class="hidden dark:block" 
+    <img
+        class="hidden dark:block"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi-dark.png"
    />
 </div>

-You can try the demo directly here 👇 
+You can try the demo directly here 👇

 <div class="block dark:hidden">
-	<iframe 
+	<iframe
        src="https://merve-gradio-tgi-2.hf.space?__theme=light"
        width="850"
        height="750"
    ></iframe>
 </div>
 <div class="hidden dark:block">
-    <iframe 
+    <iframe
        src="https://merve-gradio-tgi-2.hf.space?__theme=dark"
        width="850"
        height="750"
@ -152,4 +152,4 @@ You can read more about how to customize a `ChatInterface` [here](https://www.gr

 ## API documentation

-You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference). 
+You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).
--- a/docs/source/basic_tutorials/gated_model_access.md
+++ b/docs/source/basic_tutorials/gated_model_access.md
@ -2,13 +2,13 @@

 If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)

-If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
+If you're using the CLI, set the `HF_TOKEN` environment variable. For example:

 ```
-export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
+export HF_TOKEN=<YOUR READ TOKEN>
 ```

-If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
+If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.

 ```bash
 model=meta-llama/Llama-2-7b-chat-hf
@ -17,8 +17,8 @@ token=<your READ token>

 docker run --gpus all \
    --shm-size 1g \
-    -e HUGGING_FACE_HUB_TOKEN=$token \
+    -e HF_TOKEN=$token \
    -p 8080:80 \
-    -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.1 \
+    -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
    --model-id $model
 ```
--- a/docs/source/basic_tutorials/launcher.md
+++ b/docs/source/basic_tutorials/launcher.md
@ -0,0 +1,439 @@
+# Text-generation-launcher arguments
+
+<!-- WRAP CODE BLOCKS -->
+
+```shell
+Text Generation Launcher
+
+Usage: text-generation-launcher [OPTIONS]
+
+Options:
+```
+## MODEL_ID
+```shell
+      --model-id <MODEL_ID>
+          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+          
+          [env: MODEL_ID=]
+          [default: bigscience/bloom-560m]
+
+```
+## REVISION
+```shell
+      --revision <REVISION>
+          The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
+          
+          [env: REVISION=]
+
+```
+## VALIDATION_WORKERS
+```shell
+      --validation-workers <VALIDATION_WORKERS>
+          The number of tokenizer workers used for payload validation and truncation inside the router
+          
+          [env: VALIDATION_WORKERS=]
+          [default: 2]
+
+```
+## SHARDED
+```shell
+      --sharded <SHARDED>
+          Whether to shard the model across multiple GPUs By default text-generation-inference will use all available GPUs to run the model. Setting it to `false` deactivates `num_shard`
+          
+          [env: SHARDED=]
+          [possible values: true, false]
+
+```
+## NUM_SHARD
+```shell
+      --num-shard <NUM_SHARD>
+          The number of shards to use if you don't want to use all GPUs on a given machine. You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num_shard 2` and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num_shard 2` to launch 2 copies with 2 shard each on a given machine with 4 GPUs for instance
+          
+          [env: NUM_SHARD=]
+
+```
+## QUANTIZE
+```shell
+      --quantize <QUANTIZE>
+          Whether you want the model to be quantized
+          
+          [env: QUANTIZE=]
+
+          Possible values:
+          - awq:              4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
+          - eetq:             8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
+          - exl2:             Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
+          - gptq:             4 bit quantization. Requires a specific GTPQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
+          - marlin:           4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>
+          - bitsandbytes:     Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
+          - bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
+          - bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for you model
+          - fp8:              [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above This dtype has native ops should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations
+
+```
+## SPECULATE
+```shell
+      --speculate <SPECULATE>
+          The number of input_ids to speculate on If using a medusa model, the heads will be picked up automatically Other wise, it will use n-gram speculation which is relatively free in terms of compute, but the speedup heavily depends on the task
+          
+          [env: SPECULATE=]
+
+```
+## DTYPE
+```shell
+      --dtype <DTYPE>
+          The dtype to be forced upon the model. This option cannot be used with `--quantize`
+          
+          [env: DTYPE=]
+          [possible values: float16, bfloat16]
+
+```
+## TRUST_REMOTE_CODE
+```shell
+      --trust-remote-code
+          Whether you want to execute hub modelling code. Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision
+          
+          [env: TRUST_REMOTE_CODE=]
+
+```
+## MAX_CONCURRENT_REQUESTS
+```shell
+      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
+          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly
+          
+          [env: MAX_CONCURRENT_REQUESTS=]
+          [default: 128]
+
+```
+## MAX_BEST_OF
+```shell
+      --max-best-of <MAX_BEST_OF>
+          This is the maximum allowed value for clients to set `best_of`. Best of makes `n` generations at the same time, and return the best in terms of overall log probability over the entire generated sequence
+          
+          [env: MAX_BEST_OF=]
+          [default: 2]
+
+```
+## MAX_STOP_SEQUENCES
+```shell
+      --max-stop-sequences <MAX_STOP_SEQUENCES>
+          This is the maximum allowed value for clients to set `stop_sequences`. Stop sequences are used to allow the model to stop on more than just the EOS token, and enable more complex "prompting" where users can preprompt the model in a specific way and define their "own" stop token aligned with their prompt
+          
+          [env: MAX_STOP_SEQUENCES=]
+          [default: 4]
+
+```
+## MAX_TOP_N_TOKENS
+```shell
+      --max-top-n-tokens <MAX_TOP_N_TOKENS>
+          This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like for classification or ranking
+          
+          [env: MAX_TOP_N_TOKENS=]
+          [default: 5]
+
+```
+## MAX_INPUT_TOKENS
+```shell
+      --max-input-tokens <MAX_INPUT_TOKENS>
+          This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle. Default to min(max_position_embeddings - 1, 4095)
+          
+          [env: MAX_INPUT_TOKENS=]
+
+```
+## MAX_INPUT_LENGTH
+```shell
+      --max-input-length <MAX_INPUT_LENGTH>
+          Legacy version of [`Args::max_input_tokens`]
+          
+          [env: MAX_INPUT_LENGTH=]
+
+```
+## MAX_TOTAL_TOKENS
+```shell
+      --max-total-tokens <MAX_TOTAL_TOKENS>
+          This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. with a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be. Default to min(max_position_embeddings, 4096)
+          
+          [env: MAX_TOTAL_TOKENS=]
+
+```
+## WAITING_SERVED_RATIO
+```shell
+      --waiting-served-ratio <WAITING_SERVED_RATIO>
+          This represents the ratio of waiting queries vs running queries where you want to start considering pausing the running queries to include the waiting ones into the same batch. `waiting_served_ratio=1.2` Means when 12 queries are waiting and there's only 10 queries left in the current batch we check if we can fit those 12 waiting queries into the batching strategy, and if yes, then batching happens delaying the 10 running queries by a `prefill` run.
+          
+          This setting is only applied if there is room in the batch as defined by `max_batch_total_tokens`.
+          
+          [env: WAITING_SERVED_RATIO=]
+          [default: 0.3]
+
+```
+## MAX_BATCH_PREFILL_TOKENS
+```shell
+      --max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
+          Limits the number of tokens for the prefill operation. Since this operation take the most memory and is compute bound, it is interesting to limit the number of requests that can be sent. Default to `max_input_tokens + 50` to give a bit of room
+          
+          [env: MAX_BATCH_PREFILL_TOKENS=]
+
+```
+## MAX_BATCH_TOTAL_TOKENS
+```shell
+      --max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
+          **IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
+          
+          This represents the total amount of potential tokens within a batch. When using padding (not recommended) this would be equivalent of `batch_size` * `max_total_tokens`.
+          
+          However in the non-padded (flash attention) version this can be much finer.
+          
+          For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
+          
+          Overall this number should be the largest possible amount that fits the remaining memory (after the model is loaded). Since the actual memory overhead depends on other parameters like if you're using quantization, flash attention or the model implementation, text-generation-inference cannot infer this number automatically.
+          
+          [env: MAX_BATCH_TOTAL_TOKENS=]
+
+```
+## MAX_WAITING_TOKENS
+```shell
+      --max-waiting-tokens <MAX_WAITING_TOKENS>
+          This setting defines how many tokens can be passed before forcing the waiting queries to be put on the batch (if the size of the batch allows for it). New queries require 1 `prefill` forward, which is different from `decode` and therefore you need to pause the running batch in order to run `prefill` to create the correct values for the waiting queries to be able to join the batch.
+          
+          With a value too small, queries will always "steal" the compute to run `prefill` and running queries will be delayed by a lot.
+          
+          With a value too big, waiting queries could wait for a very long time before being allowed a slot in the running batch. If your server is busy that means that requests that could run in ~2s on an empty server could end up running in ~20s because the query had to wait for 18s.
+          
+          This number is expressed in number of tokens to make it a bit more "model" agnostic, but what should really matter is the overall latency for end users.
+          
+          [env: MAX_WAITING_TOKENS=]
+          [default: 20]
+
+```
+## MAX_BATCH_SIZE
+```shell
+      --max-batch-size <MAX_BATCH_SIZE>
+          Enforce a maximum number of requests per batch Specific flag for hardware targets that do not support unpadded inference
+          
+          [env: MAX_BATCH_SIZE=]
+
+```
+## CUDA_GRAPHS
+```shell
+      --cuda-graphs <CUDA_GRAPHS>
+          Specify the batch sizes to compute cuda graphs for. Use "0" to disable. Default = "1,2,4,8,16,32"
+          
+          [env: CUDA_GRAPHS=]
+
+```
+## HOSTNAME
+```shell
+      --hostname <HOSTNAME>
+          The IP address to listen on
+          
+          [env: HOSTNAME=]
+          [default: 0.0.0.0]
+
+```
+## PORT
+```shell
+  -p, --port <PORT>
+          The port to listen on
+          
+          [env: PORT=]
+          [default: 3000]
+
+```
+## SHARD_UDS_PATH
+```shell
+      --shard-uds-path <SHARD_UDS_PATH>
+          The name of the socket for gRPC communication between the webserver and the shards
+          
+          [env: SHARD_UDS_PATH=]
+          [default: /tmp/text-generation-server]
+
+```
+## MASTER_ADDR
+```shell
+      --master-addr <MASTER_ADDR>
+          The address the master shard will listen on. (setting used by torch distributed)
+          
+          [env: MASTER_ADDR=]
+          [default: localhost]
+
+```
+## MASTER_PORT
+```shell
+      --master-port <MASTER_PORT>
+          The address the master port will listen on. (setting used by torch distributed)
+          
+          [env: MASTER_PORT=]
+          [default: 29500]
+
+```
+## HUGGINGFACE_HUB_CACHE
+```shell
+      --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+          
+          [env: HUGGINGFACE_HUB_CACHE=]
+
+```
+## WEIGHTS_CACHE_OVERRIDE
+```shell
+      --weights-cache-override <WEIGHTS_CACHE_OVERRIDE>
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+          
+          [env: WEIGHTS_CACHE_OVERRIDE=]
+
+```
+## DISABLE_CUSTOM_KERNELS
+```shell
+      --disable-custom-kernels
+          For some models (like bloom), text-generation-inference implemented custom cuda kernels to speed up inference. Those kernels were only tested on A100. Use this flag to disable them if you're running on different hardware and encounter issues
+          
+          [env: DISABLE_CUSTOM_KERNELS=]
+
+```
+## CUDA_MEMORY_FRACTION
+```shell
+      --cuda-memory-fraction <CUDA_MEMORY_FRACTION>
+          Limit the CUDA available memory. The allowed value equals the total visible memory multiplied by cuda-memory-fraction
+          
+          [env: CUDA_MEMORY_FRACTION=]
+          [default: 1.0]
+
+```
+## ROPE_SCALING
+```shell
+      --rope-scaling <ROPE_SCALING>
+          Rope scaling will only be used for RoPE models and allow rescaling the position rotary to accomodate for larger prompts.
+          
+          Goes together with `rope_factor`.
+          
+          `--rope-factor 2.0` gives linear scaling with a factor of 2.0 `--rope-scaling dynamic` gives dynamic scaling with a factor of 1.0 `--rope-scaling linear` gives linear scaling with a factor of 1.0 (Nothing will be changed basically)
+          
+          `--rope-scaling linear --rope-factor` fully describes the scaling you want
+          
+          [env: ROPE_SCALING=]
+          [possible values: linear, dynamic]
+
+```
+## ROPE_FACTOR
+```shell
+      --rope-factor <ROPE_FACTOR>
+          Rope scaling will only be used for RoPE models See `rope_scaling`
+          
+          [env: ROPE_FACTOR=]
+
+```
+## JSON_OUTPUT
+```shell
+      --json-output
+          Outputs the logs in JSON format (useful for telemetry)
+          
+          [env: JSON_OUTPUT=]
+
+```
+## OTLP_ENDPOINT
+```shell
+      --otlp-endpoint <OTLP_ENDPOINT>
+          [env: OTLP_ENDPOINT=]
+
+```
+## OTLP_SERVICE_NAME
+```shell
+      --otlp-service-name <OTLP_SERVICE_NAME>
+          [env: OTLP_SERVICE_NAME=]
+          [default: text-generation-inference.router]
+
+```
+## CORS_ALLOW_ORIGIN
+```shell
+      --cors-allow-origin <CORS_ALLOW_ORIGIN>
+          [env: CORS_ALLOW_ORIGIN=]
+
+```
+## WATERMARK_GAMMA
+```shell
+      --watermark-gamma <WATERMARK_GAMMA>
+          [env: WATERMARK_GAMMA=]
+
+```
+## WATERMARK_DELTA
+```shell
+      --watermark-delta <WATERMARK_DELTA>
+          [env: WATERMARK_DELTA=]
+
+```
+## NGROK
+```shell
+      --ngrok
+          Enable ngrok tunneling
+          
+          [env: NGROK=]
+
+```
+## NGROK_AUTHTOKEN
+```shell
+      --ngrok-authtoken <NGROK_AUTHTOKEN>
+          ngrok authentication token
+          
+          [env: NGROK_AUTHTOKEN=]
+
+```
+## NGROK_EDGE
+```shell
+      --ngrok-edge <NGROK_EDGE>
+          ngrok edge
+          
+          [env: NGROK_EDGE=]
+
+```
+## TOKENIZER_CONFIG_PATH
+```shell
+      --tokenizer-config-path <TOKENIZER_CONFIG_PATH>
+          The path to the tokenizer config file. This path is used to load the tokenizer configuration which may include a `chat_template`. If not provided, the default config will be used from the model hub
+          
+          [env: TOKENIZER_CONFIG_PATH=]
+
+```
+## DISABLE_GRAMMAR_SUPPORT
+```shell
+      --disable-grammar-support
+          Disable outlines grammar constrained generation. This is a feature that allows you to generate text that follows a specific grammar
+          
+          [env: DISABLE_GRAMMAR_SUPPORT=]
+
+```
+## ENV
+```shell
+  -e, --env
+          Display a lot of information about your runtime environment
+
+```
+## MAX_CLIENT_BATCH_SIZE
+```shell
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          Control the maximum number of inputs that a client can send in a single request
+          
+          [env: MAX_CLIENT_BATCH_SIZE=]
+          [default: 4]
+
+```
+## LORA_ADAPTERS
+```shell
+      --lora-adapters <LORA_ADAPTERS>
+          Lora Adapters a list of adapter ids i.e. `repo/adapter1,repo/adapter2` to load during startup that will be available to callers via the `adapter_id` field in a request
+          
+          [env: LORA_ADAPTERS=]
+
+```
+## HELP
+```shell
+  -h, --help
+          Print help (see a summary with '-h')
+
+```
+## VERSION
+```shell
+  -V, --version
+          Print version
+
+```
--- a/docs/source/basic_tutorials/monitoring.md
+++ b/docs/source/basic_tutorials/monitoring.md
@ -0,0 +1,75 @@
+# Monitoring TGI server with Prometheus and Grafana dashboard
+
+TGI server deployment can easily be monitored through a Grafana dashboard, consuming a Prometheus data collection. Example of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, number of generated tokens, etc.
+
+In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage.
+
+![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png)
+
+## Setup on the server machine
+
+First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by Prometheus monitoring server.
+
+In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.
+
+On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing on a Linux machine:
+
+```
+wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
+tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
+cd prometheus
+```
+
+Prometheus needs to be configured to listen on TGI's port. To do so, in Prometheus configuration file `prometheus.yml`, one needs to edit the lines:
+```
+    static_configs:
+      - targets: ["0.0.0.0:80"]
+```
+to use the correct IP address and port.
+
+We suggest to try `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure to configure the correct IP and port.
+
+Once Prometheus is configured, Prometheus server can be launched on the same machine where TGI is launched:
+```
+./prometheus --config.file="prometheus.yml"
+```
+
+In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward Prometheus port (by default 9090) to the local computer. To do so, we can for example:
+* Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example)
+* Use ngrok port tunneling
+
+For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel Prometheus port from the TGI server to the outside word.
+
+For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
+```bash
+ngrok http http://0.0.0.0:9090
+```
+
+As a sanity check, one can make sure that Prometheus server can be accessed at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app) from a local machine.
+
+## Setup on the monitoring machine
+
+Monitoring is typically done on an other machine than the server one. We use a Grafana dashboard to monitor TGI's server usage.
+
+Two options are available:
+* Use Grafana Cloud for an hosted dashboard solution (https://grafana.com/products/cloud/).
+* Self-host a grafana dashboard.
+
+In this tutorial, for simplicity, we will self host the dashbard. We recommend installing Grafana Open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:
+
+```bash
+wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
+tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
+cd grafana-11.0.0
+./bin/grafana-server
+```
+
+Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.
+
+Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added with the Ngrok address we got earlier, that exposes Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app).
+
+Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like.
+
+Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).
+
+Load your dashboard configuration, and your TGI dashboard should be ready to go!
--- a/docs/source/basic_tutorials/non_core_models.md
+++ b/docs/source/basic_tutorials/non_core_models.md
@ -0,0 +1,24 @@
+# Non-core Model Serving
+
+TGI supports various LLM architectures (see full list [here](../supported_models)). If you wish to serve a model that is not one of the supported models, TGI will fallback to the `transformers` implementation of that model. This means you will be unable to use some of the features introduced by TGI, such as tensor-parallel sharding or flash attention. However, you can still get many benefits of TGI, such as continuous batching or streaming outputs.
+
+You can serve these models using the same Docker command-line invocation as with fully supported models 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
+```
+
+If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
+```
+
+Finally, if the model is not on Hugging Face Hub but on your local, you can pass the path to the folder that contains your model like below 👇
+
+```bash
+# Make sure your model is in the $volume directory
+docker run --shm-size 1g -p 8080:80 -v $volume:/data  ghcr.io/huggingface/text-generation-inference:latest --model-id /data/<PATH-TO-FOLDER>
+```
+
+You can refer to [transformers docs on custom models](https://huggingface.co/docs/transformers/main/en/custom_models) for more information.
--- a/docs/source/basic_tutorials/preparing_model.md
+++ b/docs/source/basic_tutorials/preparing_model.md
@ -1,15 +1,15 @@
 # Preparing the Model

-Text Generation Inference improves the model in several aspects. 
+Text Generation Inference improves the model in several aspects.

 ## Quantization

-TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes` or `gptq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq).
+TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq) when using AWQ quantization, you need to point to one of the models [here](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to [quantization guide](./../conceptual/quantization)


 ## RoPE Scaling

-RoPE scaling can be used to increase the sequence length of the model during the inference time without necessarily fine-tuning it. To enable RoPE scaling, simply pass `--rope-scaling`, `--max-input-length` and `--rope-factors` flags when running through CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide maximum input length for extension. 
+RoPE scaling can be used to increase the sequence length of the model during the inference time without necessarily fine-tuning it. To enable RoPE scaling, simply pass `--rope-scaling`, `--max-input-length` and `--rope-factors` flags when running through CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide maximum input length for extension.

 <Tip>

@ -19,4 +19,4 @@ We recommend using `dynamic` RoPE scaling.

 ## Safetensors

-[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there's no `pytorch` weights, TGI will convert the weights to `safetensors` format. 
+[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there's no `pytorch` weights, TGI will convert the weights to `safetensors` format.
--- a/docs/source/basic_tutorials/safety.md
+++ b/docs/source/basic_tutorials/safety.md
@ -0,0 +1,31 @@
+# Model safety.
+
+[Pytorch uses pickle](https://pytorch.org/docs/master/generated/torch.load.html) by default meaning that for quite a long while
+*Every* model using that format is potentially executing unintended code while purely loading the model.
+
+There is a big red warning on Python's page for pickle [link](https://docs.python.org/3/library/pickle.html) but for quite a while
+this was ignored by the community. Now that AI/ML is getting used much more ubiquitously we need to switch away from this format.
+
+HuggingFace is leading the effort here by creating a new format which contains pure data ([safetensors](https://github.com/huggingface/safetensors))
+and moving slowly but surely all the libs to make use of it by default.
+The move is intentionnally slow in order to make breaking changes as little impact as possible on users throughout.
+
+
+# TGI 2.0
+
+Since the release of TGI 2.0, we take the opportunity of this major version increase to break backward compatibility for these pytorch
+models (since they are a huge security risk for anyone deploying them).
+
+
+From now on, TGI will not convert automatically pickle files without having `--trust-remote-code` flag or `TRUST_REMOTE_CODE=true` in the environment variables.
+This flag is already used for community defined inference code, and is therefore quite representative of the level of confidence you are giving the model providers.
+
+
+If you want to use a model that uses pickle, but you still do not want to trust the authors entirely we recommend making a convertion on our space made for that.
+
+https://huggingface.co/spaces/safetensors/convert
+
+This space will create a PR on the original model, which you are use directly regardless of merge status from the original authors. Just use
+```
+docker run .... --revision refs/pr/#ID # Or use REVISION=refs/pr/#ID in the environment
+```
--- a/docs/source/basic_tutorials/train_medusa.md
+++ b/docs/source/basic_tutorials/train_medusa.md
@ -0,0 +1,208 @@
+# Train Medusa
+
+This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation) for more information on how Medusa works and speculation in general.
+
+## What are the benefits of training a Medusa model?
+
+Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training.
+
+One of the most important things is to have a good dataset (with similar data to what will be used in production) because Medusa has a much higher hit-rate when the generation is in-domain.
+
+If you train Medusa on a dataset that is very different from the one you will use in production then the model will not be able to predict the future tokens accurately and consequently the speedup will be minimal or non-existent.
+
+## Self-distillation (Generating data for training)
+
+There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model.
+
+Essentially, you prompt the model with a similar input to what you will use in production and the model will generate the output.
+
+We'll use this output to help train the medusa heads to predict the `n+1`, `n+2`, `n+3`, etc tokens in the sequence.
+
+## Training
+
+The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) and we'll follow a very similar process to train the model as described on the original repository.
+
+### Getting Started
+
+There are two methods for training the model:
+
+- `torchrun` that is a wrapper around `torch.distributed.launch`
+- a forked version of `axlotl` that supports Medusa
+
+In this tutorial we'll use `torchrun` to train the model as it is the most straightforward way to train the model but similar steps can be followed to train the model using `axlotl` if you prefer.
+
+### Training with `torchrun`
+
+```bash
+mkdir medusa-training
+cd medusa-training
+
+pyenv install 3.10
+pyenv local 3.10
+
+uv venv -p 3.10
+source .venv/bin/activate
+```
+
+Now lets clone the original `Medusa` repository and install the library.
+
+```bash
+git clone https://github.com/FasterDecoding/Medusa.git
+cd Medusa
+pip install -e .
+```
+
+Next we'll need some data to train on, we can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub.
+
+```bash
+apt install git-lfs
+git lfs install
+git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered
+```
+
+Currently our directory structure looks like this:
+
+```bash
+.
+├── assets
+├── CITATION.cff
+├── create_data.py
+├── data_generation
+├── deepspeed.json
+├── last_run_prepared
+├── LICENSE
+├── llm_judge
+├── medusa
+├── medusa_llm.egg-info
+├── mistral.json
+├── notebooks
+├── pyproject.toml
+├── README.md
+├── ROADMAP.md
+├── scripts
+├── ShareGPT_Vicuna_unfiltered
+│   ├── README.md
+│   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json
+│   └── ShareGPT_V4.3_unfiltered_cleaned_split.json
+├── simple_gradio_interface.py
+├── tiny-llama.json
+└── vicuna_7b_qlora_stage1
+```
+
+## Start Training
+
+Now the lets generate the data and start training the model. This process will take a while since we are generating data from the model.
+
+First make sure you have an instance of TGI running with the model you want to use for self-distillation.
+
+```bash
+model=HuggingFaceH4/zephyr-7b-beta
+volume=/home/ubuntu/.cache/huggingface/hub/
+
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
+```
+
+Now we can generate the data using the `create_data.py` script.
+
+```bash
+python create_data.py \
+    --input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \
+    --output-filename zephyr_self_distill.json
+```
+
+At this point our terminal should look like this:
+
+<div class="flex justify-center">
+    <img
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-large.gif"
+        width="550"
+    />
+</div>
+
+> Note: In the screen shot above we are only using a the first 500 examples from the dataset to speed up the process, you should have a much larger dataset for training.
+
+Now we can finally get to the fun part and start training the model!
+
+Using `torchrun` we can easily launch the `medusa` training script with the `zephyr_self_distill.json` configuration file.
+
+> NOTE: If you just self-distilled you may still have the model running, make sure to stop it before starting the training in order to allow all of the resources to be used for training.
+
+```bash
+WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \
+    --model_name_or_path HuggingFaceH4/zephyr-7b-beta \
+    --data_path zephyr_self_distill.json \
+    --bf16 True \
+    --output_dir zephyr_out \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 4 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 4 \
+    --evaluation_strategy "no" \
+    --save_strategy "no" \
+    --learning_rate 1e-3 \
+    --weight_decay 0.0 \
+    --warmup_ratio 0.1 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 2048 \
+    --lazy_preprocess True \
+    --medusa_num_heads 3 \
+    --medusa_num_layers 1 \
+    --deepspeed deepspeed.json
+```
+
+<div class="flex justify-center">
+    <img
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-heads-large.gif"
+        width="550"
+    />
+</div>
+
+If successful, you should see the similar output to the one below:
+
+```bash
+wandb: Run history:
+wandb:                    train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
+wandb:              train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
+wandb:            train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁
+wandb:                     train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁
+wandb:             train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
+wandb:             train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇
+wandb:             train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁
+wandb:             train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇
+wandb:             train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
+wandb:             train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇
+wandb:               train/total_flos ▁
+wandb:               train/train_loss ▁
+wandb:            train/train_runtime ▁
+wandb: train/train_samples_per_second ▁
+wandb:   train/train_steps_per_second ▁
+wandb:
+wandb: Run summary:
+wandb:                    train/epoch 2.0
+wandb:              train/global_step 16
+wandb:            train/learning_rate 0.0
+wandb:                     train/loss 14.8906
+wandb:             train/medusa0_loss 4.25
+wandb:             train/medusa0_top1 0.28809
+wandb:             train/medusa1_loss 4.8125
+wandb:             train/medusa1_top1 0.22727
+wandb:             train/medusa2_loss 5.5
+wandb:             train/medusa2_top1 0.17293
+wandb:               train/total_flos 0.0
+wandb:               train/train_loss 23.98242
+wandb:            train/train_runtime 396.9266
+wandb: train/train_samples_per_second 2.519
+wandb:   train/train_steps_per_second 0.04
+```
+
+Last but most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects.
+
+```bash
+python -m medusa.hf_utils \
+    --folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \
+    --repo drbh/zephyr_medusa_demo
+```
+
+Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉
--- a/docs/source/basic_tutorials/using_cli.md
+++ b/docs/source/basic_tutorials/using_cli.md
@ -1,30 +1,30 @@
 # Using TGI CLI

-You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](./installation#install-cli).
+You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](../installation#install-cli).

-`text-generation-server` lets you download the model with `download-weights` command like below 👇 
+`text-generation-server` lets you download the model with `download-weights` command like below 👇

 ```bash
 text-generation-server download-weights MODEL_HUB_ID
 ```

-You can also use it to quantize models like below 👇 
+You can also use it to quantize models like below 👇

 ```bash
-text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR 
+text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR
 ```

-You can use `text-generation-launcher` to serve models. 
+You can use `text-generation-launcher` to serve models.

 ```bash
 text-generation-launcher --model-id MODEL_HUB_ID --port 8080
 ```

-There are many options and parameters you can pass to `text-generation-launcher`. The documentation for CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running 
+There are many options and parameters you can pass to `text-generation-launcher`. The documentation for CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running

 ```bash
 text-generation-launcher --help
-``` 
+```

 You can also find it hosted in this [Swagger UI](https://huggingface.github.io/text-generation-inference/).

--- a/docs/source/basic_tutorials/using_guidance.md
+++ b/docs/source/basic_tutorials/using_guidance.md
@ -0,0 +1,359 @@
+# Guidance
+
+Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs.
+
+These feature are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
+
+_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._
+
+## How it works
+
+TGI leverages the [outlines](https://github.com/outlines-dev/outlines) library to efficiently parse and compile the grammatical structures and tools specified by users. This integration transforms the defined grammars into an intermediate representation that acts as a framework to guide and constrain content generation, ensuring that outputs adhere to the specified grammatical rules.
+
+If you are interested in the technical details on how outlines is used in TGI, you can check out the [conceptual guidance documentation](../conceptual/guidance).
+
+## Table of Contents 📚
+
+### Grammar and Constraints
+
+- [The Grammar Parameter](#the-grammar-parameter): Shape your AI's responses with precision.
+- [Constrain with Pydantic](#constrain-with-pydantic): Define a grammar using Pydantic models.
+- [JSON Schema Integration](#json-schema-integration): Fine-grained control over your requests via JSON schema.
+- [Using the client](#using-the-client): Use TGI's client libraries to shape the AI's responses.
+
+### Tools and Functions
+
+- [The Tools Parameter](#the-tools-parameter): Enhance the AI's capabilities with predefined functions.
+- [Via the client](#text-generation-inference-client): Use TGI's client libraries to interact with the Messages API and Tool functions.
+- [OpenAI integration](#openai-integration): Use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
+
+## Grammar and Constraints 🛣️
+
+### The Grammar Parameter
+
+In TGI `1.4.3`, we've introduced the grammar parameter, which allows you to specify the format of the response you want from the LLM.
+
+Using curl, you can make a request to TGI's Messages API with the grammar parameter. This is the most primitive way to interact with the API and using [Pydantic](#constrain-with-pydantic) is recommended for ease of use and readability.
+
+```json
+curl localhost:3000/generate \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+    "inputs": "I saw a puppy a cat and a raccoon during my bike ride in the park",
+    "parameters": {
+        "repetition_penalty": 1.3,
+        "grammar": {
+            "type": "json",
+            "value": {
+                "properties": {
+                    "location": {
+                        "type": "string"
+                    },
+                    "activity": {
+                        "type": "string"
+                    },
+                    "animals_seen": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 5
+                    },
+                    "animals": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "required": ["location", "activity", "animals_seen", "animals"]
+            }
+        }
+    }
+}'
+// {"generated_text":"{ \n\n\"activity\": \"biking\",\n\"animals\": [\"puppy\",\"cat\",\"raccoon\"],\n\"animals_seen\": 3,\n\"location\": \"park\"\n}"}
+
+```
+
+### Hugging Face Hub Python Library
+
+The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter.
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://localhost:3000")
+
+schema = {
+    "properties": {
+        "location": {"title": "Location", "type": "string"},
+        "activity": {"title": "Activity", "type": "string"},
+        "animals_seen": {
+            "maximum": 5,
+            "minimum": 1,
+            "title": "Animals Seen",
+            "type": "integer",
+        },
+        "animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"},
+    },
+    "required": ["location", "activity", "animals_seen", "animals"],
+    "title": "Animals",
+    "type": "object",
+}
+
+user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
+resp = client.text_generation(
+    f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}",
+    max_new_tokens=100,
+    seed=42,
+    grammar={"type": "json", "value": schema},
+)
+
+print(resp)
+# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
+
+```
+
+A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar.
+
+> Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is a computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster.
+
+### Constrain with Pydantic
+
+Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way.
+
+```python
+from huggingface_hub import InferenceClient
+from pydantic import BaseModel, conint
+from typing import List
+
+
+class Animals(BaseModel):
+    location: str
+    activity: str
+    animals_seen: conint(ge=1, le=5)  # Constrained integer type
+    animals: List[str]
+
+
+client = InferenceClient("http://localhost:3000")
+
+user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
+resp = client.text_generation(
+    f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}",
+    max_new_tokens=100,
+    seed=42,
+    grammar={"type": "json", "value": Animals.schema()},
+)
+
+print(resp)
+# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
+
+
+```
+
+defining a grammar as regular expressions
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://localhost:3000")
+
+regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"
+
+resp = client.text_generation(
+    f"Whats Googles DNS? Please use the following regex: {regexp}",
+    seed=42,
+    grammar={
+        "type": "regex",
+        "value": regexp,
+    },
+)
+
+
+print(resp)
+# 7.1.1.1
+
+```
+
+## Tools and Functions 🛠️
+
+### The Tools Parameter
+
+In addition to the grammar parameter, we've also introduced a set of tools and functions to help you get the most out of the Messages API.
+
+Tools are a set of user defined functions that can be used in tandem with the chat functionality to enhance the LLM's capabilities. Functions, similar to grammar are defined as JSON schema and can be passed as part of the parameters to the Messages API.
+
+Functions, similar to grammar are defined as JSON schema and can be passed as part of the parameters to the Messages API.
+
+```json
+curl localhost:3000/v1/chat/completions \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+    "model": "tgi",
+    "messages": [
+        {
+            "role": "user",
+            "content": "What is the weather like in New York?"
+        }
+    ],
+    "tools": [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA"
+                        },
+                        "format": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                            "description": "The temperature unit to use. Infer this from the users location."
+                        }
+                    },
+                    "required": ["location", "format"]
+                }
+            }
+        }
+    ],
+    "tool_choice": "get_current_weather"
+}'
+// {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}}
+```
+
+### Chat Completion with Tools
+
+Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter.
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://localhost:3000")
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "format": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                        "description": "The temperature unit to use. Infer this from the users location.",
+                    },
+                },
+                "required": ["location", "format"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_n_day_weather_forecast",
+            "description": "Get an N-day weather forecast",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "format": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                        "description": "The temperature unit to use. Infer this from the users location.",
+                    },
+                    "num_days": {
+                        "type": "integer",
+                        "description": "The number of days to forecast",
+                    },
+                },
+                "required": ["location", "format", "num_days"],
+            },
+        },
+    },
+]
+
+chat = client.chat_completion(
+    messages=[
+        {
+            "role": "system",
+            "content": "You're a helpful assistant! Answer the users question best you can.",
+        },
+        {
+            "role": "user",
+            "content": "What is the weather like in Brooklyn, New York?",
+        },
+    ],
+    tools=tools,
+    seed=42,
+    max_tokens=100,
+)
+
+print(chat.choices[0].message.tool_calls)
+# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')]
+
+```
+
+### OpenAI integration
+
+TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
+
+However there are some minor differences in the API, for example `tool_choice="auto"` will ALWAYS choose the tool for you. This is different from OpenAI's API where `tool_choice="auto"` will choose a tool if the model thinks it's necessary.
+
+```python
+from openai import OpenAI
+
+# Initialize the client, pointing it to one of the available models
+client = OpenAI(
+    base_url="http://localhost:3000/v1",
+    api_key="_",
+)
+
+# NOTE: tools defined above and removed for brevity
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {
+            "role": "system",
+            "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
+        },
+        {
+            "role": "user",
+            "content": "What's the weather like the next 3 days in San Francisco, CA?",
+        },
+    ],
+    tools=tools,
+    tool_choice="auto",  # tool selected by model
+    max_tokens=500,
+)
+
+
+called = chat_completion.choices[0].message.tool_calls
+print(called)
+# {
+#     "id": 0,
+#     "type": "function",
+#     "function": {
+#         "description": None,
+#         "name": "tools",
+#         "parameters": {
+#             "format": "celsius",
+#             "location": "San Francisco, CA",
+#             "num_days": 3,
+#         },
+#     },
+# }
+```
--- a/docs/source/basic_tutorials/visual_language_models.md
+++ b/docs/source/basic_tutorials/visual_language_models.md
@ -0,0 +1,230 @@
+# Vision Language Model Inference in TGI
+
+Visual Language Model (VLM) are models that consume both image and text inputs to generate text.
+
+VLM's are trained on a combination of image and text data and can handle a wide range of tasks, such as image captioning, visual question answering, and visual dialog.
+
+> What distinguishes VLMs from other text and image models is their ability to handle long context and generate text that is coherent and relevant to the image even after multiple turns or in some cases, multiple images.
+
+Below are couple of common use cases for vision language models:
+
+- **Image Captioning**: Given an image, generate a caption that describes the image.
+- **Visual Question Answering (VQA)**: Given an image and a question about the image, generate an answer to the question.
+- **Mulimodal Dialog**: Generate response to multiple turns of images and conversations.
+- **Image Information Retrieval**: Given an image, retrieve information from the image.
+
+## How to Use a Vision Language Model?
+
+### Hugging Face Hub Python Library
+
+To infer with vision language models through Python, you can use the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The `InferenceClient` class provides a simple way to interact with the [Inference API](https://huggingface.co/docs/api-inference/index). Images can be passed as URLs or base64-encoded strings. The `InferenceClient` will automatically detect the image format.
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://127.0.0.1:3000")
+image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
+prompt = f"![]({image})What is this a picture of?\n\n"
+for token in client.text_generation(prompt, max_new_tokens=16, stream=True):
+    print(token)
+
+# This is a picture of an anthropomorphic rabbit in a space suit.
+```
+
+```python
+from huggingface_hub import InferenceClient
+import base64
+import requests
+import io
+
+client = InferenceClient("http://127.0.0.1:3000")
+
+# read image from local file
+image_path = "rabbit.png"
+with open(image_path, "rb") as f:
+    image = base64.b64encode(f.read()).decode("utf-8")
+
+image = f"data:image/png;base64,{image}"
+prompt = f"![]({image})What is this a picture of?\n\n"
+
+for token in client.text_generation(prompt, max_new_tokens=10, stream=True):
+    print(token)
+
+# This is a picture of an anthropomorphic rabbit in a space suit.
+```
+
+or via the `chat_completion` endpoint:
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://127.0.0.1:3000")
+
+chat = client.chat_completion(
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Whats in this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
+                    },
+                },
+            ],
+        },
+    ],
+    seed=42,
+    max_tokens=100,
+)
+
+print(chat)
+# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043))
+
+```
+
+or with OpenAi's library:
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(base_url="http://localhost:3000/v1", api_key="-")
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Whats in this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
+                    },
+                },
+            ],
+        },
+    ],
+    stream=False,
+)
+
+print(chat_completion)
+# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. ', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027))
+```
+
+### Inference Through Sending `cURL` Requests
+
+To use the `generate_stream` endpoint with curl, you can add the `-N` flag. This flag disables curl default buffering and shows data as it arrives from the server.
+
+```bash
+curl -N 127.0.0.1:3000/generate_stream \
+    -X POST \
+    -d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":16, "seed": 42}}' \
+    -H 'Content-Type: application/json'
+
+# ...
+# data:{"index":16,"token":{"id":28723,"text":".","logprob":-0.6196289,"special":false},"generated_text":"This is a picture of an anthropomorphic rabbit in a space suit.","details":null}
+```
+
+### Inference Through JavaScript
+
+First, we need to install the `@huggingface/inference` library.
+
+```bash
+npm install @huggingface/inference
+```
+
+If you're using the free Inference API, you can use [Huggingface.js](https://huggingface.co/docs/huggingface.js/inference/README)'s `HfInference`. If you're using inference endpoints, you can use `HfInferenceEndpoint` class to easily interact with the Inference API.
+
+We can create a `HfInferenceEndpoint` providing our endpoint URL and We can create a `HfInferenceEndpoint` providing our endpoint URL and [Hugging Face access token](https://huggingface.co/settings/tokens).
+
+```js
+import { HfInferenceEndpoint } from "@huggingface/inference";
+
+const hf = new HfInferenceEndpoint("http://127.0.0.1:3000", "HF_TOKEN");
+
+const prompt =
+  "![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n";
+
+const stream = hf.textGenerationStream({
+  inputs: prompt,
+  parameters: { max_new_tokens: 16, seed: 42 },
+});
+for await (const r of stream) {
+  // yield the generated token
+  process.stdout.write(r.token.text);
+}
+
+// This is a picture of an anthropomorphic rabbit in a space suit.
+```
+
+## Combining Vision Language Models with Other Features
+
+VLMs in TGI have several advantages, for example these models can be used in tandem with other features for more complex tasks. For example, you can use VLMs with [Guided Generation](/docs/conceptual/guided-generation) to generate specific JSON data from an image.
+
+<div class="flex justify-center">
+    <img
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
+        width="400"
+    />
+</div>
+
+For example we can extract information from the rabbit image and generate a JSON object with the location, activity, number of animals seen, and the animals seen. That would look like this:
+
+```json
+{
+  "activity": "Standing",
+  "animals": ["Rabbit"],
+  "animals_seen": 1,
+  "location": "Rocky surface with mountains in the background and a red light on the rabbit's chest"
+}
+```
+
+All we need to do is provide a JSON schema to the VLM model and it will generate the JSON object for us.
+
+```bash
+curl localhost:3000/generate \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+    "inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n",
+    "parameters": {
+        "max_new_tokens": 100,
+        "seed": 42,
+        "grammar": {
+            "type": "json",
+            "value": {
+                "properties": {
+                    "location": {
+                        "type": "string"
+                    },
+                    "activity": {
+                        "type": "string"
+                    },
+                    "animals_seen": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 5
+                    },
+                    "animals": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "required": ["location", "activity", "animals_seen", "animals"]
+            }
+        }
+    }
+}'
+
+# {
+#   "generated_text": "{ \"activity\": \"Standing\", \"animals\": [ \"Rabbit\" ], \"animals_seen\": 1, \"location\": \"Rocky surface with mountains in the background and a red light on the rabbit's chest\" }"
+# }
+```
+
+Want to learn more about how Vision Language Models work? Check out the [awesome blog post on the topic](https://huggingface.co/blog/vlms).
--- a/docs/source/conceptual/flash_attention.md
+++ b/docs/source/conceptual/flash_attention.md
@ -0,0 +1,11 @@
+# Flash Attention
+
+Scaling the transformer architecture is heavily bottlenecked by the self-attention mechanism, which has quadratic time and memory complexity. Recent developments in accelerator hardware mainly focus on enhancing compute capacities and not memory and transferring data between hardware. This results in attention operation having a memory bottleneck. **Flash Attention** is an attention algorithm used to reduce this problem and scale transformer-based models more efficiently, enabling faster training and inference.
+
+Standard attention mechanism uses High Bandwidth Memory (HBM) to store, read and write keys, queries and values. HBM is large in memory, but slow in processing, meanwhile SRAM is smaller in memory, but faster in operations. In the standard attention implementation, the cost of loading and writing keys, queries, and values from HBM is high. It loads keys, queries, and values from HBM to GPU on-chip SRAM, performs a single step of the attention mechanism, writes it back to HBM, and repeats this for every single attention step. Instead, Flash Attention loads keys, queries, and values once, fuses the operations of the attention mechanism, and writes them back.
+
+![Flash Attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png)
+
+It is implemented for supported models. You can check out the complete list of models that support Flash Attention [here](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models), for models with flash prefix.
+
+You can learn more about Flash Attention by reading the paper in this [link](https://arxiv.org/abs/2205.14135).
--- a/docs/source/conceptual/guidance.md
+++ b/docs/source/conceptual/guidance.md
@ -0,0 +1,86 @@
+# Guidance
+
+## What is Guidance?
+
+Guidance is a feature that allows users to constrain the generation of a large language model with a specified grammar. This feature is particularly useful when you want to generate text that follows a specific structure or uses a specific set of words or produce output in a specific format. A prominent example is JSON grammar, where the model is forced to output valid JSON.
+
+## How is it used?
+
+Guidance can be implemented in many ways and the community is always finding new ways to use it. Here are some examples of how you can use guidance:
+
+Technically, guidance can be used to generate:
+
+- a specific JSON object
+- a function signature
+- typed output like a list of integers
+
+However these use cases can span a wide range of applications, such as:
+
+- extracting structured data from unstructured text
+- summarizing text into a specific format
+- limit output to specific classes of words (act as a LLM powered classifier)
+- generate the input to specific APIs or services
+- provide reliable and consistent output for downstream tasks
+- extract data from multimodal inputs
+
+## How it works?
+
+Diving into the details, guidance is enabled by including a grammar with a generation request that is compiled, and used to modify the chosen tokens.
+
+This process can be broken down into the following steps:
+
+1. A request is sent to the backend, it is processed and placed in batch. Processing includes compiling the grammar into a finite state machine and a grammar state.
+
+<div class="flex justify-center">
+    <img
+        class="block dark:hidden"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/request-to-batch.gif"
+    />
+    <img
+        class="hidden dark:block"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/request-to-batch-dark.gif"
+    />
+</div>
+
+2. The model does a forward pass over the batch. This returns probabilities for each token in the vocabulary for each request in the batch.
+
+3. The process of choosing one of those tokens is called `sampling`. The model samples from the distribution of probabilities to choose the next token. In TGI all of the steps before sampling are called `processor`. Grammars are applied as a processor that masks out tokens that are not allowed by the grammar.
+
+<div class="flex justify-center">
+    <img
+        class="block dark:hidden"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/logit-grammar-mask.gif"
+    />
+    <img
+        class="hidden dark:block"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/logit-grammar-mask-dark.gif"
+    />
+</div>
+
+4. The grammar mask is applied and the model samples from the remaining tokens. Once a token is chosen, we update the grammar state with the new token, to prepare it for the next pass.
+
+<div class="flex justify-center">
+    <img
+        class="block dark:hidden"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/sample-logits.gif"
+    />
+    <img
+        class="hidden dark:block"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/sample-logits-dark.gif"
+    />
+</div>
+
+## How to use Guidance?
+
+There are two main ways to use guidance; you can either use the `/generate` endpoint with a grammar or use the `/chat/completion` endpoint with tools.
+
+Under the hood tools are a special case of grammars that allows the model to choose one or none of the provided tools.
+
+Please refer to [using guidance](../basic_tutorials/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.
+
+### Getting the most out of guidance
+
+Depending on how you are using guidance, you may want to make use of different features. Here are some tips to get the most out of guidance:
+
+- If you are using the `/generate` with a `grammar` it is recommended to include the grammar in the prompt prefixed by something like `Please use the following JSON schema to generate the output:`. This will help the model understand the context of the grammar and generate the output accordingly.
+- If you are getting a response with many repeated tokens, please use the `frequency_penalty` or `repetition_penalty` to reduce the number of repeated tokens in the output.
--- a/docs/source/conceptual/lora.md
+++ b/docs/source/conceptual/lora.md
@ -0,0 +1,65 @@
+# LoRA (Low-Rank Adaptation)
+
+## What is LoRA?
+
+LoRA is a technique that allows for efficent fine-tuning a model while only updating a small portion of the model's weights. This is useful when you have a large model that has been pre-trained on a large dataset, but you want to fine-tune it on a smaller dataset or for a specific task.
+
+LoRA works by adding a small number of additional weights to the model, which are used to adapt the model to the new dataset or task. These additional weights are learned during the fine-tuning process, while the rest of the model's weights are kept fixed.
+
+## How is it used?
+
+LoRA can be used in many ways and the community is always finding new ways to use it. Here are some examples of how you can use LoRA:
+
+Technically, LoRA can be used to fine-tune a large language model on a small dataset. However, these use cases can span a wide range of applications, such as:
+
+- fine-tuning a language model on a small dataset
+- fine-tuning a language model on a domain-specific dataset
+- fine-tuning a language model on a dataset with limited labels
+
+## Optimizing Inference with LoRA
+
+LoRA's can be used during inference by mutliplying the adapter weights with the model weights at each specified layer. This process can be computationally expensive, but due to awesome work by [punica-ai](https://github.com/punica-ai/punica) and the [lorax](https://github.com/predibase/lorax) team, optimized kernels/and frameworks have been developed to make this process more efficient. TGI leverages these optimizations in order to provide fast and efficient inference with mulitple LoRA models.
+
+## Serving multiple LoRA adapters with TGI
+
+Once a LoRA model has been trained, it can be used to generate text or perform other tasks just like a regular language model. However, because the model has been fine-tuned on a specific dataset, it may perform better on that dataset than a model that has not been fine-tuned.
+
+In practice its often useful to have multiple LoRA models, each fine-tuned on a different dataset or for a different task. This allows you to use the model that is best suited for a particular task or dataset.
+
+Text Generation Inference (TGI) now supports loading multiple LoRA models at startup that can be used in generation requests. This feature is available starting from version `~2.0.6` and is compatible with LoRA models trained using the `peft` library.
+
+### Specifying LoRA models
+
+To use LoRA in TGI, when starting the server, you can specify the list of LoRA models to load using the `LORA_ADAPTERS` environment variable. For example:
+
+```bash
+LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia
+```
+
+In the server logs, you will see the following message:
+
+```txt
+Loading adapter weights into model: predibase/customer_support
+Loading adapter weights into model: predibase/dbpedia
+```
+
+## Generate text
+
+You can then use these models in generation requests by specifying the `lora_model` parameter in the request payload. For example:
+
+```json
+curl 127.0.0.1:3000/generate \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+  "inputs": "Hello who are you?",
+  "parameters": {
+    "max_new_tokens": 40,
+    "adapter_id": "predibase/customer_support"
+  }
+}'
+```
+
+> **Note:** The Lora feature is new and still being improved. If you encounter any issues or have any feedback, please let us know by opening an issue on the [GitHub repository](https://github.com/huggingface/text-generation-inference/issues/new/choose). Additionally documentation and an improved client library will be published soon.
+
+An updated tutorial with detailed examples will be published soon. Stay tuned!
--- a/docs/source/conceptual/paged_attention.md
+++ b/docs/source/conceptual/paged_attention.md
@ -0,0 +1,9 @@
+# PagedAttention
+
+LLMs struggle with memory limitations during generation. In the decoding part of generation, all the attention keys and values generated for previous tokens are stored in GPU memory for reuse. This is called _KV cache_, and it may take up a large amount of memory for large models and long sequences.
+
+PagedAttention attempts to optimize memory use by partitioning the KV cache into blocks that are accessed through a lookup table. Thus, the KV cache does not need to be stored in contiguous memory, and blocks are allocated as needed. The memory efficiency can increase GPU utilization on memory-bound workloads, so more inference batches can be supported.
+
+The use of a lookup table to access the memory blocks can also help with KV sharing across multiple generations. This is helpful for techniques such as _parallel sampling_, where multiple outputs are generated simultaneously for the same prompt. In this case, the cached KV blocks can be shared among the generations.
+
+TGI's PagedAttention implementation leverages the custom cuda kernels developed by the [vLLM Project](https://github.com/vllm-project/vllm). You can learn more about this technique in the [project's page](https://vllm.ai/).
--- a/docs/source/conceptual/quantization.md
+++ b/docs/source/conceptual/quantization.md
@ -0,0 +1,59 @@
+# Quantization
+
+TGI offers GPTQ and bits-and-bytes quantization to quantize large language models.
+
+## Quantization with GPTQ
+
+GPTQ is a post-training quantization method to make the model smaller. It quantizes the layers by finding a compressed version of that weight, that will yield a minimum mean squared error like below 👇
+
+Given a layer \\(l\\) with weight matrix \\(W_{l}\\) and layer input \\(X_{l}\\), find quantized weight \\(\\hat{W}_{l}\\):
+
+$$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
+
+
+TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize gptq
+```
+
+Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
+
+To quantize a given model using GPTQ with a calibration dataset, simply run
+
+```bash
+text-generation-server quantize tiiuae/falcon-40b /data/falcon-40b-gptq
+# Add --upload-to-model-id MYUSERNAME/falcon-40b to push the created model to the hub directly
+```
+
+This will create a new directory with the quantized files which you can use with,
+
+```bash
+text-generation-launcher --model-id /data/falcon-40b-gptq/ --sharded true --num-shard 2 --quantize gptq
+```
+
+You can learn more about the quantization options by running `text-generation-server quantize --help`.
+
+If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
+You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
+
+## Quantization with bitsandbytes
+
+bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
+
+8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
+In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes
+```
+
+4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
+
+In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes-nf4
+```
+
+You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
--- a/docs/source/conceptual/safetensors.md
+++ b/docs/source/conceptual/safetensors.md
@ -0,0 +1,7 @@
+# Safetensors
+
+Safetensors is a model serialization format for deep learning models. It is [faster](https://huggingface.co/docs/safetensors/speed) and safer compared to other serialization formats like pickle (which is used under the hood in many deep learning libraries).
+
+TGI depends on safetensors format mainly to enable [tensor parallelism sharding](./tensor_parallelism). For a given model repository during serving, TGI looks for safetensors weights. If there are no safetensors weights, TGI converts the PyTorch weights to safetensors format.
+
+You can learn more about safetensors by reading the [safetensors documentation](https://huggingface.co/docs/safetensors/index).
--- a/docs/source/conceptual/speculation.md
+++ b/docs/source/conceptual/speculation.md
@ -0,0 +1,49 @@
+## Speculation
+
+
+Speculative decoding, assisted generation, Medusa, and others are a few different names for the same idea.
+The idea is to generate tokens *before* the large model actually runs, and only *check* if those tokens where valid.
+
+So you are making *more* computations on your LLM, but if you are correct you produce 1, 2, 3 etc.. tokens on a single LLM pass. Since LLMs are usually memory bound (and not compute bound), provided your guesses are correct enough, this is a 2-3x faster inference (It can be much more for code oriented tasks for instance).
+
+You can check a more [detailed explanation](https://huggingface.co/blog/assisted-generation).
+
+Text-generation inference supports 2 main speculative methods:
+
+- Medusa
+- N-gram
+
+
+### Medusa
+
+
+Medusa is a [simple method](https://arxiv.org/abs/2401.10774) to create many tokens in a single pass using fine-tuned LM heads in addition to your existing models.
+
+
+You can check a few existing  fine-tunes for popular models:
+
+- [text-generation-inference/gemma-7b-it-medusa](https://huggingface.co/text-generation-inference/gemma-7b-it-medusa)
+- [text-generation-inference/Mixtral-8x7B-Instruct-v0.1-medusa](https://huggingface.co/text-generation-inference/Mixtral-8x7B-Instruct-v0.1-medusa)
+- [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa)
+
+
+In order to create your own medusa heads for your own finetune, you should check own the original medusa repo. [../basic_tutorials/train_medusa.md](../basic_tutorials/train_medusa.md)
+
+
+In order to use medusa models in TGI, simply point to a medusa enabled model, and everything will load automatically.
+
+
+### N-gram
+
+
+If you don't have a medusa model, or don't have the resource to fine-tune, you can try to use `n-gram`.
+N-gram works by trying to find matching tokens in the previous sequence, and use those as speculation for generating new tokens. For example, if the tokens "np.mean" appear multiple times in the sequence, the model can speculate that the next continuation of the tokens "np." is probably also "mean".
+
+This is an extremely simple method, which works best for code, or highly repetitive text. This might not be beneficial, if the speculation misses too much.
+
+
+In order to enable n-gram speculation simply use
+
+`--speculate 2` in your flags.
+
+[Details about the flag](https://huggingface.co/docs/text-generation-inference/basic_tutorials/launcher#speculate)
--- a/docs/source/conceptual/streaming.md
+++ b/docs/source/conceptual/streaming.md
@ -5,17 +5,17 @@
 Token streaming is the mode in which the server returns the tokens one by one as the model generates them. This enables showing progressive generations to the user rather than waiting for the whole generation. Streaming is an essential aspect of the end-user experience as it reduces latency, one of the most critical aspects of a smooth experience.

 <div class="flex justify-center">
-    <img 
-        class="block dark:hidden" 
+    <img
+        class="block dark:hidden"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual_360.gif"
    />
-    <img 
-        class="hidden dark:block" 
+    <img
+        class="hidden dark:block"
        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual-dark_360.gif"
    />
 </div>

-With token streaming, the server can start returning the tokens one by one before having to generate the whole response. Users can have a sense of the generation's quality earlier than the end of the generation. This has different positive effects:
+With token streaming, the server can start returning the tokens one by one before having to generate the whole response. Users can have a sense of the generation's quality before the end of the generation. This has different positive effects:

 * Users can get results orders of magnitude earlier for extremely long queries.
 * Seeing something in progress allows users to stop the generation if it's not going in the direction they expect.
@ -25,14 +25,14 @@ With token streaming, the server can start returning the tokens one by one befor
 For example, a system can generate 100 tokens per second. If the system generates 1000 tokens, with the non-streaming setup, users need to wait 10 seconds to get results. On the other hand, with the streaming setup, users get initial results immediately, and although end-to-end latency will be the same, they can see half of the generation after five seconds. Below you can see an interactive demo that shows non-streaming vs streaming side-by-side. Click **generate** below.

 <div class="block dark:hidden">
-	<iframe 
+	<iframe
        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
        width="850"
        height="350"
    ></iframe>
 </div>
 <div class="hidden dark:block">
-    <iframe 
+    <iframe
        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
        width="850"
        height="350"
@ -43,7 +43,7 @@ For example, a system can generate 100 tokens per second. If the system generate

 ### Streaming with Python

-To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate over the response. 
+To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate over the response.

 ```python
 from huggingface_hub import InferenceClient
@ -116,20 +116,20 @@ curl -N 127.0.0.1:8080/generate_stream \
 First, we need to install the `@huggingface/inference` library.
 `npm install @huggingface/inference`

-If you're using the free Inference API, you can use `HfInference`. If you're using inference endpoints, you can use `HfInferenceEndpoint`. Let's 
+If you're using the free Inference API, you can use `HfInference`. If you're using inference endpoints, you can use `HfInferenceEndpoint`.

 We can create a `HfInferenceEndpoint` providing our endpoint URL and credential.

 ```js
-import { HfInference } from '@huggingface/inference'
+import { HfInferenceEndpoint } from '@huggingface/inference'

-const hf = new HfInference('https://YOUR_ENDPOINT.endpoints.huggingface.cloud', 'hf_YOUR_TOKEN')
+const hf = new HfInferenceEndpoint('https://YOUR_ENDPOINT.endpoints.huggingface.cloud', 'hf_YOUR_TOKEN')

 // prompt
 const prompt = 'What can you do in Nuremberg, Germany? Give me 3 Tips'

 const stream = hf.textGenerationStream({ inputs: prompt })
-for await (const r of stream) { 
+for await (const r of stream) {
  // yield the generated token
  process.stdout.write(r.token.text)
 }
@ -143,6 +143,4 @@ SSEs are different than:
 * Polling: where the client keeps calling the server to get data. This means that the server might return empty responses and cause overhead.
 * Webhooks: where there is a bi-directional connection. The server can send information to the client, but the client can also send data to the server after the first request. Webhooks are more complex to operate as they don’t only use HTTP.

-One of the limitations of Server-Sent Events is that they limit how many concurrent requests can handle by the server. Instead of timing out when there are too many SSE connections, TGI returns a HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g. it could display a busy error to the user or it could retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing to handle backpressure.
-
-One of the limitations of Server-Sent Events is that they limit how many concurrent requests can handle by the server. Instead of timing out when there are too many SSE connections, TGI returns an HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g., it could display a busy error to the user or retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing clients to handle backpressure.
+If there are too many requests at the same time, TGI returns an HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g., it could display a busy error to the user or retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing clients to handle backpressure.
--- a/docs/source/conceptual/tensor_parallelism.md
+++ b/docs/source/conceptual/tensor_parallelism.md
@ -0,0 +1,14 @@
+# Tensor Parallelism
+
+Tensor parallelism is a technique used to fit a large model in multiple GPUs. For example, when multiplying the input tensors with the first weight tensor, the matrix multiplication is equivalent to splitting the weight tensor column-wise, multiplying each column with the input separately, and then concatenating the separate outputs. These outputs are then transferred from the GPUs and concatenated together to get the final result, like below 👇
+
+![Image courtesy of Anton Lozkhov](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/TP.png)
+
+
+<Tip warning={true}>
+
+Tensor Parallelism only works for [models officially supported](../supported_models), it will not work when falling back to `transformers`. You can get more information about unsupported models [here](../basic_tutorials/non_core_models).
+
+</Tip>
+
+You can learn a lot more details about tensor-parallelism from [the `transformers` docs](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism).
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -18,7 +18,8 @@ Text Generation Inference implements many optimizations and features, such as:
 - Logits warper (temperature scaling, top-p, top-k, repetition penalty)
 - Stop sequences
 - Log probabilities
-
+- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance.
+- [Guidance](../conceptual/guidance): Enable function calling and tool-use by forcing the model to generate structured outputs based on your own predefined output schemas.

 Text Generation Inference is used in production by multiple projects, such as:

--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@ -1,10 +1,14 @@
-# Installation
+# Installation from source

-This section explains how to install the CLI tool as well as installing TGI from source. **The strongly recommended approach is to use Docker, as it does not require much setup. Check [the Quick Tour](./quicktour) to learn how to run TGI with Docker.**
+<Tip warning={true}>
+
+Installing TGI from source is not the recommended usage. We strongly recommend to use TGI through Docker, check the [Quick Tour](./quicktour), [Installation for Nvidia GPUs](./installation_nvidia) and [Installation for AMD GPUs](./installation_amd) to learn how to use TGI with Docker.
+
+</Tip>

 ## Install CLI

-You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. 
+You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters.

 To install the CLI, you need to first clone the TGI repository and then run `make`.

@ -23,7 +27,7 @@ BUILD_EXTENSIONS=True make install

 Before you start, you will need to setup your environment, and install Text Generation Inference. Text Generation Inference is tested on **Python 3.9+**.

-Text Generation Inference is available on pypi, conda and GitHub. 
+Text Generation Inference is available on pypi, conda and GitHub.

 To install and launch locally, first [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
 Python 3.9, e.g. using conda:
--- a/docs/source/installation_amd.md
+++ b/docs/source/installation_amd.md
@ -0,0 +1,38 @@
+# Using TGI with AMD GPUs
+
+TGI is supported and tested on [AMD Instinct MI210](https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html), [MI250](https://www.amd.com/en/products/accelerators/instinct/mi200/mi250.html) and [MI300](https://www.amd.com/en/products/accelerators/instinct/mi300.html) GPUs. The support may be extended in the future. The recommended usage is through Docker. Make sure to check the [AMD documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html) on how to use Docker with AMD GPUs.
+
+On a server powered by AMD GPUs, TGI can be launched with the following command:
+
+```bash
+model=teknium/OpenHermes-2.5-Mistral-7B
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+    --device=/dev/kfd --device=/dev/dri --group-add video \
+    --ipc=host --shm-size 256g --net host -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.1.1-rocm \
+    --model-id $model
+```
+
+The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
+
+## TunableOp
+
+TGI's docker image for AMD GPUs integrates [PyTorch's TunableOp](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable), which allows to do an additional warmup to select the best performing matrix multiplication (GEMM) kernel from rocBLAS or hipBLASLt.
+
+Experimentally, on MI300X, we noticed a 6-8% latency improvement when using TunableOp on top of ROCm 6.1 and PyTorch 2.3.
+
+TunableOp is enabled by default, the warmup may take 1-2 minutes. In case you would like to disable TunableOp, please pass `--env PYTORCH_TUNABLEOP_ENABLED="0"` when launcher TGI's docker container.
+
+## Flash attention implementation
+
+Two implementations of Flash Attention are available for ROCm, the first is [ROCm/flash-attention](https://github.com/ROCm/flash-attention) based on a [Composable Kernel](https://github.com/ROCm/composable_kernel) (CK) implementation, and the second is a [Triton implementation](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/layers/attention/flash_attn_triton.py).
+
+By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, FA Triton impelmentation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container.
+
+## Unsupported features
+
+The following features are currently not supported in the ROCm version of TGI, and the supported may be extended in the future:
+* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
+* Kernel for sliding window attention (Mistral)
--- a/docs/source/installation_gaudi.md
+++ b/docs/source/installation_gaudi.md
@ -0,0 +1,3 @@
+# Using TGI with Intel Gaudi
+
+Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index).
--- a/docs/source/installation_inferentia.md
+++ b/docs/source/installation_inferentia.md
@ -0,0 +1,3 @@
+# Using TGI with Inferentia
+
+Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
--- a/docs/source/installation_nvidia.md
+++ b/docs/source/installation_nvidia.md
@ -0,0 +1,18 @@
+# Using TGI with Nvidia GPUs
+
+TGI optimized models are supported on NVIDIA [H100](https://www.nvidia.com/en-us/data-center/h100/), [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it.
+
+For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
+
+TGI can be used on NVIDIA GPUs through its official docker image:
+
+```bash
+model=teknium/OpenHermes-2.5-Mistral-7B
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.1.1 \
+    --model-id $model
+```
+
+The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
--- a/docs/source/messages_api.md
+++ b/docs/source/messages_api.md
@ -0,0 +1,175 @@
+# Messages API
+
+Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
+
+> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.
+
+#### Table of Contents
+
+- [Making a Request](#making-a-request)
+- [Streaming](#streaming)
+- [Synchronous](#synchronous)
+- [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
+- [Cloud Providers](#cloud-providers)
+  - [Amazon SageMaker](#amazon-sagemaker)
+
+## Making a Request
+
+You can make a request to TGI's Messages API using `curl`. Here's an example:
+
+```bash
+curl localhost:3000/v1/chat/completions \
+    -X POST \
+    -d '{
+  "model": "tgi",
+  "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "What is deep learning?"
+    }
+  ],
+  "stream": true,
+  "max_tokens": 20
+}' \
+    -H 'Content-Type: application/json'
+```
+
+## Streaming
+
+You can also use OpenAI's Python client library to make a streaming request. Here's how:
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    base_url="http://localhost:3000/v1",
+    api_key="-"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=True
+)
+
+# iterate and print stream
+for message in chat_completion:
+    print(message)
+```
+
+## Synchronous
+
+If you prefer to make a synchronous request, you can do so like this:
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    base_url="http://localhost:3000/v1",
+    api_key="-"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=False
+)
+
+print(chat_completion)
+```
+
+## Hugging Face Inference Endpoints
+
+The Messages API is integrated with [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated).
+Every endpoint that uses "Text Generation Inference" with an LLM, which has a chat template can now be used. Below is an example of how to use IE with TGI using OpenAI's Python client library:
+
+> **Note:** Make sure to replace `base_url` with your endpoint URL and to include `v1/` at the end of the URL. The `api_key` should be replaced with your Hugging Face API key.
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    # replace with your endpoint url, make sure to include "v1/" at the end
+    base_url="https://vlzz10eq3fol3429.us-east-1.aws.endpoints.huggingface.cloud/v1/",
+    # replace with your API key
+    api_key="hf_XXX"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=True
+)
+
+# iterate and print stream
+for message in chat_completion:
+    print(message.choices[0].delta.content, end="")
+```
+
+## Cloud Providers
+
+TGI can be deployed on various cloud providers for scalable and robust text generation. One such provider is Amazon SageMaker, which has recently added support for TGI. Here's how you can deploy TGI on Amazon SageMaker:
+
+## Amazon SageMaker
+
+To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`.
+
+This will modify the `/invocations` route to accept Messages dictonaries consisting out of role and content. See the example below on how to deploy Llama with the new Messages API.
+
+```python
+import json
+import sagemaker
+import boto3
+from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
+
+try:
+ role = sagemaker.get_execution_role()
+except ValueError:
+ iam = boto3.client('iam')
+ role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
+
+# Hub Model configuration. https://huggingface.co/models
+hub = {
+ 'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
+ 'SM_NUM_GPUS': json.dumps(1),
+ 'MESSAGES_API_ENABLED': True
+}
+
+# create Hugging Face Model Class
+huggingface_model = HuggingFaceModel(
+ image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"),
+ env=hub,
+ role=role,
+)
+
+# deploy model to SageMaker Inference
+predictor = huggingface_model.deploy(
+ initial_instance_count=1,
+ instance_type="ml.g5.2xlarge",
+ container_startup_health_check_timeout=300,
+  )
+
+# send request
+predictor.predict({
+"messages": [
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ]
+})
+```
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@ -2,24 +2,27 @@

 The easiest way of getting started is using the official Docker container. Install Docker following [their installation instructions](https://docs.docker.com/get-docker/).

-Let's say you want to deploy [Falcon-7B Instruct](https://huggingface.co/tiiuae/falcon-7b-instruct) model with TGI. Here is an example on how to do that:
+## Launching TGI
+
+Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI on an Nvidia GPU. Here is an example on how to do that:

 ```bash
-model=tiiuae/falcon-7b-instruct
+model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.1.1 \
+    --model-id $model
 ```

-<Tip warning={true}>
+### Supported hardware

-To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)  . We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
+TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.

-</Tip>
+## Consuming TGI

 Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.

-
 <inferencesnippet>
 <python>

@ -47,7 +50,7 @@ print(response.json())
 ```js
 async function query() {
    const response = await fetch(
-        'http://127.0.0.1:8080/generate', 
+        'http://127.0.0.1:8080/generate',
        {
            method: 'POST',
            headers: { 'Content-Type': 'application/json'},
@ -85,7 +88,7 @@ curl 127.0.0.1:8080/generate \
 To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.

 ```bash
-docker run ghcr.io/huggingface/text-generation-inference:1.0.2 --help
+docker run ghcr.io/huggingface/text-generation-inference:2.1.1 --help
 ```

 </Tip>
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@ -1,23 +1,38 @@
+
 # Supported Models and Hardware

 Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models are hardware are supported.

 ## Supported Models

-The following models are optimized and can be served with TGI, which uses custom CUDA kernels for better inference. You can add the flag `--disable-custom-kernels` at the end of the `docker run` command if you wish to disable them.
-
- [BLOOM](https://huggingface.co/bigscience/bloom)
- [FLAN-T5](https://huggingface.co/google/flan-t5-xxl)
+- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
+- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
+- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
+- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
+- [Gemma](https://huggingface.co/google/gemma-7b)
+- [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224)
+- [Gemma2](https://huggingface.co/google/gemma2-9b)
+- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
+- [Dbrx](https://huggingface.co/databricks/dbrx-instruct)
+- [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj)
+- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)
+- [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder)
+- [Phi](https://huggingface.co/microsoft/phi-1_5)
+- [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)
+- [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct)
+- [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1)
+- [Qwen 2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f)
+- [Opt](https://huggingface.co/facebook/opt-6.7b)
+- [T5](https://huggingface.co/google/flan-t5-xxl)
 - [Galactica](https://huggingface.co/facebook/galactica-120b)
- [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
- [Llama](https://github.com/facebookresearch/llama)
- [OPT](https://huggingface.co/facebook/opt-66b)
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
- [Starcoder](https://huggingface.co/bigcode/starcoder)
- [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b)
- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
- [MPT](https://huggingface.co/mosaicml/mpt-30b)
- [Llama V2](https://huggingface.co/meta-llama)
+- [Bloom](https://huggingface.co/bigscience/bloom-560m)
+- [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct)
+- [Gpt2](https://huggingface.co/openai-community/gpt2)
+- [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
+- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)
+

 If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:

@ -28,13 +43,8 @@ AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
 AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")
 ```

+If you wish to serve a supported model that already exists on a local folder, just point to the local folder.

-## Supported Hardware
-
-TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 11.8+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other hardware, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed. 
-
-TGI is also supported on the following AI hardware accelerators:
- *Habana first-gen Gaudi and Gaudi2:* check out this [example](https://github.com/huggingface/optimum-habana/tree/main/text-generation-inference) how to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
-
-
-
+```bash
+text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
+```
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@ -1,29 +1,66 @@
-import sys
-import subprocess
-import contextlib
-import pytest
 import asyncio
-import os
-import docker
+import contextlib
 import json
 import math
-import time
+import os
 import random
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from typing import Dict, List, Optional

-from docker.errors import NotFound
-from typing import Optional, List, Dict
-from syrupy.extensions.json import JSONSnapshotExtension
+import docker
+import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
-
+from docker.errors import NotFound
+from syrupy.extensions.json import JSONSnapshotExtension
 from text_generation import AsyncClient
-from text_generation.types import Response, Details, InputToken, Token, BestOfSequence
+from text_generation.types import (
+    BestOfSequence,
+    ChatComplete,
+    ChatCompletionChunk,
+    ChatCompletionComplete,
+    Completion,
+    Details,
+    Grammar,
+    InputToken,
+    Response,
+    Token,
+)

 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
-HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN", None)
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
+DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--release", action="store_true", default=False, help="run release tests"
+    )
+
+
+def pytest_configure(config):
+    config.addinivalue_line("markers", "release: mark test as a release-only test")
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--release"):
+        # --release given in cli: do not skip release tests
+        return
+    skip_release = pytest.mark.skip(reason="need --release option to run")
+    for item in items:
+        if "release" in item.keywords:
+            item.add_marker(skip_release)


 class ResponseComparator(JSONSnapshotExtension):
+    rtol = 0.2
+    ignore_logprob = False
+
    def serialize(
        self,
        data,
@ -31,8 +68,16 @@ class ResponseComparator(JSONSnapshotExtension):
        exclude=None,
        matcher=None,
    ):
+        if (
+            isinstance(data, Response)
+            or isinstance(data, ChatComplete)
+            or isinstance(data, ChatCompletionChunk)
+            or isinstance(data, ChatCompletionComplete)
+        ):
+            data = data.model_dump()
+
        if isinstance(data, List):
-            data = [d.dict() for d in data]
+            data = [d.model_dump() for d in data]

        data = self._filter(
            data=data, depth=0, path=(), exclude=exclude, matcher=matcher
@ -47,10 +92,24 @@ class ResponseComparator(JSONSnapshotExtension):
    ) -> bool:
        def convert_data(data):
            data = json.loads(data)
+            if isinstance(data, Dict) and "choices" in data:
+                choices = data["choices"]
+                if isinstance(choices, List) and len(choices) >= 1:
+                    if "delta" in choices[0]:
+                        return ChatCompletionChunk(**data)
+                    if "text" in choices[0]:
+                        return Completion(**data)
+                return ChatComplete(**data)

            if isinstance(data, Dict):
                return Response(**data)
            if isinstance(data, List):
+                if (
+                    len(data) > 0
+                    and "object" in data[0]
+                    and data[0]["object"] == "text_completion"
+                ):
+                    return [Completion(**d) for d in data]
                return [Response(**d) for d in data]
            raise NotImplementedError

@ -58,7 +117,10 @@ class ResponseComparator(JSONSnapshotExtension):
            return (
                token.id == other.id
                and token.text == other.text
-                and math.isclose(token.logprob, other.logprob, rel_tol=0.2)
+                and (
+                    self.ignore_logprob
+                    or math.isclose(token.logprob, other.logprob, rel_tol=self.rtol)
+                )
                and token.special == other.special
            )

@ -68,7 +130,12 @@ class ResponseComparator(JSONSnapshotExtension):
                    prefill_token.id == other.id
                    and prefill_token.text == other.text
                    and (
-                        math.isclose(prefill_token.logprob, other.logprob, rel_tol=0.2)
+                        self.ignore_logprob
+                        or math.isclose(
+                            prefill_token.logprob,
+                            other.logprob,
+                            rel_tol=self.rtol,
+                        )
                        if prefill_token.logprob is not None
                        else prefill_token.logprob == other.logprob
                    )
@ -130,6 +197,19 @@ class ResponseComparator(JSONSnapshotExtension):
                )
            )

+        def eq_completion(response: Completion, other: Completion) -> bool:
+            return response.choices[0].text == other.choices[0].text
+
+        def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
+            return (
+                response.choices[0].message.content == other.choices[0].message.content
+            )
+
+        def eq_chat_complete_chunk(
+            response: ChatCompletionChunk, other: ChatCompletionChunk
+        ) -> bool:
+            return response.choices[0].delta.content == other.choices[0].delta.content
+
        def eq_response(response: Response, other: Response) -> bool:
            return response.generated_text == other.generated_text and eq_details(
                response.details, other.details
@ -143,11 +223,38 @@ class ResponseComparator(JSONSnapshotExtension):
        if not isinstance(snapshot_data, List):
            snapshot_data = [snapshot_data]

+        if isinstance(serialized_data[0], Completion):
+            return len(snapshot_data) == len(serialized_data) and all(
+                [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
+            )
+
+        if isinstance(serialized_data[0], ChatComplete):
+            return len(snapshot_data) == len(serialized_data) and all(
+                [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
+            )
+
+        if isinstance(serialized_data[0], ChatCompletionChunk):
+            return len(snapshot_data) == len(serialized_data) and all(
+                [
+                    eq_chat_complete_chunk(r, o)
+                    for r, o in zip(serialized_data, snapshot_data)
+                ]
+            )
+
        return len(snapshot_data) == len(serialized_data) and all(
            [eq_response(r, o) for r, o in zip(serialized_data, snapshot_data)]
        )


+class GenerousResponseComparator(ResponseComparator):
+    # Needed for GPTQ with exllama which has serious numerical fluctuations.
+    rtol = 0.75
+
+
+class IgnoreLogProbResponseComparator(ResponseComparator):
+    ignore_logprob = True
+
+
 class LauncherHandle:
    def __init__(self, port: int):
        self.client = AsyncClient(f"http://localhost:{port}")
@ -194,6 +301,16 @@ def response_snapshot(snapshot):
    return snapshot.use_extension(ResponseComparator)


+@pytest.fixture
+def generous_response_snapshot(snapshot):
+    return snapshot.use_extension(GenerousResponseComparator)
+
+
+@pytest.fixture
+def ignore_logprob_response_snapshot(snapshot):
+    return snapshot.use_extension(IgnoreLogProbResponseComparator)
+
+
@pytest.fixture(scope="module")
 def event_loop():
    loop = asyncio.get_event_loop()
@ -210,6 +327,12 @@ def launcher(event_loop):
        quantize: Optional[str] = None,
        trust_remote_code: bool = False,
        use_flash_attention: bool = True,
+        disable_grammar_support: bool = False,
+        dtype: Optional[str] = None,
+        revision: Optional[str] = None,
+        max_input_length: Optional[int] = None,
+        max_batch_prefill_tokens: Optional[int] = None,
+        max_total_tokens: Optional[int] = None,
    ):
        port = random.randint(8000, 10_000)
        master_port = random.randint(10_000, 20_000)
@ -232,32 +355,52 @@ def launcher(event_loop):

        env = os.environ

+        if disable_grammar_support:
+            args.append("--disable-grammar-support")
        if num_shard is not None:
            args.extend(["--num-shard", str(num_shard)])
        if quantize is not None:
            args.append("--quantize")
            args.append(quantize)
+        if dtype is not None:
+            args.append("--dtype")
+            args.append(dtype)
+        if revision is not None:
+            args.append("--revision")
+            args.append(revision)
        if trust_remote_code:
            args.append("--trust-remote-code")
+        if max_input_length:
+            args.append("--max-input-length")
+            args.append(str(max_input_length))
+        if max_batch_prefill_tokens:
+            args.append("--max-batch-prefill-tokens")
+            args.append(str(max_batch_prefill_tokens))
+        if max_total_tokens:
+            args.append("--max-total-tokens")
+            args.append(str(max_total_tokens))

        env["LOG_LEVEL"] = "info,text_generation_router=debug"

        if not use_flash_attention:
            env["USE_FLASH_ATTENTION"] = "false"

-        with subprocess.Popen(
-            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env
-        ) as process:
-            yield ProcessLauncherHandle(process, port)
+        with tempfile.TemporaryFile("w+") as tmp:
+            # We'll output stdout/stderr to a temporary file. Using a pipe
+            # cause the process to block until stdout is read.
+            with subprocess.Popen(
+                args,
+                stdout=tmp,
+                stderr=subprocess.STDOUT,
+                env=env,
+            ) as process:
+                yield ProcessLauncherHandle(process, port)

-            process.terminate()
-            process.wait(60)
+                process.terminate()
+                process.wait(60)

-            launcher_output = process.stdout.read().decode("utf-8")
-            print(launcher_output, file=sys.stderr)
-
-            process.stdout.close()
-            process.stderr.close()
+                tmp.seek(0)
+                shutil.copyfileobj(tmp, sys.stderr)

        if not use_flash_attention:
            del env["USE_FLASH_ATTENTION"]
@ -269,18 +412,41 @@ def launcher(event_loop):
        quantize: Optional[str] = None,
        trust_remote_code: bool = False,
        use_flash_attention: bool = True,
+        disable_grammar_support: bool = False,
+        dtype: Optional[str] = None,
+        revision: Optional[str] = None,
+        max_input_length: Optional[int] = None,
+        max_batch_prefill_tokens: Optional[int] = None,
+        max_total_tokens: Optional[int] = None,
    ):
        port = random.randint(8000, 10_000)

        args = ["--model-id", model_id, "--env"]

+        if disable_grammar_support:
+            args.append("--disable-grammar-support")
        if num_shard is not None:
            args.extend(["--num-shard", str(num_shard)])
        if quantize is not None:
            args.append("--quantize")
            args.append(quantize)
+        if dtype is not None:
+            args.append("--dtype")
+            args.append(dtype)
+        if revision is not None:
+            args.append("--revision")
+            args.append(revision)
        if trust_remote_code:
            args.append("--trust-remote-code")
+        if max_input_length:
+            args.append("--max-input-length")
+            args.append(str(max_input_length))
+        if max_batch_prefill_tokens:
+            args.append("--max-batch-prefill-tokens")
+            args.append(str(max_batch_prefill_tokens))
+        if max_total_tokens:
+            args.append("--max-total-tokens")
+            args.append(str(max_total_tokens))

        client = docker.from_env()

@ -295,17 +461,31 @@ def launcher(event_loop):

        gpu_count = num_shard if num_shard is not None else 1

-        env = {"LOG_LEVEL": "info,text_generation_router=debug"}
+        env = {
+            "LOG_LEVEL": "info,text_generation_router=debug",
+        }
        if not use_flash_attention:
            env["USE_FLASH_ATTENTION"] = "false"

-        if HUGGING_FACE_HUB_TOKEN is not None:
-            env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
+        if HF_TOKEN is not None:
+            env["HF_TOKEN"] = HF_TOKEN

        volumes = []
        if DOCKER_VOLUME:
            volumes = [f"{DOCKER_VOLUME}:/data"]

+        if DOCKER_DEVICES:
+            devices = DOCKER_DEVICES.split(",")
+            visible = os.getenv("ROCR_VISIBLE_DEVICES")
+            if visible:
+                env["ROCR_VISIBLE_DEVICES"] = visible
+            device_requests = []
+        else:
+            devices = []
+            device_requests = [
+                docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
+            ]
+
        container = client.containers.run(
            DOCKER_IMAGE,
            command=args,
@ -313,11 +493,11 @@ def launcher(event_loop):
            environment=env,
            auto_remove=False,
            detach=True,
-            device_requests=[
-                docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
-            ],
+            device_requests=device_requests,
+            devices=devices,
            volumes=volumes,
            ports={"80/tcp": port},
+            shm_size="1G",
        )

        yield ContainerLauncherHandle(client, container.name, port)
@ -344,11 +524,22 @@ def launcher(event_loop):
@pytest.fixture(scope="module")
 def generate_load():
    async def generate_load_inner(
-        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
+        client: AsyncClient,
+        prompt: str,
+        max_new_tokens: int,
+        n: int,
+        seed: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
+        stop_sequences: Optional[List[str]] = None,
    ) -> List[Response]:
        futures = [
            client.generate(
-                prompt, max_new_tokens=max_new_tokens, decoder_input_details=True
+                prompt,
+                max_new_tokens=max_new_tokens,
+                decoder_input_details=True,
+                seed=seed,
+                grammar=grammar,
+                stop_sequences=stop_sequences,
            )
            for _ in range(n)
        ]
--- a/integration-tests/images/chicken_on_money.png
+++ b/integration-tests/images/chicken_on_money.png
--- a/integration-tests/images/cow_beach.png
+++ b/integration-tests/images/cow_beach.png
--- a/integration-tests/models/snapshots/test_chat_llama/test_flash_llama_simple.json
+++ b/integration-tests/models/snapshots/test_chat_llama/test_flash_llama_simple.json
@ -0,0 +1,26 @@
+{
+  "choices": [
+    {
+      "finish_reason": "length",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": "As of your last question, the weather in Brooklyn, New York, is typically hot and humid throughout the year. The suburbs around New York City are jealously sheltered, and at least in the Lower Bronx, there are very few outdoor environments to explore in the middle of urban confines. In fact, typical times for humidity levels in Brooklyn include:\n\n- Early morning: 80-85% humidity, with occas",
+        "name": null,
+        "role": "assistant",
+        "tool_calls": null
+      },
+      "usage": null
+    }
+  ],
+  "created": 1716553098,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.5-dev0-native",
+  "usage": {
+    "completion_tokens": 100,
+    "prompt_tokens": 62,
+    "total_tokens": 162
+  }
+}
--- a/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_many_prompts.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_many_prompts.json
@ -0,0 +1,38 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 1,
+      "logprobs": null,
+      "text": " PR for more information?"
+    },
+    {
+      "finish_reason": "length",
+      "index": 0,
+      "logprobs": null,
+      "text": "le Business Incubator is providing a workspace"
+    },
+    {
+      "finish_reason": "length",
+      "index": 2,
+      "logprobs": null,
+      "text": " severely flawed and often has a substandard"
+    },
+    {
+      "finish_reason": "length",
+      "index": 3,
+      "logprobs": null,
+      "text": "hd20220811-"
+    }
+  ],
+  "created": 1713284455,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.1-native",
+  "usage": {
+    "completion_tokens": 36,
+    "prompt_tokens": 8,
+    "total_tokens": 44
+  }
+}
--- a/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json
@ -0,0 +1,602 @@
+[
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "hd"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "aho"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "2"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "2"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": "2"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "ima"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "."
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "."
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": "."
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": " Sarah"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": " Yes"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " And"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "i"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "'"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": ","
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " what"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "'"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "s"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": " Moh"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " is"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "m"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": " Room"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "s"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " the"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": " tired"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": ":"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "'"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " capital"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": " of"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": " She"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": " scale"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " of"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": " being"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  }
+]
--- a/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_single_prompt.json
+++ b/integration-tests/models/snapshots/test_completion_prompts/test_flash_llama_completion_single_prompt.json
@ -0,0 +1,20 @@
+{
+  "choices": [
+    {
+      "finish_reason": "length",
+      "index": 0,
+      "logprobs": null,
+      "text": " PR for flake8"
+    }
+  ],
+  "created": 1713284454,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.1-native",
+  "usage": {
+    "completion_tokens": 5,
+    "prompt_tokens": 6,
+    "total_tokens": 11
+  }
+}
--- a/integration-tests/models/snapshots/test_flash_awq/test_flash_llama_awq.json
+++ b/integration-tests/models/snapshots/test_flash_awq/test_flash_llama_awq.json
@ -0,0 +1,104 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 1724,
+        "logprob": -7.703125,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -1.4765625,
+        "text": "is"
+      },
+      {
+        "id": 21784,
+        "logprob": -9.390625,
+        "text": "Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -1.8583984,
+        "text": "Learning"
+      },
+      {
+        "id": 29973,
+        "logprob": -0.7548828,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.9306641,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 5618,
+        "logprob": -2.4550781,
+        "special": false,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -0.5732422,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 278,
+        "logprob": -1.5761719,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 4328,
+        "logprob": -1.5888672,
+        "special": false,
+        "text": " difference"
+      },
+      {
+        "id": 1546,
+        "logprob": -0.026504517,
+        "special": false,
+        "text": " between"
+      },
+      {
+        "id": 21784,
+        "logprob": -1.4287109,
+        "special": false,
+        "text": " Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -0.15856934,
+        "special": false,
+        "text": " Learning"
+      },
+      {
+        "id": 322,
+        "logprob": -0.17456055,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 6189,
+        "logprob": -0.62646484,
+        "special": false,
+        "text": " Machine"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+}
--- a/integration-tests/models/snapshots/test_flash_awq/test_flash_llama_awq_all_params.json
+++ b/integration-tests/models/snapshots/test_flash_awq/test_flash_llama_awq_all_params.json
@ -0,0 +1,99 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 338,
+        "logprob": -9.0859375,
+        "text": "is"
+      },
+      {
+        "id": 21784,
+        "logprob": -10.90625,
+        "text": "Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -2.65625,
+        "text": "Learning"
+      },
+      {
+        "id": 29973,
+        "logprob": -4.8085938,
+        "text": "?"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -0.19958496,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 4013,
+        "logprob": -2.203125,
+        "special": false,
+        "text": "This"
+      },
+      {
+        "id": 1139,
+        "logprob": -0.23693848,
+        "special": false,
+        "text": " question"
+      },
+      {
+        "id": 756,
+        "logprob": 0.0,
+        "special": false,
+        "text": " has"
+      },
+      {
+        "id": 1063,
+        "logprob": -0.076538086,
+        "special": false,
+        "text": " been"
+      },
+      {
+        "id": 4433,
+        "logprob": 0.0,
+        "special": false,
+        "text": " asked"
+      },
+      {
+        "id": 1784,
+        "logprob": -1.1367188,
+        "special": false,
+        "text": " many"
+      },
+      {
+        "id": 3064,
+        "logprob": 0.0,
+        "special": false,
+        "text": " times"
+      },
+      {
+        "id": 322,
+        "logprob": -1.7460938,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 306,
+        "logprob": 0.0,
+        "special": false,
+        "text": " I"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "What is Deep Learning?\nThis question has been asked many times and I"
+}
--- a/integration-tests/models/snapshots/test_flash_awq/test_flash_llama_awq_load.json
+++ b/integration-tests/models/snapshots/test_flash_awq/test_flash_llama_awq_load.json
@ -0,0 +1,418 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.703125,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4765625,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8652344,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7548828,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9306641,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4550781,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.5732422,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5761719,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5888672,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.026504517,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4287109,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15856934,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62646484,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.703125,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4765625,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8583984,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7548828,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9306641,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4550781,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.5732422,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5761719,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5888672,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.026504517,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4287109,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15856934,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62646484,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.703125,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4765625,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8652344,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7548828,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9306641,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4550781,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.5732422,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5761719,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5888672,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.026504517,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4287109,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15856934,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62646484,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.703125,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4765625,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8652344,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7548828,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9306641,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4550781,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.5732422,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5761719,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5888672,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.026504517,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4287109,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15856934,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62646484,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  }
+]
--- a/integration-tests/models/snapshots/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
+++ b/integration-tests/models/snapshots/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
@ -0,0 +1,418 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.6914062,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4746094,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8623047,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7558594,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9228516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4609375,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.57177734,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5722656,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5859375,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.02633667,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4335938,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15991211,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62060547,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.6914062,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4746094,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8623047,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7558594,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9228516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4609375,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.57177734,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5722656,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5859375,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.02633667,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4335938,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15991211,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62060547,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.6914062,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4746094,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8623047,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7558594,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9228516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4609375,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.57177734,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5722656,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5859375,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.02633667,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4335938,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15991211,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62060547,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.6914062,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4746094,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8623047,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7558594,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9228516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4609375,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.57177734,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5722656,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5859375,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.02633667,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4335938,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15991211,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62060547,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  }
+]
--- a/integration-tests/models/snapshots/test_flash_awq_sharded/test_flash_llama_awq_sharded.json
+++ b/integration-tests/models/snapshots/test_flash_awq_sharded/test_flash_llama_awq_sharded.json
@ -0,0 +1,104 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 1724,
+        "logprob": -7.6914062,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -1.4746094,
+        "text": "is"
+      },
+      {
+        "id": 21784,
+        "logprob": -9.390625,
+        "text": "Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -1.8623047,
+        "text": "Learning"
+      },
+      {
+        "id": 29973,
+        "logprob": -0.7558594,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.9228516,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 5618,
+        "logprob": -2.4609375,
+        "special": false,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -0.57177734,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 278,
+        "logprob": -1.5722656,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 4328,
+        "logprob": -1.5927734,
+        "special": false,
+        "text": " difference"
+      },
+      {
+        "id": 1546,
+        "logprob": -0.026428223,
+        "special": false,
+        "text": " between"
+      },
+      {
+        "id": 21784,
+        "logprob": -1.4267578,
+        "special": false,
+        "text": " Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -0.16015625,
+        "special": false,
+        "text": " Learning"
+      },
+      {
+        "id": 322,
+        "logprob": -0.17382812,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 6189,
+        "logprob": -0.62060547,
+        "special": false,
+        "text": " Machine"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+}
--- a/integration-tests/models/snapshots/test_flash_gemma/test_flash_gemma.json
+++ b/integration-tests/models/snapshots/test_flash_gemma/test_flash_gemma.json
@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 2015,
+        "logprob": -10.0,
+        "text": "Test"
+      },
+      {
+        "id": 3853,
+        "logprob": -10.875,
+        "text": " request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 1736,
+        "logprob": -2.09375,
+        "special": false,
+        "text": " form"
+      },
+      {
+        "id": 109,
+        "logprob": -1.8671875,
+        "special": false,
+        "text": "\n\n"
+      },
+      {
+        "id": 651,
+        "logprob": -2.4375,
+        "special": false,
+        "text": "The"
+      },
+      {
+        "id": 2121,
+        "logprob": -1.8203125,
+        "special": false,
+        "text": " test"
+      },
+      {
+        "id": 3853,
+        "logprob": -0.23242188,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 1736,
+        "logprob": -0.08544922,
+        "special": false,
+        "text": " form"
+      },
+      {
+        "id": 603,
+        "logprob": -0.9375,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 1671,
+        "logprob": -1.671875,
+        "special": false,
+        "text": " used"
+      },
+      {
+        "id": 577,
+        "logprob": -0.40429688,
+        "special": false,
+        "text": " to"
+      },
+      {
+        "id": 3853,
+        "logprob": -1.1875,
+        "special": false,
+        "text": " request"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " form\n\nThe test request form is used to request"
+}
--- a/integration-tests/models/snapshots/test_flash_gemma/test_flash_gemma_all_params.json
+++ b/integration-tests/models/snapshots/test_flash_gemma/test_flash_gemma_all_params.json
@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 2015,
+        "logprob": -10.0,
+        "text": "Test"
+      },
+      {
+        "id": 3853,
+        "logprob": -10.875,
+        "text": " request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 7539,
+        "logprob": -0.73046875,
+        "special": false,
+        "text": " forms"
+      },
+      {
+        "id": 708,
+        "logprob": 0.0,
+        "special": false,
+        "text": " are"
+      },
+      {
+        "id": 671,
+        "logprob": -1.703125,
+        "special": false,
+        "text": " an"
+      },
+      {
+        "id": 8727,
+        "logprob": 0.0,
+        "special": false,
+        "text": " essential"
+      },
+      {
+        "id": 1702,
+        "logprob": 0.0,
+        "special": false,
+        "text": " part"
+      },
+      {
+        "id": 576,
+        "logprob": 0.0,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 573,
+        "logprob": 0.0,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 11859,
+        "logprob": -1.6953125,
+        "special": false,
+        "text": " lab"
+      },
+      {
+        "id": 2185,
+        "logprob": -1.3125,
+        "special": false,
+        "text": " process"
+      },
+      {
+        "id": 578,
+        "logprob": -1.5,
+        "special": false,
+        "text": " and"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request forms are an essential part of the lab process and"
+}
--- a/integration-tests/models/snapshots/test_flash_gemma/test_flash_gemma_load.json
+++ b/integration-tests/models/snapshots/test_flash_gemma/test_flash_gemma_load.json
@ -0,0 +1,358 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -10.0,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 1736,
+          "logprob": -2.09375,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 109,
+          "logprob": -1.9140625,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 651,
+          "logprob": -2.453125,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 2121,
+          "logprob": -1.8984375,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 3853,
+          "logprob": -0.23535156,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 1736,
+          "logprob": -0.091308594,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 603,
+          "logprob": -0.96875,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 1671,
+          "logprob": -1.6484375,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 577,
+          "logprob": -0.43164062,
+          "special": false,
+          "text": " to"
+        },
+        {
+          "id": 3853,
+          "logprob": -1.2421875,
+          "special": false,
+          "text": " request"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " form\n\nThe test request form is used to request"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -10.0,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 1736,
+          "logprob": -2.09375,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 109,
+          "logprob": -1.9140625,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 651,
+          "logprob": -2.453125,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 2121,
+          "logprob": -1.8984375,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 3853,
+          "logprob": -0.23535156,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 1736,
+          "logprob": -0.091308594,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 603,
+          "logprob": -0.96875,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 1671,
+          "logprob": -1.6484375,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 577,
+          "logprob": -0.43164062,
+          "special": false,
+          "text": " to"
+        },
+        {
+          "id": 3853,
+          "logprob": -1.2421875,
+          "special": false,
+          "text": " request"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " form\n\nThe test request form is used to request"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -10.0,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 1736,
+          "logprob": -2.09375,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 109,
+          "logprob": -1.9140625,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 651,
+          "logprob": -2.453125,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 2121,
+          "logprob": -1.8984375,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 3853,
+          "logprob": -0.23535156,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 1736,
+          "logprob": -0.091308594,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 603,
+          "logprob": -0.96875,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 1671,
+          "logprob": -1.6484375,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 577,
+          "logprob": -0.43164062,
+          "special": false,
+          "text": " to"
+        },
+        {
+          "id": 3853,
+          "logprob": -1.2421875,
+          "special": false,
+          "text": " request"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " form\n\nThe test request form is used to request"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -10.0,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 1736,
+          "logprob": -2.09375,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 109,
+          "logprob": -1.9140625,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 651,
+          "logprob": -2.453125,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 2121,
+          "logprob": -1.8984375,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 3853,
+          "logprob": -0.23535156,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 1736,
+          "logprob": -0.091308594,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 603,
+          "logprob": -0.96875,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 1671,
+          "logprob": -1.6484375,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 577,
+          "logprob": -0.43164062,
+          "special": false,
+          "text": " to"
+        },
+        {
+          "id": 3853,
+          "logprob": -1.2421875,
+          "special": false,
+          "text": " request"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " form\n\nThe test request form is used to request"
+  }
+]
--- a/integration-tests/models/snapshots/test_flash_gemma_gptq/test_flash_gemma_gptq.json
+++ b/integration-tests/models/snapshots/test_flash_gemma_gptq/test_flash_gemma_gptq.json
@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 2015,
+        "logprob": -9.640625,
+        "text": "Test"
+      },
+      {
+        "id": 3853,
+        "logprob": -10.34375,
+        "text": " request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 604,
+        "logprob": -2.4296875,
+        "special": false,
+        "text": " for"
+      },
+      {
+        "id": 573,
+        "logprob": -2.4453125,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 2412,
+        "logprob": -2.8632812,
+        "special": false,
+        "text": " following"
+      },
+      {
+        "id": 235292,
+        "logprob": -2.1328125,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 109,
+        "logprob": -0.76660156,
+        "special": false,
+        "text": "\n\n"
+      },
+      {
+        "id": 235287,
+        "logprob": -1.3837891,
+        "special": false,
+        "text": "*"
+      },
+      {
+        "id": 235248,
+        "logprob": -1.9746094,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 199,
+        "logprob": -1.4189453,
+        "special": false,
+        "text": "<strong>"
+      },
+      {
+        "id": 1232,
+        "logprob": -4.34375,
+        "special": false,
+        "text": "Name"
+      },
+      {
+        "id": 208,
+        "logprob": -0.8852539,
+        "special": false,
+        "text": "</strong>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " for the following:\n\n* <strong>Name</strong>"
+}
--- a/integration-tests/models/snapshots/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json
+++ b/integration-tests/models/snapshots/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json
@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 2015,
+        "logprob": -9.65625,
+        "text": "Test"
+      },
+      {
+        "id": 3853,
+        "logprob": -10.3671875,
+        "text": " request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 604,
+        "logprob": -0.36938477,
+        "special": false,
+        "text": " for"
+      },
+      {
+        "id": 235248,
+        "logprob": -1.8046875,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 235274,
+        "logprob": -0.46240234,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 235284,
+        "logprob": -1.7460938,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 235265,
+        "logprob": -1.9443359,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 235284,
+        "logprob": -1.4550781,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 235308,
+        "logprob": -1.0205078,
+        "special": false,
+        "text": "5"
+      },
+      {
+        "id": 235290,
+        "logprob": -1.0283203,
+        "special": false,
+        "text": "-"
+      },
+      {
+        "id": 235274,
+        "logprob": -1.2783203,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 235284,
+        "logprob": 0.0,
+        "special": false,
+        "text": "2"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request for 12.25-12"
+}
--- a/Show More
+++ b/Show More