diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index 12c93b9e..24ac3cbe 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -5,14 +5,14 @@ body:
     id: system-info
     attributes:
       label: System Info
-      description: | 
+      description: |
         Please share your system info with us (`text-generation-launcher --env` if installed locally).
-        The full command line used that causes issues: 
+        The full command line used that causes issues:
         OS version:
         Rust version (if self-compiling, `cargo version`):
         Model being used (`curl 127.0.0.1:8080/info | jq`):
           If local model please explicit the kind of model and/or equivalents.
-        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`): 
+        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
         Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
         The current version being used:
 
@@ -52,11 +52,11 @@ body:
 
       placeholder: |
         Steps to reproduce the behavior:
-          
+
           1.
           2.
           3.
-          
+
 
   - type: textarea
     id: expected-behavior
diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml
index 5abc1565..f1a9135c 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@@ -19,7 +19,7 @@ body:
       label: Motivation
       description: |
         Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
-        
+
 
   - type: textarea
     id: contribution
diff --git a/.github/workflows/autodocs.yaml b/.github/workflows/autodocs.yaml
new file mode 100644
index 00000000..8af0b95d
--- /dev/null
+++ b/.github/workflows/autodocs.yaml
@@ -0,0 +1,40 @@
+name: Automatic Documentation for Launcher
+
+on:
+  pull_request:
+
+jobs:
+  update_docs:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v2
+
+    - name: Set up Rust
+      uses: actions-rs/toolchain@v1
+      with:
+        profile: minimal
+        toolchain: stable
+
+    - name: Install Protocol Buffers compiler
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y protobuf-compiler libprotobuf-dev
+
+    - name: Install Launcher
+      id: install-launcher
+      run: cargo install --path launcher/
+
+    - name: Install router
+      id: install-router
+      run: cargo install --path router/
+
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+
+    - name: Check that documentation is up-to-date
+      run: |
+        python update_doc.py --check
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 124e6a33..8213887f 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -1,68 +1,34 @@
 name: Build and push docker image to internal registry
 
 on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'main'
-    tags:
-      - 'v*'
-  pull_request:
-    paths:
-      - ".github/workflows/build.yaml"
-      - "integration-tests/**"
-      - "server/**"
-      - "proto/**"
-      - "router/**"
-      - "launcher/**"
-      - "Cargo.lock"
-      - "rust-toolchain.toml"
-      - "Dockerfile"
-    branches:
-      - 'main'
+  workflow_call:
+    inputs:
+      hardware:
+        type: string
+        description: Hardware
+          # options:
+          # - cuda
+          # - rocm
+          # - intel
+        required: true
+      release-tests:
+        description: "Run release integration tests"
+        required: true
+        default: false
+        type: boolean
 
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-      EC2_AMI_ID: ami-03cfed9ea28f4b002
-      EC2_INSTANCE_TYPE: g5.12xlarge
-      EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc
-      EC2_SECURITY_GROUP: sg-030175c435ac141d6
+  build-and-push:
     outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-
-  build-and-push-image:
+      docker_image: ${{ steps.final.outputs.docker_image }}
+      docker_devices: ${{ steps.final.outputs.docker_devices }}
+      runs_on: ${{ steps.final.outputs.runs_on }}
+      label: ${{ steps.final.outputs.label }}
     concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    # TODO see with @Glegendre to get CPU runner here instead
+    runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
     permissions:
       contents: write
       packages: write
@@ -72,38 +38,60 @@ jobs:
       security-events: write
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v3
-      - name: Initialize Docker Buildx
-        uses: docker/setup-buildx-action@v2.0.0
-        with:
-          install: true
+        uses: actions/checkout@v4
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4.4.1
-      - name: Install cosign
-        if: github.event_name != 'pull_request'
-        uses: sigstore/cosign-installer@f3c664df7af409cb4873aa5068053ba9d61a57b6 #v2.6.0
+      - name: Construct hardware variables
+        shell: bash
+        run: |
+          case ${{ inputs.hardware }} in
+            cuda)
+                export dockerfile="Dockerfile"
+                export label_extension=""
+                export docker_devices=""
+                export runs_on="nvidia-gpu"
+                ;;
+            rocm)
+                export dockerfile="Dockerfile_amd"
+                export label_extension="-rocm"
+                export docker_devices="/dev/kfd,/dev/dri"
+                # TODO Re-enable when they pass.
+                # export runs_on="amd-gpu-tgi"
+                export runs_on="ubuntu-latest"
+                ;;
+            intel)
+                export dockerfile="Dockerfile_intel"
+                export label_extension="-intel"
+                export docker_devices=""
+                export runs_on="ubuntu-latest"
+                ;;
+          esac
+          echo $dockerfile
+          echo "Dockerfile=${dockerfile}"
+          echo $label_extension
+          echo $docker_devices
+          echo $runs_on
+          echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
+          echo "LABEL=${label_extension}" >> $GITHUB_ENV
+          echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
+          echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
+      - name: Initialize Docker Buildx
+        uses: docker/setup-buildx-action@v3
         with:
-          cosign-release: 'v1.13.1'
-      - name: Tailscale
-        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-        with:
-          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+          install: true
+          config-inline: |
+            [registry."docker.io"]
+              mirrors = ["registry.github-runners.huggingface.tech"]
       - name: Login to GitHub Container Registry
         if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Login to internal Container Registry
-        uses: docker/login-action@v2.1.0
-        with:
-          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
-          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
-          registry: registry.internal.huggingface.tech
       - name: Login to Azure Container Registry
         if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2.1.0
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.AZURE_DOCKER_USERNAME }}
           password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
@@ -112,12 +100,12 @@ jobs:
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name == 'pull_request' }}
         id: meta-pr
-        uses: docker/metadata-action@v4.3.0
+        uses: docker/metadata-action@v5
         with:
           images: |
-            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+            registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
           tags: |
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
       # If main, release or tag
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name != 'pull_request' }}
@@ -127,120 +115,62 @@ jobs:
           flavor: |
             latest=auto
           images: |
-            registry.internal.huggingface.tech/api-inference/community/text-generation-inference
+            registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference
             ghcr.io/huggingface/text-generation-inference
             db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
-            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
+            type=semver,pattern={{version}}${{ env.LABEL }}
+            type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
+            type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+            type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
       - name: Build and push Docker image
         id: build-and-push
         uses: docker/build-push-action@v4
         with:
           context: .
-          file: Dockerfile
+          file: ${{ env.DOCKERFILE }}
           push: true
           platforms: 'linux/amd64'
           build-args: |
             GIT_SHA=${{ env.GITHUB_SHA }}
-            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
+            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
           tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
-          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
-      # Sign the resulting Docker image digest except on PRs.
-      # This will only write to the public Rekor transparency log when the Docker
-      # repository is public to avoid leaking data.
-      - name: Sign the published Docker image
-        if: ${{ github.event_name != 'pull_request' }}
-        env:
-          COSIGN_EXPERIMENTAL: "true"
-        # This step uses the identity token to provision an ephemeral certificate
-        # against the sigstore community Fulcio instance.
-        run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign {}@${{ steps.build-and-push.outputs.digest }}
-      - name: Run Trivy in GitHub SBOM mode and submit results to Dependency Graph
-        uses: aquasecurity/trivy-action@master
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          image-ref: 'ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}'
-          format: 'github'
-          output: 'dependency-results.sbom.json'
-          github-pat: ${{ secrets.GITHUB_TOKEN }}
-          scanners: 'vuln'
-      - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@master
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          image-ref: 'ghcr.io/huggingface/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}'
-          format: 'sarif'
-          output: 'trivy-results.sarif'
-          severity: 'CRITICAL'
-          scanners: 'vuln'
-      - name: Upload Trivy scan results to GitHub Security tab
-        uses: github/codeql-action/upload-sarif@v2
-        if: ${{ github.event_name != 'pull_request' }}
-        with:
-          sarif_file: 'trivy-results.sarif'
-
-  integration-tests:
+          cache-from: type=registry,ref=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:cache${{ env.LABEL }},mode=min
+          cache-to: type=registry,ref=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:cache${{ env.LABEL }},mode=min
+      - name: Final
+        id: final
+        run: |
+          echo "docker_image=registry-push.github-runners.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
+          echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
+          echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
+          echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
+  integration_tests:
     concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs:
-      - start-runner
-      - build-and-push-image # Wait for the docker image to be built
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    needs: build-and-push
+    runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
+    if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
     env:
-      DOCKER_VOLUME: /cache
+      PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '' }}
     steps:
-      - uses: actions/checkout@v2
+      - name: Checkout repository
+        uses: actions/checkout@v4
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4.4.1
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.9
-      - name: Tailscale
-        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-        with:
-          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
+          python-version: "3.10"
       - name: Install
         run: |
           make install-integration-tests
       - name: Run tests
         run: |
-          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-          pytest -s -vv integration-tests
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - build-and-push-image
-      - integration-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: us-east-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+          export DOCKER_VOLUME=/mnt/cache
+          export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
+          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
+          echo $DOCKER_IMAGE
+          pytest -s -vv integration-tests ${PYTEST_FLAGS}
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yaml
similarity index 80%
rename from .github/workflows/build_documentation.yml
rename to .github/workflows/build_documentation.yaml
index a0f1d6f1..4d0b19a3 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yaml
@@ -17,5 +17,4 @@ jobs:
       package: text-generation-inference
       additional_args: --not_python_module
     secrets:
-      token: ${{ secrets.HUGGINGFACE_PUSH }}
-      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
\ No newline at end of file
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yaml
similarity index 87%
rename from .github/workflows/build_pr_documentation.yml
rename to .github/workflows/build_pr_documentation.yaml
index b46216ec..bf03bfdf 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yaml
@@ -11,9 +11,9 @@ concurrency:
 
 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: text-generation-inference
-      additional_args: --not_python_module 
+      additional_args: --not_python_module
diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml
new file mode 100644
index 00000000..d62297e4
--- /dev/null
+++ b/.github/workflows/ci_build.yaml
@@ -0,0 +1,45 @@
+name: CI build
+
+on:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - 'v*'
+  pull_request:
+    paths:
+      - ".github/workflows/build.yaml"
+      - "integration-tests/**"
+      - "server/**"
+      - "proto/**"
+      - "router/**"
+      - "launcher/**"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+      - "Dockerfile"
+      - "Dockerfile_amd"
+      - "Dockerfile_intel"
+    branches:
+      - "main"
+  workflow_dispatch:
+    inputs:
+      release-tests:
+        description: "Run release integration tests"
+        required: true
+        default: false
+        type: boolean
+
+jobs:
+  build:
+    strategy:
+      # super important if you want to see all results, even if one fails
+      # fail-fast is true by default
+      fail-fast: false
+      matrix:
+        hardware: ["cuda", "rocm", "intel"]
+    uses: ./.github/workflows/build.yaml # reusable workflow: image build + integration tests
+    with:
+      hardware: ${{ matrix.hardware }}
+      # https://github.com/actions/runner/issues/2206
+      release-tests: ${{ inputs.release-tests == true }}
+    secrets: inherit
diff --git a/.github/workflows/client-tests.yaml b/.github/workflows/client-tests.yaml
index 1fa0b39d..ff2928c4 100644
--- a/.github/workflows/client-tests.yaml
+++ b/.github/workflows/client-tests.yaml
@@ -22,4 +22,5 @@ jobs:
       - name: Run tests
         run: |
           pip install pytest pytest-asyncio
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           make python-client-tests
diff --git a/.github/workflows/delete_doc_comment.yml b/.github/workflows/delete_doc_comment.yml
deleted file mode 100644
index 1cad807b..00000000
--- a/.github/workflows/delete_doc_comment.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: Delete doc comment
-
-on:
-  pull_request:
-    types: [ closed ]
-
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
-    with:
-      pr_number: ${{ github.event.number }}
\ No newline at end of file
diff --git a/.github/workflows/integration_tests.yaml b/.github/workflows/integration_tests.yaml
new file mode 100644
index 00000000..59a8d304
--- /dev/null
+++ b/.github/workflows/integration_tests.yaml
@@ -0,0 +1,41 @@
+name: Integration tests
+
+on:
+  workflow_call:
+    inputs:
+      docker_image:
+        type: string
+        description: Docker image exported to the tests as DOCKER_IMAGE
+        required: true
+      docker_devices:
+        type: string
+        description: Device paths exported to the tests as DOCKER_DEVICES
+      runs_on:
+        type: string
+        required: true
+        description: Hardware to run integration tests
+jobs:
+  integration_tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    runs-on: ${{ inputs.runs_on }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Install
+        run: |
+          make install-integration-tests
+      - name: Run tests
+        run: |
+          export DOCKER_VOLUME=/mnt/cache
+          export DOCKER_IMAGE=${{ inputs.docker_image }}
+          export DOCKER_DEVICES=${{ inputs.docker_devices }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
+          pytest -s -vv integration-tests
diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml
index fd22e395..637df472 100644
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@@ -11,66 +11,24 @@ on:
       - 'main'
 
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-      EC2_AMI_ID: ami-0ab09c07cfd194259
-      EC2_INSTANCE_TYPE: g5.12xlarge
-      EC2_SUBNET_ID: subnet-988fd9f2,subnet-6f56db13,subnet-6a039326
-      EC2_SECURITY_GROUP: sg-072f92ae3082936c6
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ${{ env.EC2_AMI_ID }}
-          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
-          subnet-id: ${{ env.EC2_SUBNET_ID }}
-          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "Name", "Value": "ec2-tgi-github-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-
   load-tests:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
     env:
       DOCKER_VOLUME: /cache
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
 
-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
-
       - name: Install k6
         run: |
           curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
 
       - name: Start starcoder
         run: |
-          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
+          docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HF_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
           sleep 10
           wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
 
@@ -82,27 +40,3 @@ jobs:
         if: ${{ always() }}
         run: |
           docker stop tgi-starcoder || true
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner
-      - load-tests
-    runs-on: ubuntu-latest
-    env:
-      AWS_REGION: eu-central-1
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ env.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: philschmid/philschmid-ec2-github-runner@main
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml
new file mode 100644
index 00000000..a5e50a79
--- /dev/null
+++ b/.github/workflows/stale.yaml
@@ -0,0 +1,14 @@
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v8
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
+          days-before-stale: 30
+          days-before-close: 5
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 311ee6b9..f983b6ed 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -33,11 +33,17 @@ jobs:
       - name: Install Rust
         uses: actions-rs/toolchain@v1
         with:
-          toolchain: 1.71.0
+          # Released on: 13 June, 2024
+          # https://releases.rs/docs/1.79.0/
+          toolchain: 1.79.0
           override: true
           components: rustfmt, clippy
       - name: Install Protoc
         uses: arduino/setup-protoc@v1
+      - name: Clean unused files
+        run: |
+          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
+          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
       - name: Install sccache
         run: |
           curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache
@@ -62,18 +68,17 @@ jobs:
             ~/.cargo/git
       - name: Install
         run: |
-          make install
+          make install-cpu
       - name: Run server tests
         run: |
           pip install pytest
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
           pytest -s -vv server/tests
-      - name: Run Rust fmt
+      - name: Pre-commit checks
         run: |
-          cargo fmt --check
-      - name: Run Rust clippy
-        run: |
-          cargo clippy
+          pip install pre-commit
+          pre-commit install
+          pre-commit run --all-files
       - name: Run Rust tests
         run: |
           cargo test
diff --git a/.github/workflows/trufflehog.yaml b/.github/workflows/trufflehog.yaml
new file mode 100644
index 00000000..b406d43b
--- /dev/null
+++ b/.github/workflows/trufflehog.yaml
@@ -0,0 +1,18 @@
+on:
+  push:
+
+name: Secret Leaks
+
+permissions:
+  contents: read
+
+jobs:
+  trufflehog:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+    - name: Secret Scanning
+      uses: trufflesecurity/trufflehog@main
diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yaml
similarity index 85%
rename from .github/workflows/upload_pr_documentation.yml
rename to .github/workflows/upload_pr_documentation.yaml
index b984ead2..ae00bb51 100644
--- a/.github/workflows/upload_pr_documentation.yml
+++ b/.github/workflows/upload_pr_documentation.yaml
@@ -13,4 +13,4 @@ jobs:
       package_name: text-generation-inference
     secrets:
       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
-      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
\ No newline at end of file
+      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 20c9baee..e9ad1808 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,15 @@
 target
 router/tokenizer.json
 *__pycache__*
+
+# ROCm auto-generated files
+*.hip
+server/exllamav2_kernels/exllamav2_kernels/hip/
+server/exllama_kernels/exllama_kernels/hip/
+server/exllama_kernels/exllama_kernels/hip_func/
+*_hip.cuh
+server/exllama_kernels/exllama_kernels/hip_buffers.cuh
+server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
+
+data/
+load_tests/*.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..45bc07a5
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+    -   id: check-yaml
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+        exclude: docs/source/basic_tutorials/launcher.md
+-   repo: https://github.com/psf/black
+    rev: 24.2.0
+    hooks:
+    -   id: black
+-   repo: https://github.com/doublify/pre-commit-rust
+    rev: v1.0
+    hooks:
+    -   id: fmt
+    -   id: cargo-check
+    -   id: clippy
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..b23f3150
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,133 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual
+identity and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the overall
+  community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or advances of
+  any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email address,
+  without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+feedback@huggingface.co.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series of
+actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or permanent
+ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within the
+community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.1, available at
+[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
+[https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 00000000..d541e47f
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,120 @@
+<!---
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Contribute to text-generation-inference
+
+Everyone is welcome to contribute, and we value everybody's contribution. Code
+contributions are not the only way to help the community. Answering questions, helping
+others, and improving the documentation are also immensely valuable.
+
+It also helps us if you spread the word! Reference the library in blog posts
+about the awesome projects it made possible, shout out on Twitter every time it has
+helped you, or simply ⭐️ the repository to say thank you.
+
+However you choose to contribute, please be mindful and respect our
+[code of conduct](https://github.com/huggingface/text-generation-inference/blob/main/CODE_OF_CONDUCT.md).
+
+**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
+
+## Ways to contribute
+
+There are several ways you can contribute to text-generation-inference.
+
+* Fix outstanding issues with the existing code.
+* Submit issues related to bugs or desired new features.
+* Contribute to the examples or to the documentation.
+
+> All contributions are equally valuable to the community. 🥰
+
+## Fixing outstanding issues
+
+If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) and open
+a Pull Request!
+
+## Submitting a bug-related issue or feature request
+
+Do your best to follow these guidelines when submitting a bug-related issue or a feature
+request. It will make it easier for us to come back to you quickly and with good
+feedback.
+
+### Did you find a bug?
+
+The text-generation-inference library is robust and reliable thanks to users who report the problems they encounter.
+
+Before you report an issue, we would really appreciate it if you could **make sure the bug was not
+already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the
+library itself, and not your code.
+
+Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so
+we can quickly resolve it:
+
+* Your **OS type and version**, as well as your environment versions (versions of rust, python, and dependencies).
+* A short, self-contained code snippet that allows us to reproduce the bug.
+* The *full* traceback if an exception is raised.
+* Attach any other additional information, like screenshots, you think may help.
+
+To get the OS and software versions automatically, you can re-run the launcher with the `--env` flag:
+
+```bash
+text-generation-launcher --env
+```
+
+This prints information about your environment before the model is launched. We recommend pasting
+that output in your issue report.
+
+### Do you want a new feature?
+
+If there is a new feature you'd like to see in text-generation-inference, please open an issue and describe:
+
+1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it
+   a feature related to something you need for a project? Is it something you worked on and think could benefit
+   the community?
+
+   Whatever it is, we'd love to hear about it!
+
+2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better
+   we'll be able to help you.
+3. Provide a *code snippet* that demonstrates the feature's usage.
+4. If the feature is related to a paper, please include a link.
+
+If your issue is well written, we're already 80% of the way there by the time you create it.
+
+We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE)
+to help you get started with your issue.
+
+## Do you want to implement a new model?
+
+New models are constantly released and if you want to implement a new model, please provide the following information:
+
+* A short description of the model and a link to the paper.
+* Link to the implementation if it is open-sourced.
+* Link to the model weights if they are available.
+
+If you are willing to contribute the model yourself, let us know so we can help you add it to text-generation-inference!
+
+## Do you want to add documentation?
+
+We're always looking for improvements to the documentation that make it clearer and more accurate. Please let us know
+how the documentation can be improved, for example by pointing out typos or content that is missing, unclear or inaccurate. We'll be
+happy to make the changes or help you make a contribution if you're interested!
+
+## I want to become a maintainer of the project. How do I get there?
+
+TGI is a project led and managed by Hugging Face as it powers our internal services. However, we are happy to have
+motivated individuals from other organizations join us as maintainers with the goal of making TGI the best inference
+service.
+
+If you are such an individual (or organization), please reach out to us and let's collaborate.
diff --git a/Cargo.lock b/Cargo.lock
index 0f3b39de..090e2e80 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,9 +4,9 @@ version = 3
 
 [[package]]
 name = "addr2line"
-version = "0.20.0"
+version = "0.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3"
+checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
 dependencies = [
  "gimli",
 ]
@@ -17,106 +17,118 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
-[[package]]
-name = "aes"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac1f845298e95f983ff1944b728ae08b8cebab80d684f0a832ed0fc74dfa27e2"
-dependencies = [
- "cfg-if",
- "cipher",
- "cpufeatures",
-]
-
 [[package]]
 name = "ahash"
-version = "0.8.3"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
+checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
 dependencies = [
  "cfg-if",
+ "getrandom",
  "once_cell",
+ "serde",
  "version_check",
+ "zerocopy",
 ]
 
 [[package]]
 name = "aho-corasick"
-version = "0.7.20"
+version = "1.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
 dependencies = [
  "memchr",
 ]
 
 [[package]]
-name = "aho-corasick"
-version = "1.0.3"
+name = "aligned-vec"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86b8f9420f797f2d9e935edf629310eb938a0d839f984e25327f3c7eed22300c"
-dependencies = [
- "memchr",
-]
+checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1"
 
 [[package]]
 name = "anstream"
-version = "0.3.2"
+version = "0.6.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
+checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b"
 dependencies = [
  "anstyle",
  "anstyle-parse",
  "anstyle-query",
  "anstyle-wincon",
  "colorchoice",
- "is-terminal",
+ "is_terminal_polyfill",
  "utf8parse",
 ]
 
 [[package]]
 name = "anstyle"
-version = "1.0.1"
+version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd"
+checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b"
 
 [[package]]
 name = "anstyle-parse"
-version = "0.2.1"
+version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333"
+checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4"
 dependencies = [
  "utf8parse",
 ]
 
 [[package]]
 name = "anstyle-query"
-version = "1.0.0"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
+checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391"
 dependencies = [
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "anstyle-wincon"
-version = "1.0.2"
+version = "3.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c677ab05e09154296dd37acecd46420c17b9713e8366facafa8fc0885167cf4c"
+checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19"
 dependencies = [
  "anstyle",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "anyhow"
-version = "1.0.72"
+version = "1.0.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b13c32d80ecc7ab747b80c3784bce54ee8a7a0cc4fbda9bf4cda2cf6fe90854"
+checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
+
+[[package]]
+name = "arbitrary"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
 
 [[package]]
 name = "arc-swap"
-version = "1.6.0"
+version = "1.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
+checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
+
+[[package]]
+name = "arg_enum_proc_macro"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
+]
+
+[[package]]
+name = "arrayvec"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
 
 [[package]]
 name = "async-rustls"
@@ -125,7 +137,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "93b21a03b7c21702a0110f9f8d228763a533570deb376119042dabf33c37a01a"
 dependencies = [
  "futures-io",
- "rustls",
+ "rustls 0.20.9",
  "webpki",
 ]
 
@@ -148,37 +160,66 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "async-trait"
-version = "0.1.73"
+version = "0.1.80"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0"
+checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
-name = "autocfg"
-version = "1.1.0"
+name = "atomic-waker"
+version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+
+[[package]]
+name = "autocfg"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
+
+[[package]]
+name = "av1-grain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6678909d8c5d46a42abcf571271e15fdbc0a225e3646cf23762cd415046c78bf"
+dependencies = [
+ "anyhow",
+ "arrayvec",
+ "log",
+ "nom",
+ "num-rational",
+ "v_frame",
+]
 
 [[package]]
 name = "average"
-version = "0.13.1"
+version = "0.14.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "843ec791d3f24503bbf72bbd5e49a3ab4dbb4bcd0a8ef6b0c908efa73caa27b1"
+checksum = "c309b1c7fca12ebeec3ecba29ea917b3a4cb458ccf504df68bb4d8a0ca565a00"
 dependencies = [
  "easy-cast",
  "float-ord",
  "num-traits",
 ]
 
+[[package]]
+name = "avif-serialize"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "876c75a42f6364451a033496a14c44bffe41f5f4a8236f697391f11024e596d2"
+dependencies = [
+ "arrayvec",
+]
+
 [[package]]
 name = "awaitdrop"
 version = "0.1.2"
@@ -191,6 +232,33 @@ dependencies = [
  "slotmap",
 ]
 
+[[package]]
+name = "aws-lc-rs"
+version = "1.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf7d844e282b4b56750b2d4e893b2205581ded8709fddd2b6aa5418c150ca877"
+dependencies = [
+ "aws-lc-sys",
+ "mirai-annotations",
+ "paste",
+ "zeroize",
+]
+
+[[package]]
+name = "aws-lc-sys"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3a2c29203f6bf296d01141cc8bb9dbd5ecd4c27843f2ee0767bcd5985a927da"
+dependencies = [
+ "bindgen",
+ "cc",
+ "cmake",
+ "dunce",
+ "fs_extra",
+ "libc",
+ "paste",
+]
+
 [[package]]
 name = "axum"
 version = "0.6.20"
@@ -198,13 +266,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf"
 dependencies = [
  "async-trait",
- "axum-core",
+ "axum-core 0.3.4",
  "bitflags 1.3.2",
  "bytes",
  "futures-util",
- "http",
- "http-body",
- "hyper",
+ "http 0.2.12",
+ "http-body 0.4.6",
+ "hyper 0.14.29",
  "itoa",
  "matchit",
  "memchr",
@@ -216,13 +284,47 @@ dependencies = [
  "serde_json",
  "serde_path_to_error",
  "serde_urlencoded",
- "sync_wrapper",
+ "sync_wrapper 0.1.2",
  "tokio",
  "tower",
  "tower-layer",
  "tower-service",
 ]
 
+[[package]]
+name = "axum"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf"
+dependencies = [
+ "async-trait",
+ "axum-core 0.4.3",
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "hyper 1.3.1",
+ "hyper-util",
+ "itoa",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustversion",
+ "serde",
+ "serde_json",
+ "serde_path_to_error",
+ "serde_urlencoded",
+ "sync_wrapper 1.0.1",
+ "tokio",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
 [[package]]
 name = "axum-core"
 version = "0.3.4"
@@ -232,8 +334,8 @@ dependencies = [
  "async-trait",
  "bytes",
  "futures-util",
- "http",
- "http-body",
+ "http 0.2.12",
+ "http-body 0.4.6",
  "mime",
  "rustversion",
  "tower-layer",
@@ -241,26 +343,49 @@ dependencies = [
 ]
 
 [[package]]
-name = "axum-tracing-opentelemetry"
-version = "0.10.0"
+name = "axum-core"
+version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "164b95427e83b79583c7699a72b4a6b485a12bbdef5b5c054ee5ff2296d82f52"
+checksum = "a15c63fd72d41492dc4f497196f5da1fb04fb7529e631d73630d1b491e47a2e3"
 dependencies = [
- "axum",
- "futures",
- "http",
- "opentelemetry 0.18.0",
- "tower",
- "tower-http 0.3.5",
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "rustversion",
+ "sync_wrapper 0.1.2",
+ "tower-layer",
+ "tower-service",
  "tracing",
- "tracing-opentelemetry 0.18.0",
+]
+
+[[package]]
+name = "axum-tracing-opentelemetry"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bdad298231394729042d1f155b93f9fdf0b5ee1aea0b62404c4d7341f7d8fe08"
+dependencies = [
+ "axum 0.7.5",
+ "futures-core",
+ "futures-util",
+ "http 1.1.0",
+ "opentelemetry 0.21.0",
+ "pin-project-lite",
+ "tower",
+ "tracing",
+ "tracing-opentelemetry 0.22.0",
+ "tracing-opentelemetry-instrumentation-sdk",
 ]
 
 [[package]]
 name = "backtrace"
-version = "0.3.68"
+version = "0.3.73"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12"
+checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a"
 dependencies = [
  "addr2line",
  "cc",
@@ -279,15 +404,59 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
 
 [[package]]
 name = "base64"
-version = "0.21.2"
+version = "0.21.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
 
 [[package]]
-name = "base64ct"
-version = "1.6.0"
+name = "base64"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "bindgen"
+version = "0.69.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
+dependencies = [
+ "bitflags 2.6.0",
+ "cexpr",
+ "clang-sys",
+ "itertools 0.12.1",
+ "lazy_static",
+ "lazycell",
+ "log",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "syn 2.0.68",
+ "which",
+]
+
+[[package]]
+name = "bit-set"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1"
+dependencies = [
+ "bit-vec",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
+
+[[package]]
+name = "bit_field"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
 
 [[package]]
 name = "bitflags"
@@ -297,9 +466,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
 name = "bitflags"
-version = "2.4.0"
+version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635"
+checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
+
+[[package]]
+name = "bitstream-io"
+version = "2.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "415f8399438eb5e4b2f73ed3152a3448b98149dda642a957ee704e1daa5cf1d8"
 
 [[package]]
 name = "block-buffer"
@@ -311,70 +486,77 @@ dependencies = [
 ]
 
 [[package]]
-name = "bumpalo"
-version = "3.13.0"
+name = "built"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
+checksum = "c6a6c0b39c38fd754ac338b00a88066436389c0f029da5d37d1e01091d9b7c17"
+
+[[package]]
+name = "bumpalo"
+version = "3.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
 
 [[package]]
 name = "bytecount"
-version = "0.6.3"
+version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
+checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
+
+[[package]]
+name = "bytemuck"
+version = "1.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
 
 [[package]]
 name = "byteorder"
-version = "1.4.3"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "byteorder-lite"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
 
 [[package]]
 name = "bytes"
-version = "1.4.0"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be"
+checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9"
 
 [[package]]
-name = "bzip2"
-version = "0.4.4"
+name = "camino"
+version = "1.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8"
+checksum = "e0ec6b951b160caa93cc0c7b209e5a3bff7aae9062213451ac99493cd844c239"
 dependencies = [
- "bzip2-sys",
- "libc",
+ "serde",
 ]
 
 [[package]]
-name = "bzip2-sys"
-version = "0.1.11+1.0.8"
+name = "cargo-platform"
+version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc"
+checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc"
 dependencies = [
- "cc",
- "libc",
- "pkg-config",
+ "serde",
 ]
 
 [[package]]
-name = "cached-path"
-version = "0.6.1"
+name = "cargo_metadata"
+version = "0.18.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "097968e38f1319207f057d0f4d76452e4f4f847a5de61c5215379f297fa034f3"
+checksum = "2d886547e41f740c616ae73108f6eb70afe6d940c7bc697cb30f13daec073037"
 dependencies = [
- "flate2",
- "fs2",
- "glob",
- "indicatif 0.16.2",
- "log",
- "rand",
- "reqwest",
+ "camino",
+ "cargo-platform",
+ "semver",
  "serde",
  "serde_json",
- "sha2",
- "tar",
- "tempfile",
  "thiserror",
- "zip",
 ]
 
 [[package]]
@@ -385,12 +567,32 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
 
 [[package]]
 name = "cc"
-version = "1.0.82"
+version = "1.0.101"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "305fe645edc1442a0fa8b6726ba61d422798d37a52e12eaecf4b022ebbb88f01"
+checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d"
 dependencies = [
  "jobserver",
  "libc",
+ "once_cell",
+]
+
+[[package]]
+name = "cexpr"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "cfg-expr"
+version = "0.15.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02"
+dependencies = [
+ "smallvec",
+ "target-lexicon",
 ]
 
 [[package]]
@@ -400,31 +602,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
-name = "cipher"
-version = "0.4.4"
+name = "cfg_aliases"
+version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
+checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
+
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
 dependencies = [
- "crypto-common",
- "inout",
+ "glob",
+ "libc",
+ "libloading",
 ]
 
 [[package]]
 name = "clap"
-version = "4.3.21"
+version = "4.5.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c27cdf28c0f604ba3f512b0c9a409f8de8513e4816705deb0498b627e7c3a3fd"
+checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f"
 dependencies = [
  "clap_builder",
  "clap_derive",
- "once_cell",
 ]
 
 [[package]]
 name = "clap_builder"
-version = "4.3.21"
+version = "4.5.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08a9f1ab5e9f01a9b81f202e8562eb9a10de70abf9eaeac1be465c28b75aa4aa"
+checksum = "f7e204572485eb3fbf28f871612191521df159bc3e15a9f5064c66dba3a8c05f"
 dependencies = [
  "anstream",
  "anstyle",
@@ -434,52 +642,61 @@ dependencies = [
 
 [[package]]
 name = "clap_derive"
-version = "4.3.12"
+version = "4.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050"
+checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "clap_lex"
-version = "0.5.0"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
+checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"
+
+[[package]]
+name = "cmake"
+version = "0.1.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "color_quant"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
 
 [[package]]
 name = "colorchoice"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
+checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422"
 
 [[package]]
 name = "console"
-version = "0.15.7"
+version = "0.15.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8"
+checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
 dependencies = [
  "encode_unicode",
  "lazy_static",
  "libc",
  "unicode-width",
- "windows-sys 0.45.0",
+ "windows-sys 0.52.0",
 ]
 
-[[package]]
-name = "constant_time_eq"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
-
 [[package]]
 name = "core-foundation"
-version = "0.9.3"
+version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146"
+checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
 dependencies = [
  "core-foundation-sys",
  "libc",
@@ -487,78 +704,69 @@ dependencies = [
 
 [[package]]
 name = "core-foundation-sys"
-version = "0.8.4"
+version = "0.8.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
+checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f"
 
 [[package]]
 name = "cpufeatures"
-version = "0.2.9"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
+checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504"
 dependencies = [
  "libc",
 ]
 
 [[package]]
 name = "crc32fast"
-version = "1.3.2"
+version = "1.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
 dependencies = [
  "cfg-if",
 ]
 
 [[package]]
 name = "crossbeam-channel"
-version = "0.5.8"
+version = "0.5.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
+checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2"
 dependencies = [
- "cfg-if",
  "crossbeam-utils",
 ]
 
 [[package]]
 name = "crossbeam-deque"
-version = "0.8.3"
+version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
+checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
 dependencies = [
- "cfg-if",
  "crossbeam-epoch",
  "crossbeam-utils",
 ]
 
 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.15"
+version = "0.9.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
 dependencies = [
- "autocfg",
- "cfg-if",
  "crossbeam-utils",
- "memoffset 0.9.0",
- "scopeguard",
 ]
 
 [[package]]
 name = "crossbeam-utils"
-version = "0.8.16"
+version = "0.8.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294"
-dependencies = [
- "cfg-if",
-]
+checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
 
 [[package]]
 name = "crossterm"
-version = "0.26.1"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a84cda67535339806297f1b331d6dd6320470d2a0fe65381e79ee9e156dd3d13"
+checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.6.0",
  "crossterm_winapi",
  "libc",
  "mio",
@@ -577,6 +785,12 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "crunchy"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
+
 [[package]]
 name = "crypto-common"
 version = "0.1.6"
@@ -589,19 +803,19 @@ dependencies = [
 
 [[package]]
 name = "ctrlc"
-version = "3.4.0"
+version = "3.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a011bbe2c35ce9c1f143b7af6f94f29a167beb4cd1d29e6740ce836f723120e"
+checksum = "672465ae37dc1bc6380a6547a8883d5dd397b0f1faaad4f265726cc7042a5345"
 dependencies = [
  "nix",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "darling"
-version = "0.14.4"
+version = "0.20.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
+checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1"
 dependencies = [
  "darling_core",
  "darling_macro",
@@ -609,77 +823,67 @@ dependencies = [
 
 [[package]]
 name = "darling_core"
-version = "0.14.4"
+version = "0.20.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
+checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120"
 dependencies = [
  "fnv",
  "ident_case",
  "proc-macro2",
  "quote",
  "strsim",
- "syn 1.0.109",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "darling_macro"
-version = "0.14.4"
+version = "0.20.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
+checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178"
 dependencies = [
  "darling_core",
  "quote",
- "syn 1.0.109",
-]
-
-[[package]]
-name = "dashmap"
-version = "5.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
-dependencies = [
- "cfg-if",
- "hashbrown 0.14.0",
- "lock_api",
- "once_cell",
- "parking_lot_core",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "deranged"
-version = "0.3.7"
+version = "0.3.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7684a49fb1af197853ef7b2ee694bc1f5b4179556f1e5710e1760c5db6f5e929"
+checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+dependencies = [
+ "powerfmt",
+]
 
 [[package]]
 name = "derive_builder"
-version = "0.12.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8"
+checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7"
 dependencies = [
  "derive_builder_macro",
 ]
 
 [[package]]
 name = "derive_builder_core"
-version = "0.12.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f"
+checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d"
 dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "derive_builder_macro"
-version = "0.12.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
+checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b"
 dependencies = [
  "derive_builder_core",
- "syn 1.0.109",
+ "syn 2.0.68",
 ]
 
 [[package]]
@@ -690,43 +894,49 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
 dependencies = [
  "block-buffer",
  "crypto-common",
- "subtle",
 ]
 
 [[package]]
 name = "dirs"
-version = "4.0.0"
+version = "5.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059"
+checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
 dependencies = [
  "dirs-sys",
 ]
 
 [[package]]
 name = "dirs-sys"
-version = "0.3.7"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6"
+checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
 dependencies = [
  "libc",
+ "option-ext",
  "redox_users",
- "winapi",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
-name = "easy-cast"
-version = "0.4.4"
+name = "dunce"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bd102ee8c418348759919b83b81cdbdc933ffe29740b903df448b4bafaa348e"
+checksum = "56ce8c6da7551ec6c462cbaf3bfbc75131ebbfa1c944aeaa9dab51ca1c5f0c3b"
+
+[[package]]
+name = "easy-cast"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10936778145f3bea71fd9bf61332cce28c28e96a380714f7ab34838b80733fd6"
 dependencies = [
  "libm",
 ]
 
 [[package]]
 name = "either"
-version = "1.9.0"
+version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
+checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 
 [[package]]
 name = "encode_unicode"
@@ -736,9 +946,9 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
 
 [[package]]
 name = "encoding_rs"
-version = "0.8.32"
+version = "0.8.34"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
+checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
 dependencies = [
  "cfg-if",
 ]
@@ -751,50 +961,62 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
 
 [[package]]
 name = "errno"
-version = "0.3.2"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b30f669a7961ef1631673d2766cc92f52d64f7ef354d4fe0ddfd30ed52f0f4f"
+checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba"
 dependencies = [
- "errno-dragonfly",
- "libc",
- "windows-sys 0.48.0",
-]
-
-[[package]]
-name = "errno-dragonfly"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
-dependencies = [
- "cc",
  "libc",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "esaxx-rs"
-version = "0.1.8"
+version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f748b253ceca9fed5f42f8b5ceb3851e93102199bc25b64b65369f76e5c0a35"
+checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6"
 dependencies = [
  "cc",
 ]
 
 [[package]]
-name = "fastrand"
-version = "2.0.0"
+name = "exr"
+version = "1.72.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
+checksum = "887d93f60543e9a9362ef8a21beedd0a833c5d9610e18c67abe15a5963dcb1a4"
+dependencies = [
+ "bit_field",
+ "flume",
+ "half",
+ "lebe",
+ "miniz_oxide",
+ "rayon-core",
+ "smallvec",
+ "zune-inflate",
+]
 
 [[package]]
-name = "filetime"
-version = "0.2.22"
+name = "fancy-regex"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0"
+checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2"
 dependencies = [
- "cfg-if",
- "libc",
- "redox_syscall 0.3.5",
- "windows-sys 0.48.0",
+ "bit-set",
+ "regex",
+]
+
+[[package]]
+name = "fastrand"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
+
+[[package]]
+name = "fdeflate"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f9bfee30e4dedf0ab8b422f03af778d9612b63f502710fc500a334ebe2de645"
+dependencies = [
+ "simd-adler32",
 ]
 
 [[package]]
@@ -805,9 +1027,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80"
 
 [[package]]
 name = "flate2"
-version = "1.0.26"
+version = "1.0.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae"
 dependencies = [
  "crc32fast",
  "miniz_oxide",
@@ -827,14 +1049,10 @@ checksum = "28a80e3145d8ad11ba0995949bbcf48b9df2be62772b3d351ef017dff6ecb853"
 
 [[package]]
 name = "flume"
-version = "0.10.14"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577"
+checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181"
 dependencies = [
- "futures-core",
- "futures-sink",
- "nanorand",
- "pin-project",
  "spin 0.9.8",
 ]
 
@@ -861,28 +1079,34 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
 
 [[package]]
 name = "form_urlencoded"
-version = "1.2.0"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652"
+checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
 dependencies = [
  "percent-encoding",
 ]
 
 [[package]]
-name = "fs2"
-version = "0.4.3"
+name = "fraction"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
+checksum = "3027ae1df8d41b4bed2241c8fdad4acc1e7af60c8e17743534b545e77182d678"
 dependencies = [
- "libc",
- "winapi",
+ "lazy_static",
+ "num",
 ]
 
 [[package]]
-name = "futures"
-version = "0.3.28"
+name = "fs_extra"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40"
+checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
+
+[[package]]
+name = "futures"
+version = "0.3.30"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -895,9 +1119,9 @@ dependencies = [
 
 [[package]]
 name = "futures-channel"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2"
+checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -905,15 +1129,15 @@ dependencies = [
 
 [[package]]
 name = "futures-core"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c"
+checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
 
 [[package]]
 name = "futures-executor"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0"
+checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d"
 dependencies = [
  "futures-core",
  "futures-task",
@@ -922,38 +1146,38 @@ dependencies = [
 
 [[package]]
 name = "futures-io"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964"
+checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
 
 [[package]]
 name = "futures-macro"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
+checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "futures-sink"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e"
+checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
 
 [[package]]
 name = "futures-task"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65"
+checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
 
 [[package]]
 name = "futures-util"
-version = "0.3.28"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533"
+checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
 dependencies = [
  "futures-channel",
  "futures-core",
@@ -988,9 +1212,9 @@ dependencies = [
 
 [[package]]
 name = "getrandom"
-version = "0.2.10"
+version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
+checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
 dependencies = [
  "cfg-if",
  "js-sys",
@@ -1000,10 +1224,20 @@ dependencies = [
 ]
 
 [[package]]
-name = "gimli"
-version = "0.27.3"
+name = "gif"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e"
+checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2"
+dependencies = [
+ "color_quant",
+ "weezl",
+]
+
+[[package]]
+name = "gimli"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"
 
 [[package]]
 name = "glob"
@@ -1015,31 +1249,60 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 name = "grpc-metadata"
 version = "0.1.0"
 dependencies = [
- "opentelemetry 0.19.0",
- "tonic 0.9.2",
+ "opentelemetry 0.20.0",
+ "tonic 0.10.2",
  "tracing",
- "tracing-opentelemetry 0.19.0",
+ "tracing-opentelemetry 0.21.0",
 ]
 
 [[package]]
 name = "h2"
-version = "0.3.20"
+version = "0.3.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97ec8491ebaf99c8eaa73058b045fe58073cd6be7f596ac993ced0b0a0c01049"
+checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
 dependencies = [
  "bytes",
  "fnv",
  "futures-core",
  "futures-sink",
  "futures-util",
- "http",
- "indexmap 1.9.3",
+ "http 0.2.12",
+ "indexmap 2.2.6",
  "slab",
  "tokio",
  "tokio-util",
  "tracing",
 ]
 
+[[package]]
+name = "h2"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "http 1.1.0",
+ "indexmap 2.2.6",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+
+[[package]]
+name = "half"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
@@ -1048,19 +1311,13 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
 
 [[package]]
 name = "hashbrown"
-version = "0.13.1"
+version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33ff8ae62cd3a9102e5637afc8452c55acf3844001bd5374e0b0bd7b6616c038"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 dependencies = [
  "ahash",
 ]
 
-[[package]]
-name = "hashbrown"
-version = "0.14.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
-
 [[package]]
 name = "heck"
 version = "0.4.1"
@@ -1068,18 +1325,45 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
 [[package]]
-name = "hermit-abi"
-version = "0.3.2"
+name = "heck"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 
 [[package]]
-name = "hmac"
-version = "0.12.1"
+name = "hermit-abi"
+version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
+checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
+
+[[package]]
+name = "hf-hub"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732"
 dependencies = [
- "digest",
+ "dirs",
+ "futures",
+ "indicatif",
+ "log",
+ "native-tls",
+ "num_cpus",
+ "rand",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "thiserror",
+ "tokio",
+ "ureq",
+]
+
+[[package]]
+name = "home"
+version = "0.5.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
+dependencies = [
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -1095,9 +1379,20 @@ dependencies = [
 
 [[package]]
 name = "http"
-version = "0.2.9"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482"
+checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1"
+dependencies = [
+ "bytes",
+ "fnv",
+ "itoa",
+]
+
+[[package]]
+name = "http"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
 dependencies = [
  "bytes",
  "fnv",
@@ -1106,26 +1401,43 @@ dependencies = [
 
 [[package]]
 name = "http-body"
-version = "0.4.5"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1"
+checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2"
 dependencies = [
  "bytes",
- "http",
+ "http 0.2.12",
  "pin-project-lite",
 ]
 
 [[package]]
-name = "http-range-header"
-version = "0.3.1"
+name = "http-body"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "add0ab9360ddbd88cfeb3bd9574a1d85cfdfa14db10b3e21d3700dbc4328758f"
+checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643"
+dependencies = [
+ "bytes",
+ "http 1.1.0",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f"
+dependencies = [
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "pin-project-lite",
+]
 
 [[package]]
 name = "httparse"
-version = "1.8.0"
+version = "1.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
+checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
 
 [[package]]
 name = "httpdate"
@@ -1135,35 +1447,75 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
 
 [[package]]
 name = "hyper"
-version = "0.14.27"
+version = "0.14.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468"
+checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33"
 dependencies = [
  "bytes",
  "futures-channel",
  "futures-core",
  "futures-util",
- "h2",
- "http",
- "http-body",
+ "h2 0.3.26",
+ "http 0.2.12",
+ "http-body 0.4.6",
  "httparse",
  "httpdate",
  "itoa",
  "pin-project-lite",
- "socket2 0.4.9",
+ "socket2",
  "tokio",
  "tower-service",
  "tracing",
  "want",
 ]
 
+[[package]]
+name = "hyper"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d"
+dependencies = [
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "h2 0.4.5",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "httparse",
+ "httpdate",
+ "itoa",
+ "pin-project-lite",
+ "smallvec",
+ "tokio",
+ "want",
+]
+
+[[package]]
+name = "hyper-rustls"
+version = "0.27.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155"
+dependencies = [
+ "futures-util",
+ "http 1.1.0",
+ "hyper 1.3.1",
+ "hyper-util",
+ "log",
+ "rustls 0.23.10",
+ "rustls-native-certs",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+]
+
 [[package]]
 name = "hyper-timeout"
 version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
 dependencies = [
- "hyper",
+ "hyper 0.14.29",
  "pin-project-lite",
  "tokio",
  "tokio-io-timeout",
@@ -1176,12 +1528,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
 dependencies = [
  "bytes",
- "hyper",
+ "hyper 0.14.29",
  "native-tls",
  "tokio",
  "tokio-native-tls",
 ]
 
+[[package]]
+name = "hyper-util"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b875924a60b96e5d7b9ae7b066540b1dd1cbd90d1828f54c92e02a283351c56"
+dependencies = [
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "hyper 1.3.1",
+ "pin-project-lite",
+ "socket2",
+ "tokio",
+ "tower",
+ "tower-service",
+ "tracing",
+]
+
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -1190,14 +1562,53 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
 
 [[package]]
 name = "idna"
-version = "0.4.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c"
+checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
 dependencies = [
  "unicode-bidi",
  "unicode-normalization",
 ]
 
+[[package]]
+name = "image"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd54d660e773627692c524beaad361aca785a4f9f5730ce91f42aabe5bce3d11"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "color_quant",
+ "exr",
+ "gif",
+ "image-webp",
+ "num-traits",
+ "png",
+ "qoi",
+ "ravif",
+ "rayon",
+ "rgb",
+ "tiff",
+ "zune-core",
+ "zune-jpeg",
+]
+
+[[package]]
+name = "image-webp"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d730b085583c4d789dfd07fdcf185be59501666a90c97c40162b37e4fdad272d"
+dependencies = [
+ "byteorder-lite",
+ "thiserror",
+]
+
+[[package]]
+name = "imgref"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44feda355f4159a7c757171a77de25daf6411e217b4cabd03bd6650690468126"
+
 [[package]]
 name = "indexmap"
 version = "1.9.3"
@@ -1210,81 +1621,86 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.0.0"
+version = "2.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d"
+checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
 dependencies = [
  "equivalent",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "serde",
 ]
 
 [[package]]
 name = "indicatif"
-version = "0.15.0"
+version = "0.17.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4"
+checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3"
 dependencies = [
  "console",
- "lazy_static",
- "number_prefix 0.3.0",
- "regex",
+ "instant",
+ "number_prefix",
+ "portable-atomic",
+ "unicode-width",
 ]
 
 [[package]]
-name = "indicatif"
-version = "0.16.2"
+name = "indoc"
+version = "2.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b"
+checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
+
+[[package]]
+name = "init-tracing-opentelemetry"
+version = "0.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94bd26b1b737bc11f183620072e188d1c6ede67e0e78682228d66b49ec510e17"
 dependencies = [
- "console",
- "lazy_static",
- "number_prefix 0.4.0",
- "regex",
+ "opentelemetry 0.20.0",
+ "opentelemetry-otlp",
+ "thiserror",
+ "tracing",
+ "tracing-opentelemetry 0.21.0",
 ]
 
 [[package]]
-name = "inout"
-version = "0.1.3"
+name = "instant"
+version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
+checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
 dependencies = [
- "generic-array",
+ "cfg-if",
+]
+
+[[package]]
+name = "interpolate_name"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "ipnet"
-version = "2.8.0"
+version = "2.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
+checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
 
 [[package]]
-name = "is-terminal"
-version = "0.4.9"
+name = "is_terminal_polyfill"
+version = "1.70.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
-dependencies = [
- "hermit-abi",
- "rustix",
- "windows-sys 0.48.0",
-]
+checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800"
 
 [[package]]
-name = "itertools"
-version = "0.8.2"
+name = "iso8601"
+version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
+checksum = "924e5d73ea28f59011fec52a0d12185d496a9b075d360657aed2a5707f701153"
 dependencies = [
- "either",
-]
-
-[[package]]
-name = "itertools"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
-dependencies = [
- "either",
+ "nom",
 ]
 
 [[package]]
@@ -1297,58 +1713,155 @@ dependencies = [
 ]
 
 [[package]]
-name = "itoa"
-version = "1.0.9"
+name = "itertools"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
+checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itertools"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
 
 [[package]]
 name = "jobserver"
-version = "0.1.26"
+version = "0.1.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
+checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e"
 dependencies = [
  "libc",
 ]
 
 [[package]]
-name = "js-sys"
-version = "0.3.64"
+name = "jpeg-decoder"
+version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0"
+
+[[package]]
+name = "js-sys"
+version = "0.3.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
 dependencies = [
  "wasm-bindgen",
 ]
 
 [[package]]
-name = "lazy_static"
-version = "1.4.0"
+name = "jsonschema"
+version = "0.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+checksum = "2a071f4f7efc9a9118dfb627a0a94ef247986e1ab8606a4c806ae2b3aa3b6978"
+dependencies = [
+ "ahash",
+ "anyhow",
+ "base64 0.21.7",
+ "bytecount",
+ "clap",
+ "fancy-regex",
+ "fraction",
+ "getrandom",
+ "iso8601",
+ "itoa",
+ "memchr",
+ "num-cmp",
+ "once_cell",
+ "parking_lot",
+ "percent-encoding",
+ "regex",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "time",
+ "url",
+ "uuid",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "lazycell"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+
+[[package]]
+name = "lebe"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8"
 
 [[package]]
 name = "libc"
-version = "0.2.147"
+version = "0.2.155"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
+
+[[package]]
+name = "libfuzzer-sys"
+version = "0.4.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
+dependencies = [
+ "arbitrary",
+ "cc",
+ "once_cell",
+]
+
+[[package]]
+name = "libloading"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e310b3a6b5907f99202fcdb4960ff45b93735d7c7d96b760fcff8db2dc0e103d"
+dependencies = [
+ "cfg-if",
+ "windows-targets 0.52.5",
+]
 
 [[package]]
 name = "libm"
-version = "0.2.7"
+version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+
+[[package]]
+name = "libredox"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
+dependencies = [
+ "bitflags 2.6.0",
+ "libc",
+]
 
 [[package]]
 name = "linux-raw-sys"
-version = "0.4.5"
+version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
 
 [[package]]
 name = "lock_api"
-version = "0.4.10"
+version = "0.4.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
+checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
 dependencies = [
  "autocfg",
  "scopeguard",
@@ -1356,24 +1869,24 @@ dependencies = [
 
 [[package]]
 name = "log"
-version = "0.4.20"
+version = "0.4.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
+checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
 
 [[package]]
-name = "mach2"
-version = "0.4.1"
+name = "loop9"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d0d1830bcd151a6fc4aea1369af235b36c1528fe976b8ff678683c9995eade8"
+checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062"
 dependencies = [
- "libc",
+ "imgref",
 ]
 
 [[package]]
 name = "macro_rules_attribute"
-version = "0.1.3"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862"
+checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13"
 dependencies = [
  "macro_rules_attribute-proc_macro",
  "paste",
@@ -1381,9 +1894,9 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.1.3"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
+checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
 
 [[package]]
 name = "match_cfg"
@@ -1402,33 +1915,25 @@ dependencies = [
 
 [[package]]
 name = "matchit"
-version = "0.7.2"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed1202b2a6f884ae56f04cff409ab315c5ce26b5e58d7412e484f01fd52f52ef"
+checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+
+[[package]]
+name = "maybe-rayon"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519"
+dependencies = [
+ "cfg-if",
+ "rayon",
+]
 
 [[package]]
 name = "memchr"
-version = "2.5.0"
+version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
-
-[[package]]
-name = "memoffset"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
-dependencies = [
- "autocfg",
-]
-
-[[package]]
-name = "memoffset"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
-dependencies = [
- "autocfg",
-]
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
 [[package]]
 name = "metrics"
@@ -1442,16 +1947,29 @@ dependencies = [
 ]
 
 [[package]]
-name = "metrics-exporter-prometheus"
-version = "0.12.1"
+name = "metrics"
+version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a4964177ddfdab1e3a2b37aec7cf320e14169abb0ed73999f558136409178d5"
+checksum = "884adb57038347dfbaf2d5065887b6cf4312330dc8e94bc30a1a839bd79d3261"
 dependencies = [
- "base64 0.21.2",
- "hyper",
- "indexmap 1.9.3",
+ "ahash",
+ "portable-atomic",
+]
+
+[[package]]
+name = "metrics-exporter-prometheus"
+version = "0.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf0af7a0d7ced10c0151f870e5e3f3f8bc9ffc5992d32873566ca1f9169ae776"
+dependencies = [
+ "base64 0.22.1",
+ "http-body-util",
+ "hyper 1.3.1",
+ "hyper-rustls",
+ "hyper-util",
+ "indexmap 2.2.6",
  "ipnet",
- "metrics",
+ "metrics 0.23.0",
  "metrics-util",
  "quanta",
  "thiserror",
@@ -1461,25 +1979,25 @@ dependencies = [
 
 [[package]]
 name = "metrics-macros"
-version = "0.7.0"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddece26afd34c31585c74a4db0630c376df271c285d682d1e55012197830b6df"
+checksum = "38b4faf00617defe497754acde3024865bc143d44a86799b24e191ecff91354f"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "metrics-util"
-version = "0.15.1"
+version = "0.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4de2ed6e491ed114b40b732e4d1659a9d53992ebd87490c44a6ffe23739d973e"
+checksum = "4259040465c955f9f2f1a4a8a16dc46726169bca0f88e8fb2dbeced487c3e828"
 dependencies = [
  "crossbeam-epoch",
  "crossbeam-utils",
- "hashbrown 0.13.1",
- "metrics",
+ "hashbrown 0.14.5",
+ "metrics 0.23.0",
  "num_cpus",
  "quanta",
  "sketches-ddsketch",
@@ -1501,6 +2019,25 @@ dependencies = [
  "unicase",
 ]
 
+[[package]]
+name = "minijinja"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e136ef580d7955019ab0a407b68d77c292a9976907e217900f3f76bc8f6dc1a4"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "minijinja-contrib"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15ee37078c98d31e510d6a7af488031a2c3ccacdb76c5c4fc98ddfe6d0e9da07"
+dependencies = [
+ "minijinja",
+ "serde",
+]
+
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
@@ -1509,18 +2046,19 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
 
 [[package]]
 name = "miniz_oxide"
-version = "0.7.1"
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08"
 dependencies = [
  "adler",
+ "simd-adler32",
 ]
 
 [[package]]
 name = "mio"
-version = "0.8.8"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2"
+checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
 dependencies = [
  "libc",
  "log",
@@ -1529,10 +2067,16 @@ dependencies = [
 ]
 
 [[package]]
-name = "monostate"
-version = "0.1.9"
+name = "mirai-annotations"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15f370ae88093ec6b11a710dec51321a61d420fafd1bad6e30d01bd9c920e8ee"
+checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1"
+
+[[package]]
+name = "monostate"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d208407d7552cd041d8cdb69a1bc3303e029c598738177a3d87082004dc0e1e"
 dependencies = [
  "monostate-impl",
  "serde",
@@ -1540,20 +2084,20 @@ dependencies = [
 
 [[package]]
 name = "monostate-impl"
-version = "0.1.9"
+version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce"
+checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "multimap"
-version = "0.8.3"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
+checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03"
 
 [[package]]
 name = "muxado"
@@ -1574,22 +2118,12 @@ dependencies = [
  "tracing",
 ]
 
-[[package]]
-name = "nanorand"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3"
-dependencies = [
- "getrandom",
-]
-
 [[package]]
 name = "native-tls"
-version = "0.2.11"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
+checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466"
 dependencies = [
- "lazy_static",
  "libc",
  "log",
  "openssl",
@@ -1602,26 +2136,32 @@ dependencies = [
 ]
 
 [[package]]
-name = "ngrok"
-version = "0.12.4"
+name = "new_debug_unreachable"
+version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87e211f407b0a084f720823a00c956aeab2c15dfe7a61760d93227bbaf048026"
+checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
+
+[[package]]
+name = "ngrok"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1454b1edbc5f2c8ff3242c237cb84388b50eced8eb26b4204e49698ed6511784"
 dependencies = [
  "arc-swap",
  "async-rustls",
  "async-trait",
  "awaitdrop",
- "axum",
+ "axum 0.6.20",
  "base64 0.13.1",
  "bytes",
  "futures",
  "hostname",
- "hyper",
+ "hyper 0.14.29",
  "muxado",
  "once_cell",
  "parking_lot",
  "regex",
- "rustls-pemfile",
+ "rustls-pemfile 1.0.4",
  "serde",
  "serde_json",
  "thiserror",
@@ -1634,16 +2174,14 @@ dependencies = [
 
 [[package]]
 name = "nix"
-version = "0.26.2"
+version = "0.28.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
+checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.6.0",
  "cfg-if",
+ "cfg_aliases",
  "libc",
- "memoffset 0.7.1",
- "pin-utils",
- "static_assertions",
 ]
 
 [[package]]
@@ -1662,6 +2200,12 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "noop_proc_macro"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8"
+
 [[package]]
 name = "ntapi"
 version = "0.4.1"
@@ -1682,10 +2226,97 @@ dependencies = [
 ]
 
 [[package]]
-name = "num-traits"
-version = "0.2.16"
+name = "num"
+version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c165a9ab64cf766f73521c0dd2cfdff64f488b8f0b3e621face3462d3db536d7"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-cmp"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63335b2e2c34fae2fb0aa2cecfd9f0832a1e24b3b32ecec612c3426d46dc8aaa"
+
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-conv"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+
+[[package]]
+name = "num-derive"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
+]
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 dependencies = [
  "autocfg",
  "libm",
@@ -1703,19 +2334,13 @@ dependencies = [
 
 [[package]]
 name = "num_threads"
-version = "0.1.6"
+version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
+checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9"
 dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "number_prefix"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
-
 [[package]]
 name = "number_prefix"
 version = "0.4.0"
@@ -1724,18 +2349,18 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
 
 [[package]]
 name = "object"
-version = "0.31.1"
+version = "0.36.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1"
+checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434"
 dependencies = [
  "memchr",
 ]
 
 [[package]]
 name = "once_cell"
-version = "1.18.0"
+version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
 
 [[package]]
 name = "onig"
@@ -1761,11 +2386,11 @@ dependencies = [
 
 [[package]]
 name = "openssl"
-version = "0.10.56"
+version = "0.10.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "729b745ad4a5575dd06a3e1af1414bd330ee561c01b3899eb584baeaa8def17e"
+checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.6.0",
  "cfg-if",
  "foreign-types",
  "libc",
@@ -1782,7 +2407,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
@@ -1793,9 +2418,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.91"
+version = "0.9.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "866b5f16f90776b9bb8dc1e1802ac6f0513de3a7a7465867bfbc563dc737faac"
+checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2"
 dependencies = [
  "cc",
  "libc",
@@ -1805,81 +2430,80 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry"
-version = "0.18.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
+checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
 dependencies = [
- "opentelemetry_api 0.18.0",
- "opentelemetry_sdk 0.18.0",
+ "opentelemetry_api",
+ "opentelemetry_sdk 0.20.0",
 ]
 
 [[package]]
 name = "opentelemetry"
-version = "0.19.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
+checksum = "1e32339a5dc40459130b3bd269e9892439f55b33e772d2a9d402a789baaf4e8a"
 dependencies = [
- "opentelemetry_api 0.19.0",
- "opentelemetry_sdk 0.19.0",
-]
-
-[[package]]
-name = "opentelemetry-otlp"
-version = "0.12.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
-dependencies = [
- "async-trait",
- "futures",
- "futures-util",
- "http",
- "opentelemetry 0.19.0",
- "opentelemetry-proto",
- "prost",
- "thiserror",
- "tokio",
- "tonic 0.8.3",
-]
-
-[[package]]
-name = "opentelemetry-proto"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
-dependencies = [
- "futures",
- "futures-util",
- "opentelemetry 0.19.0",
- "prost",
- "tonic 0.8.3",
-]
-
-[[package]]
-name = "opentelemetry_api"
-version = "0.18.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
-dependencies = [
- "fnv",
- "futures-channel",
- "futures-util",
- "indexmap 1.9.3",
+ "futures-core",
+ "futures-sink",
+ "indexmap 2.2.6",
  "js-sys",
  "once_cell",
  "pin-project-lite",
  "thiserror",
+ "urlencoding",
+]
+
+[[package]]
+name = "opentelemetry-otlp"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
+dependencies = [
+ "async-trait",
+ "futures-core",
+ "http 0.2.12",
+ "opentelemetry-proto",
+ "opentelemetry-semantic-conventions",
+ "opentelemetry_api",
+ "opentelemetry_sdk 0.20.0",
+ "prost 0.11.9",
+ "thiserror",
+ "tokio",
+ "tonic 0.9.2",
+]
+
+[[package]]
+name = "opentelemetry-proto"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
+dependencies = [
+ "opentelemetry_api",
+ "opentelemetry_sdk 0.20.0",
+ "prost 0.11.9",
+ "tonic 0.9.2",
+]
+
+[[package]]
+name = "opentelemetry-semantic-conventions"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
+dependencies = [
+ "opentelemetry 0.20.0",
 ]
 
 [[package]]
 name = "opentelemetry_api"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
+checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
 dependencies = [
- "fnv",
  "futures-channel",
  "futures-util",
  "indexmap 1.9.3",
+ "js-sys",
  "once_cell",
  "pin-project-lite",
  "thiserror",
@@ -1888,21 +2512,22 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry_sdk"
-version = "0.18.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
+checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
 dependencies = [
  "async-trait",
  "crossbeam-channel",
- "dashmap",
- "fnv",
  "futures-channel",
  "futures-executor",
  "futures-util",
  "once_cell",
- "opentelemetry_api 0.18.0",
+ "opentelemetry_api",
+ "ordered-float 3.9.2",
  "percent-encoding",
  "rand",
+ "regex",
+ "serde_json",
  "thiserror",
  "tokio",
  "tokio-stream",
@@ -1910,24 +2535,46 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry_sdk"
-version = "0.19.0"
+version = "0.21.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
+checksum = "2f16aec8a98a457a52664d69e0091bac3a0abd18ead9b641cb00202ba4e0efe4"
 dependencies = [
  "async-trait",
  "crossbeam-channel",
- "dashmap",
- "fnv",
  "futures-channel",
  "futures-executor",
  "futures-util",
+ "glob",
  "once_cell",
- "opentelemetry_api 0.19.0",
+ "opentelemetry 0.21.0",
+ "ordered-float 4.2.0",
  "percent-encoding",
  "rand",
  "thiserror",
- "tokio",
- "tokio-stream",
+]
+
+[[package]]
+name = "option-ext"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+
+[[package]]
+name = "ordered-float"
+version = "3.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "ordered-float"
+version = "4.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e"
+dependencies = [
+ "num-traits",
 ]
 
 [[package]]
@@ -1938,9 +2585,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 
 [[package]]
 name = "papergrid"
-version = "0.9.1"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae7891b22598926e4398790c8fe6447930c72a67d36d983a49d6ce682ce83290"
+checksum = "a2ccbe15f2b6db62f9a9871642746427e297b0ceb85f9a7f1ee5ff47d184d0c8"
 dependencies = [
  "bytecount",
  "fnv",
@@ -1949,9 +2596,9 @@ dependencies = [
 
 [[package]]
 name = "parking_lot"
-version = "0.12.1"
+version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
+checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
 dependencies = [
  "lock_api",
  "parking_lot_core",
@@ -1959,87 +2606,64 @@ dependencies = [
 
 [[package]]
 name = "parking_lot_core"
-version = "0.9.8"
+version = "0.9.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
+checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall 0.3.5",
+ "redox_syscall",
  "smallvec",
- "windows-targets 0.48.1",
-]
-
-[[package]]
-name = "password-hash"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700"
-dependencies = [
- "base64ct",
- "rand_core",
- "subtle",
+ "windows-targets 0.52.5",
 ]
 
 [[package]]
 name = "paste"
-version = "1.0.14"
+version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
-
-[[package]]
-name = "pbkdf2"
-version = "0.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917"
-dependencies = [
- "digest",
- "hmac",
- "password-hash",
- "sha2",
-]
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
 
 [[package]]
 name = "percent-encoding"
-version = "2.3.0"
+version = "2.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
 
 [[package]]
 name = "petgraph"
-version = "0.6.3"
+version = "0.6.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4"
+checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
 dependencies = [
  "fixedbitset",
- "indexmap 1.9.3",
+ "indexmap 2.2.6",
 ]
 
 [[package]]
 name = "pin-project"
-version = "1.1.3"
+version = "1.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422"
+checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3"
 dependencies = [
  "pin-project-internal",
 ]
 
 [[package]]
 name = "pin-project-internal"
-version = "1.1.3"
+version = "1.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405"
+checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "pin-project-lite"
-version = "0.2.12"
+version = "0.2.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12cc1b0bf1727a77a54b6654e7b5f1af8604923edc8b81885f8ec92f9e3f0a05"
+checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
 
 [[package]]
 name = "pin-utils"
@@ -2049,15 +2673,34 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
 
 [[package]]
 name = "pkg-config"
-version = "0.3.27"
+version = "0.3.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
+checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec"
+
+[[package]]
+name = "png"
+version = "0.17.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06e4b0d3d1312775e782c86c91a111aa1f910cbb65e1337f9975b5f9a554b5e1"
+dependencies = [
+ "bitflags 1.3.2",
+ "crc32fast",
+ "fdeflate",
+ "flate2",
+ "miniz_oxide",
+]
 
 [[package]]
 name = "portable-atomic"
-version = "1.4.2"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f32154ba0af3a075eefa1eda8bb414ee928f62303a54ea85b8d6638ff1a6ee9e"
+checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0"
+
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
 
 [[package]]
 name = "ppv-lite86"
@@ -2067,12 +2710,12 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
 [[package]]
 name = "prettyplease"
-version = "0.1.25"
+version = "0.2.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86"
+checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e"
 dependencies = [
  "proc-macro2",
- "syn 1.0.109",
+ "syn 2.0.68",
 ]
 
 [[package]]
@@ -2101,13 +2744,32 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.66"
+version = "1.0.86"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
 dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "profiling"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43d84d1d7a6ac92673717f9f6d1518374ef257669c24ebc5ac25d5033828be58"
+dependencies = [
+ "profiling-procmacros",
+]
+
+[[package]]
+name = "profiling-procmacros"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8021cf59c8ec9c432cfc2526ac6b8aa508ecaf29cd415f271b8406c1b851c3fd"
+dependencies = [
+ "quote",
+ "syn 2.0.68",
+]
+
 [[package]]
 name = "prost"
 version = "0.11.9"
@@ -2115,29 +2777,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd"
 dependencies = [
  "bytes",
- "prost-derive",
+ "prost-derive 0.11.9",
+]
+
+[[package]]
+name = "prost"
+version = "0.12.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29"
+dependencies = [
+ "bytes",
+ "prost-derive 0.12.6",
 ]
 
 [[package]]
 name = "prost-build"
-version = "0.11.9"
+version = "0.12.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
+checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
 dependencies = [
  "bytes",
- "heck",
- "itertools 0.10.5",
- "lazy_static",
+ "heck 0.5.0",
+ "itertools 0.12.1",
  "log",
  "multimap",
+ "once_cell",
  "petgraph",
  "prettyplease",
- "prost",
+ "prost 0.12.6",
  "prost-types",
  "regex",
- "syn 1.0.109",
+ "syn 2.0.68",
  "tempfile",
- "which",
 ]
 
 [[package]]
@@ -2154,23 +2825,44 @@ dependencies = [
 ]
 
 [[package]]
-name = "prost-types"
-version = "0.11.9"
+name = "prost-derive"
+version = "0.12.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13"
+checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1"
 dependencies = [
- "prost",
+ "anyhow",
+ "itertools 0.12.1",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
+]
+
+[[package]]
+name = "prost-types"
+version = "0.12.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0"
+dependencies = [
+ "prost 0.12.6",
+]
+
+[[package]]
+name = "qoi"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001"
+dependencies = [
+ "bytemuck",
 ]
 
 [[package]]
 name = "quanta"
-version = "0.11.1"
+version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a17e662a7a8291a865152364c20c7abc5e60486ab2001e8ec10b24862de0b9ab"
+checksum = "8e5167a477619228a0b284fac2674e3c388cba90631d7b7de620e6f1fcd08da5"
 dependencies = [
  "crossbeam-utils",
  "libc",
- "mach2",
  "once_cell",
  "raw-cpuid",
  "wasi",
@@ -2179,10 +2871,16 @@ dependencies = [
 ]
 
 [[package]]
-name = "quote"
-version = "1.0.32"
+name = "quick-error"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
+checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
+
+[[package]]
+name = "quote"
+version = "1.0.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
 dependencies = [
  "proc-macro2",
 ]
@@ -2219,31 +2917,85 @@ dependencies = [
 
 [[package]]
 name = "ratatui"
-version = "0.20.1"
+version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcc0d032bccba900ee32151ec0265667535c230169f5a011154cdcd984e16829"
+checksum = "2e2e4cd95294a85c3b4446e63ef054eea43e0205b1fd60120c16b74ff7ff96ad"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.6.0",
  "cassowary",
  "crossterm",
+ "indoc",
+ "itertools 0.11.0",
+ "paste",
+ "strum",
  "unicode-segmentation",
  "unicode-width",
 ]
 
 [[package]]
-name = "raw-cpuid"
-version = "10.7.0"
+name = "rav1e"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c297679cb867470fa8c9f67dbba74a78d78e3e98d7cf2b08d6d71540f797332"
+checksum = "cd87ce80a7665b1cce111f8a16c1f3929f6547ce91ade6addf4ec86a8dda5ce9"
 dependencies = [
- "bitflags 1.3.2",
+ "arbitrary",
+ "arg_enum_proc_macro",
+ "arrayvec",
+ "av1-grain",
+ "bitstream-io",
+ "built",
+ "cfg-if",
+ "interpolate_name",
+ "itertools 0.12.1",
+ "libc",
+ "libfuzzer-sys",
+ "log",
+ "maybe-rayon",
+ "new_debug_unreachable",
+ "noop_proc_macro",
+ "num-derive",
+ "num-traits",
+ "once_cell",
+ "paste",
+ "profiling",
+ "rand",
+ "rand_chacha",
+ "simd_helpers",
+ "system-deps",
+ "thiserror",
+ "v_frame",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "ravif"
+version = "0.11.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67376f469e7e7840d0040bbf4b9b3334005bb167f814621326e4c7ab8cd6e944"
+dependencies = [
+ "avif-serialize",
+ "imgref",
+ "loop9",
+ "quick-error",
+ "rav1e",
+ "rayon",
+ "rgb",
+]
+
+[[package]]
+name = "raw-cpuid"
+version = "11.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e29830cbb1290e404f24c73af91c5d8d631ce7e128691e9477556b540cd01ecd"
+dependencies = [
+ "bitflags 2.6.0",
 ]
 
 [[package]]
 name = "rayon"
-version = "1.7.0"
+version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
 dependencies = [
  "either",
  "rayon-core",
@@ -2251,66 +3003,55 @@ dependencies = [
 
 [[package]]
 name = "rayon-cond"
-version = "0.1.0"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd1259362c9065e5ea39a789ef40b1e3fd934c94beb7b5ab3ac6629d3b5e7cb7"
+checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
 dependencies = [
  "either",
- "itertools 0.8.2",
+ "itertools 0.11.0",
  "rayon",
 ]
 
 [[package]]
 name = "rayon-core"
-version = "1.11.0"
+version = "1.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
 dependencies = [
- "crossbeam-channel",
  "crossbeam-deque",
  "crossbeam-utils",
- "num_cpus",
 ]
 
 [[package]]
 name = "redox_syscall"
-version = "0.2.16"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
+checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd"
 dependencies = [
- "bitflags 1.3.2",
-]
-
-[[package]]
-name = "redox_syscall"
-version = "0.3.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
-dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.6.0",
 ]
 
 [[package]]
 name = "redox_users"
-version = "0.4.3"
+version = "0.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b"
+checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891"
 dependencies = [
  "getrandom",
- "redox_syscall 0.2.16",
+ "libredox",
  "thiserror",
 ]
 
 [[package]]
 name = "regex"
-version = "1.9.3"
+version = "1.10.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a"
+checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f"
 dependencies = [
- "aho-corasick 1.0.3",
+ "aho-corasick",
  "memchr",
- "regex-automata 0.3.6",
- "regex-syntax 0.7.4",
+ "regex-automata 0.4.7",
+ "regex-syntax 0.8.4",
 ]
 
 [[package]]
@@ -2324,13 +3065,13 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.3.6"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69"
+checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
 dependencies = [
- "aho-corasick 1.0.3",
+ "aho-corasick",
  "memchr",
- "regex-syntax 0.7.4",
+ "regex-syntax 0.8.4",
 ]
 
 [[package]]
@@ -2341,25 +3082,25 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
 
 [[package]]
 name = "regex-syntax"
-version = "0.7.4"
+version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
+checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
 
 [[package]]
 name = "reqwest"
-version = "0.11.18"
+version = "0.11.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55"
+checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62"
 dependencies = [
- "base64 0.21.2",
+ "base64 0.21.7",
  "bytes",
  "encoding_rs",
  "futures-core",
  "futures-util",
- "h2",
- "http",
- "http-body",
- "hyper",
+ "h2 0.3.26",
+ "http 0.2.12",
+ "http-body 0.4.6",
+ "hyper 0.14.29",
  "hyper-tls",
  "ipnet",
  "js-sys",
@@ -2369,9 +3110,12 @@ dependencies = [
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
+ "rustls-pemfile 1.0.4",
  "serde",
  "serde_json",
  "serde_urlencoded",
+ "sync_wrapper 0.1.2",
+ "system-configuration",
  "tokio",
  "tokio-native-tls",
  "tower-service",
@@ -2382,6 +3126,15 @@ dependencies = [
  "winreg",
 ]
 
+[[package]]
+name = "rgb"
+version = "0.8.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05aaa8004b64fd573fc9d002f4e632d51ad4f026c2b5ba95fcb6c2f32c2c47d8"
+dependencies = [
+ "bytemuck",
+]
+
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -2392,16 +3145,31 @@ dependencies = [
  "libc",
  "once_cell",
  "spin 0.5.2",
- "untrusted",
+ "untrusted 0.7.1",
  "web-sys",
  "winapi",
 ]
 
 [[package]]
-name = "rust-embed"
-version = "6.8.1"
+name = "ring"
+version = "0.17.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a36224c3276f8c4ebc8c20f158eca7ca4359c8db89991c4925132aaaf6702661"
+checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom",
+ "libc",
+ "spin 0.9.8",
+ "untrusted 0.9.0",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rust-embed"
+version = "8.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19549741604902eb99a7ed0ee177a0663ee1eda51a29f71401f166e47e77806a"
 dependencies = [
  "rust-embed-impl",
  "rust-embed-utils",
@@ -2410,23 +3178,22 @@ dependencies = [
 
 [[package]]
 name = "rust-embed-impl"
-version = "6.8.1"
+version = "8.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49b94b81e5b2c284684141a2fb9e2a31be90638caf040bf9afbc5a0416afe1ac"
+checksum = "cb9f96e283ec64401f30d3df8ee2aaeb2561f34c824381efa24a35f79bf40ee4"
 dependencies = [
  "proc-macro2",
  "quote",
  "rust-embed-utils",
- "shellexpand",
- "syn 2.0.28",
+ "syn 2.0.68",
  "walkdir",
 ]
 
 [[package]]
 name = "rust-embed-utils"
-version = "7.8.1"
+version = "8.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d38ff6bf570dc3bb7100fce9f7b60c33fa71d80e88da3f2580df4ff2bdded74"
+checksum = "38c74a686185620830701348de757fd36bef4aa9680fd23c49fc539ddcc1af32"
 dependencies = [
  "sha2",
  "walkdir",
@@ -2434,9 +3201,15 @@ dependencies = [
 
 [[package]]
 name = "rustc-demangle"
-version = "0.1.23"
+version = "0.1.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
+[[package]]
+name = "rustc-hash"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
 
 [[package]]
 name = "rustc_version"
@@ -2449,49 +3222,119 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.8"
+version = "0.38.34"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f"
+checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f"
 dependencies = [
- "bitflags 2.4.0",
+ "bitflags 2.6.0",
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "rustls"
-version = "0.20.8"
+version = "0.20.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f"
+checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99"
 dependencies = [
  "log",
- "ring",
+ "ring 0.16.20",
  "sct",
  "webpki",
 ]
 
 [[package]]
-name = "rustls-pemfile"
-version = "1.0.3"
+name = "rustls"
+version = "0.22.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d3987094b1d07b653b7dfdc3f70ce9a1da9c51ac18c1b06b662e4f9a0e9f4b2"
+checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
 dependencies = [
- "base64 0.21.2",
+ "log",
+ "ring 0.17.8",
+ "rustls-pki-types",
+ "rustls-webpki",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls"
+version = "0.23.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05cff451f60db80f490f3c182b77c35260baace73209e9cdbbe526bfe3a4d402"
+dependencies = [
+ "aws-lc-rs",
+ "log",
+ "once_cell",
+ "rustls-pki-types",
+ "rustls-webpki",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-native-certs"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792"
+dependencies = [
+ "openssl-probe",
+ "rustls-pemfile 2.1.2",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework",
+]
+
+[[package]]
+name = "rustls-pemfile"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c"
+dependencies = [
+ "base64 0.21.7",
+]
+
+[[package]]
+name = "rustls-pemfile"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d"
+dependencies = [
+ "base64 0.22.1",
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "rustls-pki-types"
+version = "1.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d"
+
+[[package]]
+name = "rustls-webpki"
+version = "0.102.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff448f7e92e913c4b7d4c6d8e4540a1724b319b4152b8aef6d4cf8339712b33e"
+dependencies = [
+ "aws-lc-rs",
+ "ring 0.17.8",
+ "rustls-pki-types",
+ "untrusted 0.9.0",
 ]
 
 [[package]]
 name = "rustversion"
-version = "1.0.14"
+version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4"
+checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
 
 [[package]]
 name = "ryu"
-version = "1.0.15"
+version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
 
 [[package]]
 name = "same-file"
@@ -2504,11 +3347,11 @@ dependencies = [
 
 [[package]]
 name = "schannel"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88"
+checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534"
 dependencies = [
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -2519,21 +3362,21 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
 [[package]]
 name = "sct"
-version = "0.7.0"
+version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4"
+checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
 dependencies = [
- "ring",
- "untrusted",
+ "ring 0.17.8",
+ "untrusted 0.9.0",
 ]
 
 [[package]]
 name = "security-framework"
-version = "2.9.2"
+version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de"
+checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.6.0",
  "core-foundation",
  "core-foundation-sys",
  "libc",
@@ -2542,9 +3385,9 @@ dependencies = [
 
 [[package]]
 name = "security-framework-sys"
-version = "2.9.1"
+version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a"
+checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7"
 dependencies = [
  "core-foundation-sys",
  "libc",
@@ -2552,35 +3395,38 @@ dependencies = [
 
 [[package]]
 name = "semver"
-version = "1.0.18"
+version = "1.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918"
+checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b"
+dependencies = [
+ "serde",
+]
 
 [[package]]
 name = "serde"
-version = "1.0.183"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
+checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.183"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
+checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.104"
+version = "1.0.118"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c"
+checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4"
 dependencies = [
  "itoa",
  "ryu",
@@ -2589,14 +3435,23 @@ dependencies = [
 
 [[package]]
 name = "serde_path_to_error"
-version = "0.1.14"
+version = "0.1.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4beec8bce849d58d06238cb50db2e1c417cfeafa4c63f692b15c82b7c80f8335"
+checksum = "af99884400da37c88f5e9146b7f1fd0fbcae8f6eec4e9da38b67d05486f814a6"
 dependencies = [
  "itoa",
  "serde",
 ]
 
+[[package]]
+name = "serde_spanned"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "serde_urlencoded"
 version = "0.7.1"
@@ -2609,22 +3464,11 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "sha1"
-version = "0.10.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3"
-dependencies = [
- "cfg-if",
- "cpufeatures",
- "digest",
-]
-
 [[package]]
 name = "sha2"
-version = "0.10.7"
+version = "0.10.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8"
+checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8"
 dependencies = [
  "cfg-if",
  "cpufeatures",
@@ -2633,21 +3477,18 @@ dependencies = [
 
 [[package]]
 name = "sharded-slab"
-version = "0.1.4"
+version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
 dependencies = [
  "lazy_static",
 ]
 
 [[package]]
-name = "shellexpand"
-version = "2.1.2"
+name = "shlex"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ccc8076840c4da029af4f87e4e8daeb0fca6b87bbb02e10cb60b791450e11e4"
-dependencies = [
- "dirs",
-]
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
 [[package]]
 name = "signal-hook"
@@ -2672,61 +3513,66 @@ dependencies = [
 
 [[package]]
 name = "signal-hook-registry"
-version = "1.4.1"
+version = "1.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1"
+checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1"
 dependencies = [
  "libc",
 ]
 
 [[package]]
-name = "sketches-ddsketch"
-version = "0.2.1"
+name = "simd-adler32"
+version = "0.3.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68a406c1882ed7f29cd5e248c9848a80e7cb6ae0fea82346d2746f2f941c07e1"
+checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe"
+
+[[package]]
+name = "simd_helpers"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6"
+dependencies = [
+ "quote",
+]
+
+[[package]]
+name = "sketches-ddsketch"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c"
 
 [[package]]
 name = "slab"
-version = "0.4.8"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d"
+checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67"
 dependencies = [
  "autocfg",
 ]
 
 [[package]]
 name = "slotmap"
-version = "1.0.6"
+version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1e08e261d0e8f5c43123b7adf3e4ca1690d655377ac93a03b2c9d3e98de1342"
+checksum = "dbff4acf519f630b3a3ddcfaea6c06b42174d9a44bc70c620e9ed1649d58b82a"
 dependencies = [
  "version_check",
 ]
 
 [[package]]
 name = "smallvec"
-version = "1.11.0"
+version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
 
 [[package]]
 name = "socket2"
-version = "0.4.9"
+version = "0.5.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662"
+checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c"
 dependencies = [
  "libc",
- "winapi",
-]
-
-[[package]]
-name = "socket2"
-version = "0.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877"
-dependencies = [
- "libc",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -2757,22 +3603,38 @@ dependencies = [
 ]
 
 [[package]]
-name = "static_assertions"
-version = "1.1.0"
+name = "strsim"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
 [[package]]
-name = "strsim"
-version = "0.10.0"
+name = "strum"
+version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
+checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125"
+dependencies = [
+ "strum_macros",
+]
+
+[[package]]
+name = "strum_macros"
+version = "0.25.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0"
+dependencies = [
+ "heck 0.4.1",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn 2.0.68",
+]
 
 [[package]]
 name = "subtle"
-version = "2.5.0"
+version = "2.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
 
 [[package]]
 name = "syn"
@@ -2787,9 +3649,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.28"
+version = "2.0.68"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
+checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2803,24 +3665,64 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
 
 [[package]]
-name = "sysinfo"
-version = "0.29.8"
+name = "sync_wrapper"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d10ed79c22663a35a255d289a7fdcb43559fc77ff15df5ce6c341809e7867528"
+checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
+
+[[package]]
+name = "sysinfo"
+version = "0.30.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "732ffa00f53e6b2af46208fba5718d9662a421049204e156328b66791ffa15ae"
 dependencies = [
  "cfg-if",
  "core-foundation-sys",
  "libc",
  "ntapi",
  "once_cell",
- "winapi",
+ "windows",
+]
+
+[[package]]
+name = "system-configuration"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7"
+dependencies = [
+ "bitflags 1.3.2",
+ "core-foundation",
+ "system-configuration-sys",
+]
+
+[[package]]
+name = "system-configuration-sys"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "system-deps"
+version = "6.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3e535eb8dded36d55ec13eddacd30dec501792ff23a0b1682c38601b8cf2349"
+dependencies = [
+ "cfg-expr",
+ "heck 0.5.0",
+ "pkg-config",
+ "toml",
+ "version-compare",
 ]
 
 [[package]]
 name = "tabled"
-version = "0.12.2"
+version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ce69a5028cd9576063ec1f48edb2c75339fd835e6094ef3e05b3a079bf594a6"
+checksum = "dfe9c3632da101aba5131ed63f9eed38665f8b3c68703a6bb18124835c1a5d22"
 dependencies = [
  "papergrid",
  "tabled_derive",
@@ -2833,7 +3735,7 @@ version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4"
 dependencies = [
- "heck",
+ "heck 0.4.1",
  "proc-macro-error",
  "proc-macro2",
  "quote",
@@ -2841,37 +3743,32 @@ dependencies = [
 ]
 
 [[package]]
-name = "tar"
-version = "0.4.40"
+name = "target-lexicon"
+version = "0.12.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
-dependencies = [
- "filetime",
- "libc",
- "xattr",
-]
+checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f"
 
 [[package]]
 name = "tempfile"
-version = "3.7.1"
+version = "3.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651"
+checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1"
 dependencies = [
  "cfg-if",
  "fastrand",
- "redox_syscall 0.3.5",
  "rustix",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
 name = "text-generation-benchmark"
-version = "1.0.1"
+version = "2.1.1-dev0"
 dependencies = [
  "average",
  "clap",
  "crossterm",
  "float-ord",
+ "hf-hub",
  "ratatui",
  "serde",
  "serde_json",
@@ -2886,15 +3783,17 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "1.0.1"
+version = "2.1.1-dev0"
 dependencies = [
+ "async-trait",
+ "base64 0.22.1",
  "futures",
  "grpc-metadata",
- "prost",
+ "prost 0.12.6",
  "prost-build",
  "thiserror",
  "tokio",
- "tonic 0.9.2",
+ "tonic 0.10.2",
  "tonic-build",
  "tower",
  "tracing",
@@ -2902,15 +3801,18 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "1.0.1"
+version = "2.1.1-dev0"
 dependencies = [
  "clap",
  "ctrlc",
  "float_eq",
+ "hf-hub",
  "nix",
+ "once_cell",
  "reqwest",
  "serde",
  "serde_json",
+ "thiserror",
  "tracing",
  "tracing-subscriber",
  "vergen",
@@ -2918,21 +3820,31 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router"
-version = "1.0.1"
+version = "2.1.1-dev0"
 dependencies = [
  "async-stream",
- "axum",
+ "axum 0.7.5",
  "axum-tracing-opentelemetry",
+ "base64 0.22.1",
  "clap",
- "flume",
  "futures",
- "metrics",
+ "futures-util",
+ "hf-hub",
+ "image",
+ "init-tracing-opentelemetry",
+ "itertools 0.10.5",
+ "jsonschema",
+ "metrics 0.21.1",
  "metrics-exporter-prometheus",
+ "minijinja",
+ "minijinja-contrib",
  "ngrok",
  "nohash-hasher",
- "opentelemetry 0.19.0",
+ "once_cell",
+ "opentelemetry 0.20.0",
  "opentelemetry-otlp",
  "rand",
+ "regex",
  "reqwest",
  "serde",
  "serde_json",
@@ -2940,9 +3852,10 @@ dependencies = [
  "thiserror",
  "tokenizers",
  "tokio",
- "tower-http 0.4.3",
+ "tokio-stream",
+ "tower-http",
  "tracing",
- "tracing-opentelemetry 0.19.0",
+ "tracing-opentelemetry 0.21.0",
  "tracing-subscriber",
  "utoipa",
  "utoipa-swagger-ui",
@@ -2951,44 +3864,57 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.44"
+version = "1.0.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "611040a08a0439f8248d1990b111c95baa9c704c805fa1f62104b39655fd7f90"
+checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.44"
+version = "1.0.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "090198534930841fab3a5d1bb637cde49e339654e606195f8d9c76eeb081dc96"
+checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "thread_local"
-version = "1.1.7"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
+checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
 dependencies = [
  "cfg-if",
  "once_cell",
 ]
 
 [[package]]
-name = "time"
-version = "0.3.25"
+name = "tiff"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0fdd63d58b18d663fbdf70e049f00a22c8e42be082203be7f26589213cd75ea"
+checksum = "ba1310fcea54c6a9a4fd1aad794ecc02c31682f6bfbecdf460bf19533eed1e3e"
+dependencies = [
+ "flate2",
+ "jpeg-decoder",
+ "weezl",
+]
+
+[[package]]
+name = "time"
+version = "0.3.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
 dependencies = [
  "deranged",
  "itoa",
  "libc",
+ "num-conv",
  "num_threads",
+ "powerfmt",
  "serde",
  "time-core",
  "time-macros",
@@ -2996,24 +3922,25 @@ dependencies = [
 
 [[package]]
 name = "time-core"
-version = "0.1.1"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
+checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
 
 [[package]]
 name = "time-macros"
-version = "0.2.11"
+version = "0.2.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb71511c991639bb078fd5bf97757e03914361c48100d52878b8e52b46fb92cd"
+checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
 dependencies = [
+ "num-conv",
  "time-core",
 ]
 
 [[package]]
 name = "tinyvec"
-version = "1.6.0"
+version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50"
+checksum = "c55115c6fbe2d2bef26eb09ad74bde02d8255476fc0c7b515ef09fbb35742d82"
 dependencies = [
  "tinyvec_macros",
 ]
@@ -3026,19 +3953,17 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokenizers"
-version = "0.13.4"
+version = "0.19.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aea68938177975ab09da68552b720eac941779ff386baceaf77e0f5f9cea645f"
+checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd"
 dependencies = [
- "aho-corasick 0.7.20",
- "cached-path",
- "clap",
+ "aho-corasick",
  "derive_builder",
- "dirs",
  "esaxx-rs",
  "getrandom",
- "indicatif 0.15.0",
- "itertools 0.9.0",
+ "hf-hub",
+ "indicatif",
+ "itertools 0.12.1",
  "lazy_static",
  "log",
  "macro_rules_attribute",
@@ -3049,8 +3974,7 @@ dependencies = [
  "rayon",
  "rayon-cond",
  "regex",
- "regex-syntax 0.7.4",
- "reqwest",
+ "regex-syntax 0.8.4",
  "serde",
  "serde_json",
  "spm_precompiled",
@@ -3062,9 +3986,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.31.0"
+version = "1.38.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40de3a2ba249dcb097e01be5e67a5ff53cf250397715a071a81543e8a832a920"
+checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a"
 dependencies = [
  "backtrace",
  "bytes",
@@ -3074,7 +3998,7 @@ dependencies = [
  "parking_lot",
  "pin-project-lite",
  "signal-hook-registry",
- "socket2 0.5.3",
+ "socket2",
  "tokio-macros",
  "windows-sys 0.48.0",
 ]
@@ -3091,13 +4015,13 @@ dependencies = [
 
 [[package]]
 name = "tokio-macros"
-version = "2.1.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
+checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
@@ -3122,10 +4046,21 @@ dependencies = [
 ]
 
 [[package]]
-name = "tokio-stream"
-version = "0.1.14"
+name = "tokio-rustls"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842"
+checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
+dependencies = [
+ "rustls 0.23.10",
+ "rustls-pki-types",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af"
 dependencies = [
  "futures-core",
  "pin-project-lite",
@@ -3134,9 +4069,9 @@ dependencies = [
 
 [[package]]
 name = "tokio-util"
-version = "0.7.8"
+version = "0.7.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
+checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1"
 dependencies = [
  "bytes",
  "futures-core",
@@ -3144,39 +4079,40 @@ dependencies = [
  "futures-sink",
  "pin-project-lite",
  "tokio",
- "tracing",
 ]
 
 [[package]]
-name = "tonic"
-version = "0.8.3"
+name = "toml"
+version = "0.8.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb"
+checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335"
 dependencies = [
- "async-stream",
- "async-trait",
- "axum",
- "base64 0.13.1",
- "bytes",
- "futures-core",
- "futures-util",
- "h2",
- "http",
- "http-body",
- "hyper",
- "hyper-timeout",
- "percent-encoding",
- "pin-project",
- "prost",
- "prost-derive",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "tower",
- "tower-layer",
- "tower-service",
- "tracing",
- "tracing-futures",
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit",
+]
+
+[[package]]
+name = "toml_datetime"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "toml_edit"
+version = "0.22.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
+dependencies = [
+ "indexmap 2.2.6",
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "winnow",
 ]
 
 [[package]]
@@ -3186,19 +4122,46 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a"
 dependencies = [
  "async-trait",
- "axum",
- "base64 0.21.2",
+ "axum 0.6.20",
+ "base64 0.21.7",
  "bytes",
  "futures-core",
  "futures-util",
- "h2",
- "http",
- "http-body",
- "hyper",
+ "h2 0.3.26",
+ "http 0.2.12",
+ "http-body 0.4.6",
+ "hyper 0.14.29",
  "hyper-timeout",
  "percent-encoding",
  "pin-project",
- "prost",
+ "prost 0.11.9",
+ "tokio",
+ "tokio-stream",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tonic"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e"
+dependencies = [
+ "async-stream",
+ "async-trait",
+ "axum 0.6.20",
+ "base64 0.21.7",
+ "bytes",
+ "h2 0.3.26",
+ "http 0.2.12",
+ "http-body 0.4.6",
+ "hyper 0.14.29",
+ "hyper-timeout",
+ "percent-encoding",
+ "pin-project",
+ "prost 0.12.6",
  "tokio",
  "tokio-stream",
  "tower",
@@ -3209,15 +4172,15 @@ dependencies = [
 
 [[package]]
 name = "tonic-build"
-version = "0.9.2"
+version = "0.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07"
+checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889"
 dependencies = [
  "prettyplease",
  "proc-macro2",
  "prost-build",
  "quote",
- "syn 1.0.109",
+ "syn 2.0.68",
 ]
 
 [[package]]
@@ -3242,36 +4205,15 @@ dependencies = [
 
 [[package]]
 name = "tower-http"
-version = "0.3.5"
+version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858"
+checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.6.0",
  "bytes",
- "futures-core",
- "futures-util",
- "http",
- "http-body",
- "http-range-header",
- "pin-project-lite",
- "tower-layer",
- "tower-service",
- "tracing",
-]
-
-[[package]]
-name = "tower-http"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55ae70283aba8d2a8b411c695c437fe25b8b5e44e23e780662002fc72fb47a82"
-dependencies = [
- "bitflags 2.4.0",
- "bytes",
- "futures-core",
- "futures-util",
- "http",
- "http-body",
- "http-range-header",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
  "pin-project-lite",
  "tower-layer",
  "tower-service",
@@ -3291,11 +4233,10 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
 
 [[package]]
 name = "tracing"
-version = "0.1.37"
+version = "0.1.40"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
+checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
 dependencies = [
- "cfg-if",
  "log",
  "pin-project-lite",
  "tracing-attributes",
@@ -3304,72 +4245,91 @@ dependencies = [
 
 [[package]]
 name = "tracing-attributes"
-version = "0.1.26"
+version = "0.1.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab"
+checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "tracing-core"
-version = "0.1.31"
+version = "0.1.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
+checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
 dependencies = [
  "once_cell",
  "valuable",
 ]
 
 [[package]]
-name = "tracing-futures"
-version = "0.2.5"
+name = "tracing-log"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2"
+checksum = "f751112709b4e791d8ce53e32c4ed2d353565a795ce84da2285393f41557bdf2"
 dependencies = [
- "pin-project",
- "tracing",
+ "log",
+ "once_cell",
+ "tracing-core",
 ]
 
 [[package]]
 name = "tracing-log"
-version = "0.1.3"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
 dependencies = [
- "lazy_static",
  "log",
+ "once_cell",
  "tracing-core",
 ]
 
 [[package]]
 name = "tracing-opentelemetry"
-version = "0.18.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
+checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8"
 dependencies = [
  "once_cell",
- "opentelemetry 0.18.0",
+ "opentelemetry 0.20.0",
+ "opentelemetry_sdk 0.20.0",
+ "smallvec",
  "tracing",
  "tracing-core",
- "tracing-log",
+ "tracing-log 0.1.4",
  "tracing-subscriber",
 ]
 
 [[package]]
 name = "tracing-opentelemetry"
-version = "0.19.0"
+version = "0.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
+checksum = "c67ac25c5407e7b961fafc6f7e9aa5958fd297aada2d20fa2ae1737357e55596"
 dependencies = [
+ "js-sys",
  "once_cell",
- "opentelemetry 0.19.0",
+ "opentelemetry 0.21.0",
+ "opentelemetry_sdk 0.21.2",
+ "smallvec",
  "tracing",
  "tracing-core",
- "tracing-log",
+ "tracing-log 0.2.0",
  "tracing-subscriber",
+ "web-time",
+]
+
+[[package]]
+name = "tracing-opentelemetry-instrumentation-sdk"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9920abb6a3ee3a2af7d30c9ff02900f8481935d36723c3da95cf807468218e8c"
+dependencies = [
+ "http 1.1.0",
+ "opentelemetry 0.21.0",
+ "tracing",
+ "tracing-opentelemetry 0.22.0",
 ]
 
 [[package]]
@@ -3384,9 +4344,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-subscriber"
-version = "0.3.17"
+version = "0.3.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
+checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
 dependencies = [
  "matchers",
  "nu-ansi-term",
@@ -3399,48 +4359,48 @@ dependencies = [
  "thread_local",
  "tracing",
  "tracing-core",
- "tracing-log",
+ "tracing-log 0.2.0",
  "tracing-serde",
 ]
 
 [[package]]
 name = "try-lock"
-version = "0.2.4"
+version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 
 [[package]]
 name = "typenum"
-version = "1.16.0"
+version = "1.17.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
+checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
 
 [[package]]
 name = "unicase"
-version = "2.6.0"
+version = "2.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89"
 dependencies = [
  "version_check",
 ]
 
 [[package]]
 name = "unicode-bidi"
-version = "0.3.13"
+version = "0.3.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
+checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.11"
+version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
 
 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
 dependencies = [
  "tinyvec",
 ]
@@ -3456,15 +4416,15 @@ dependencies = [
 
 [[package]]
 name = "unicode-segmentation"
-version = "1.10.1"
+version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
+checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
 
 [[package]]
 name = "unicode-width"
-version = "0.1.10"
+version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
 
 [[package]]
 name = "unicode_categories"
@@ -3479,10 +4439,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
 
 [[package]]
-name = "url"
-version = "2.4.0"
+name = "untrusted"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+
+[[package]]
+name = "ureq"
+version = "2.9.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd"
+dependencies = [
+ "base64 0.22.1",
+ "flate2",
+ "log",
+ "native-tls",
+ "once_cell",
+ "rustls 0.22.4",
+ "rustls-pki-types",
+ "rustls-webpki",
+ "serde",
+ "serde_json",
+ "url",
+ "webpki-roots",
+]
+
+[[package]]
+name = "url"
+version = "2.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
 dependencies = [
  "form_urlencoded",
  "idna",
@@ -3497,17 +4483,17 @@ checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
 
 [[package]]
 name = "utf8parse"
-version = "0.2.1"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 
 [[package]]
 name = "utoipa"
-version = "3.4.4"
+version = "4.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de634b7f8178c9c246c88ea251f3a0215c9a4d80778db2d7bd4423a78b5170ec"
+checksum = "c5afb1a60e207dca502682537fefcfd9921e71d0b83e9576060f09abc6efab23"
 dependencies = [
- "indexmap 2.0.0",
+ "indexmap 2.2.6",
  "serde",
  "serde_json",
  "utoipa-gen",
@@ -3515,24 +4501,24 @@ dependencies = [
 
 [[package]]
 name = "utoipa-gen"
-version = "3.4.5"
+version = "4.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fcba79cb3e5020d9bcc8313cd5aadaf51d6d54a6b3fd08c3d0360ae6b3c83d0"
+checksum = "7bf0e16c02bc4bf5322ab65f10ab1149bdbcaa782cba66dc7057370a3f8190be"
 dependencies = [
  "proc-macro-error",
  "proc-macro2",
  "quote",
  "regex",
- "syn 2.0.28",
+ "syn 2.0.68",
 ]
 
 [[package]]
 name = "utoipa-swagger-ui"
-version = "3.1.5"
+version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84614caa239fb25b2bb373a52859ffd94605ceb256eeb1d63436325cf81e3653"
+checksum = "0b39868d43c011961e04b41623e050aedf2cc93652562ff7935ce0f819aaf2da"
 dependencies = [
- "axum",
+ "axum 0.7.5",
  "mime_guess",
  "regex",
  "rust-embed",
@@ -3542,6 +4528,23 @@ dependencies = [
  "zip",
 ]
 
+[[package]]
+name = "uuid"
+version = "1.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439"
+
+[[package]]
+name = "v_frame"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6f32aaa24bacd11e488aa9ba66369c7cd514885742c9fe08cfe85884db3e92b"
+dependencies = [
+ "aligned-vec",
+ "num-traits",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "valuable"
 version = "0.1.0"
@@ -3556,17 +4559,26 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
 
 [[package]]
 name = "vergen"
-version = "8.2.4"
+version = "8.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bbc5ad0d9d26b2c49a5ab7da76c3e79d3ee37e7821799f8223fcb8f2f391a2e7"
+checksum = "e27d6bdd219887a9eadd19e1c34f32e47fa332301184935c6d9bca26f3cca525"
 dependencies = [
  "anyhow",
+ "cargo_metadata",
+ "cfg-if",
+ "regex",
  "rustc_version",
  "rustversion",
  "sysinfo",
  "time",
 ]
 
+[[package]]
+name = "version-compare"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "852e951cb7832cb45cb1169900d19760cfa39b82bc0ea9c0e5a14ae88411c98b"
+
 [[package]]
 name = "version_check"
 version = "0.9.4"
@@ -3575,9 +4587,9 @@ checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
 [[package]]
 name = "walkdir"
-version = "2.3.3"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
 dependencies = [
  "same-file",
  "winapi-util",
@@ -3600,9 +4612,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
 dependencies = [
  "cfg-if",
  "wasm-bindgen-macro",
@@ -3610,24 +4622,24 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
 dependencies = [
  "bumpalo",
  "log",
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.37"
+version = "0.4.42"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03"
+checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0"
 dependencies = [
  "cfg-if",
  "js-sys",
@@ -3637,9 +4649,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -3647,28 +4659,38 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.28",
+ "syn 2.0.68",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.87"
+version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
 
 [[package]]
 name = "web-sys"
-version = "0.3.64"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b"
+checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "web-time"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa30049b1c872b72c89866d458eae9f20380ab280ffd1b1e18df2d3e2d98cfe0"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -3676,23 +4698,39 @@ dependencies = [
 
 [[package]]
 name = "webpki"
-version = "0.22.0"
+version = "0.22.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd"
+checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53"
 dependencies = [
- "ring",
- "untrusted",
+ "ring 0.17.8",
+ "untrusted 0.9.0",
 ]
 
 [[package]]
-name = "which"
-version = "4.4.0"
+name = "webpki-roots"
+version = "0.26.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269"
+checksum = "bd7c23921eeb1713a4e851530e9b9756e4fb0e89978582942612524cf09f01cd"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "weezl"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082"
+
+[[package]]
+name = "which"
+version = "4.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
 dependencies = [
  "either",
- "libc",
+ "home",
  "once_cell",
+ "rustix",
 ]
 
 [[package]]
@@ -3713,11 +4751,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
 
 [[package]]
 name = "winapi-util"
-version = "0.1.5"
+version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
+checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b"
 dependencies = [
- "winapi",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -3726,6 +4764,25 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
+[[package]]
+name = "windows"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
+dependencies = [
+ "windows-core",
+ "windows-targets 0.52.5",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
+dependencies = [
+ "windows-targets 0.52.5",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.45.0"
@@ -3741,7 +4798,16 @@ version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
 dependencies = [
- "windows-targets 0.48.1",
+ "windows-targets 0.48.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.5",
 ]
 
 [[package]]
@@ -3761,17 +4827,33 @@ dependencies = [
 
 [[package]]
 name = "windows-targets"
-version = "0.48.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
 dependencies = [
- "windows_aarch64_gnullvm 0.48.0",
- "windows_aarch64_msvc 0.48.0",
- "windows_i686_gnu 0.48.0",
- "windows_i686_msvc 0.48.0",
- "windows_x86_64_gnu 0.48.0",
- "windows_x86_64_gnullvm 0.48.0",
- "windows_x86_64_msvc 0.48.0",
+ "windows_aarch64_gnullvm 0.48.5",
+ "windows_aarch64_msvc 0.48.5",
+ "windows_i686_gnu 0.48.5",
+ "windows_i686_msvc 0.48.5",
+ "windows_x86_64_gnu 0.48.5",
+ "windows_x86_64_gnullvm 0.48.5",
+ "windows_x86_64_msvc 0.48.5",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.5",
+ "windows_aarch64_msvc 0.52.5",
+ "windows_i686_gnu 0.52.5",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc 0.52.5",
+ "windows_x86_64_gnu 0.52.5",
+ "windows_x86_64_gnullvm 0.52.5",
+ "windows_x86_64_msvc 0.52.5",
 ]
 
 [[package]]
@@ -3782,9 +4864,15 @@ checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
 
 [[package]]
 name = "windows_aarch64_gnullvm"
-version = "0.48.0"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
 
 [[package]]
 name = "windows_aarch64_msvc"
@@ -3794,9 +4882,15 @@ checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
 
 [[package]]
 name = "windows_aarch64_msvc"
-version = "0.48.0"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
 
 [[package]]
 name = "windows_i686_gnu"
@@ -3806,9 +4900,21 @@ checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
 
 [[package]]
 name = "windows_i686_gnu"
-version = "0.48.0"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
 
 [[package]]
 name = "windows_i686_msvc"
@@ -3818,9 +4924,15 @@ checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
 
 [[package]]
 name = "windows_i686_msvc"
-version = "0.48.0"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
 
 [[package]]
 name = "windows_x86_64_gnu"
@@ -3830,9 +4942,15 @@ checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
 
 [[package]]
 name = "windows_x86_64_gnu"
-version = "0.48.0"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
 
 [[package]]
 name = "windows_x86_64_gnullvm"
@@ -3842,9 +4960,15 @@ checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
 
 [[package]]
 name = "windows_x86_64_gnullvm"
-version = "0.48.0"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
 
 [[package]]
 name = "windows_x86_64_msvc"
@@ -3854,26 +4978,73 @@ checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
 
 [[package]]
 name = "windows_x86_64_msvc"
-version = "0.48.0"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
 
 [[package]]
-name = "winreg"
-version = "0.10.1"
+name = "windows_x86_64_msvc"
+version = "0.52.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d"
+checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
+
+[[package]]
+name = "winnow"
+version = "0.6.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1"
 dependencies = [
- "winapi",
+ "memchr",
 ]
 
 [[package]]
-name = "xattr"
-version = "1.0.1"
+name = "winreg"
+version = "0.50.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4686009f71ff3e5c4dbcf1a282d0a44db3f021ba69350cd42086b3e5f1c6985"
+checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
 dependencies = [
- "libc",
+ "cfg-if",
+ "windows-sys 0.48.0",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.7.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae87e3fcd617500e5d106f0380cf7b77f3c6092aae37191433159dda23cfb087"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.7.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
+]
+
+[[package]]
+name = "zeroize"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
+dependencies = [
+ "zeroize_derive",
+]
+
+[[package]]
+name = "zeroize_derive"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
 ]
 
 [[package]]
@@ -3882,46 +5053,32 @@ version = "0.6.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
 dependencies = [
- "aes",
  "byteorder",
- "bzip2",
- "constant_time_eq",
  "crc32fast",
  "crossbeam-utils",
  "flate2",
- "hmac",
- "pbkdf2",
- "sha1",
- "time",
- "zstd",
 ]
 
 [[package]]
-name = "zstd"
-version = "0.11.2+zstd.1.5.2"
+name = "zune-core"
+version = "0.4.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
+checksum = "3f423a2c17029964870cfaabb1f13dfab7d092a62a29a89264f4d36990ca414a"
+
+[[package]]
+name = "zune-inflate"
+version = "0.2.54"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02"
 dependencies = [
- "zstd-safe",
+ "simd-adler32",
 ]
 
 [[package]]
-name = "zstd-safe"
-version = "5.0.2+zstd.1.5.2"
+name = "zune-jpeg"
+version = "0.4.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
+checksum = "ec866b44a2a1fd6133d363f073ca1b179f438f99e7e5bfb1e33f7181facfe448"
 dependencies = [
- "libc",
- "zstd-sys",
-]
-
-[[package]]
-name = "zstd-sys"
-version = "2.0.8+zstd.1.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
+ "zune-core",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 9f526b27..3866a8b3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,15 +6,32 @@ members = [
     "router/grpc-metadata",
     "launcher"
 ]
+resolver = "2"
 
 [workspace.package]
-version = "1.0.3"
+version = "2.1.2-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
 
+[workspace.dependencies]
+base64 = "0.22.0"
+tokenizers = { version = "0.19.1", features = ["http"] }
+hf-hub = { version = "0.3.1", features = ["tokio"] }
+
 [profile.release]
+incremental = true
+
+[profile.release-binary]
+inherits = "release"
 debug = 1
 incremental = true
-lto = "off"
 panic = "abort"
+
+[profile.release-opt]
+inherits = "release"
+debug = 0
+incremental = false
+lto = "fat"
+opt-level = 3
+codegen-units = 1
diff --git a/Dockerfile b/Dockerfile
index 45e304c4..d4772b4a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,11 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src
 
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
 
-FROM chef as planner
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
 COPY proto proto
@@ -15,9 +16,6 @@ RUN cargo chef prepare --recipe-path recipe.json
 
 FROM chef AS builder
 
-ARG GIT_SHA
-ARG DOCKER_LABEL
-
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@@ -25,7 +23,10 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     rm -f $PROTOC_ZIP
 
 COPY --from=planner /usr/src/recipe.json recipe.json
-RUN cargo chef cook --release --recipe-path recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
 
 COPY Cargo.toml Cargo.toml
 COPY rust-toolchain.toml rust-toolchain.toml
@@ -33,17 +34,17 @@ COPY proto proto
 COPY benchmark benchmark
 COPY router router
 COPY launcher launcher
-RUN cargo build --release
+RUN cargo build --profile release-opt
 
 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM debian:bullseye-slim as pytorch-install
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
 
-ARG PYTORCH_VERSION=2.0.1
-ARG PYTHON_VERSION=3.9
+ARG PYTORCH_VERSION=2.3.0
+ARG PYTHON_VERSION=3.10
 # Keep in sync with `server/pyproject.toml
-ARG CUDA_VERSION=11.8
-ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_VERSION=12.1
+ARG MAMBA_VERSION=24.3.0-0
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch
 # Automatically set by buildx
@@ -75,22 +76,21 @@ RUN chmod +x ~/mambaforge.sh && \
 RUN case ${TARGETPLATFORM} in \
          "linux/arm64")  exit 1 ;; \
          *)              /opt/conda/bin/conda update -y conda &&  \
-                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
+                         /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
     esac && \
     /opt/conda/bin/conda clean -ya
 
 # CUDA kernels builder image
-FROM pytorch-install as kernel-builder
+FROM pytorch-install AS kernel-builder
+
+ARG MAX_JOBS=8
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-        ninja-build \
+        ninja-build cmake \
         && rm -rf /var/lib/apt/lists/*
 
-RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0"  cuda==11.8 && \
-    /opt/conda/bin/conda clean -ya
-
 # Build Flash Attention CUDA kernels
-FROM kernel-builder as flash-att-builder
+FROM kernel-builder AS flash-att-builder
 
 WORKDIR /usr/src
 
@@ -100,48 +100,85 @@ COPY server/Makefile-flash-att Makefile
 RUN make build-flash-attention
 
 # Build Flash Attention v2 CUDA kernels
-FROM kernel-builder as flash-att-v2-builder
+FROM kernel-builder AS flash-att-v2-builder
 
 WORKDIR /usr/src
 
 COPY server/Makefile-flash-att-v2 Makefile
 
 # Build specific version of flash attention v2
-RUN make build-flash-attention-v2
+RUN make build-flash-attention-v2-cuda
 
 # Build Transformers exllama kernels
-FROM kernel-builder as exllama-kernels-builder
-
+FROM kernel-builder AS exllama-kernels-builder
 WORKDIR /usr/src
-
 COPY server/exllama_kernels/ .
 
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
+# Build Transformers exllamav2 kernels
+FROM kernel-builder AS exllamav2-kernels-builder
+WORKDIR /usr/src
+COPY server/exllamav2_kernels/ .
 
 # Build specific version of transformers
 RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 
-# Build Transformers CUDA kernels
-FROM kernel-builder as custom-kernels-builder
-
+# Build Transformers awq kernels
+FROM kernel-builder AS awq-kernels-builder
 WORKDIR /usr/src
+COPY server/Makefile-awq Makefile
+# Build the awq kernels
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
 
+# Build eetq kernels
+FROM kernel-builder AS eetq-kernels-builder
+WORKDIR /usr/src
+COPY server/Makefile-eetq Makefile
+# Build the eetq kernels
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+
+# Build marlin kernels
+FROM kernel-builder AS marlin-kernels-builder
+WORKDIR /usr/src
+COPY server/marlin/ .
+# Build the marlin kernels
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+
+# Build Lorax Punica kernels
+FROM kernel-builder AS lorax-punica-builder
+WORKDIR /usr/src
+COPY server/Makefile-lorax-punica Makefile
+# Build the Lorax Punica kernels
+RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
+
+# Build Transformers CUDA kernels
+FROM kernel-builder AS custom-kernels-builder
+WORKDIR /usr/src
 COPY server/custom_kernels/ .
-
 # Build specific version of transformers
 RUN python setup.py build
 
 # Build vllm CUDA kernels
-FROM kernel-builder as vllm-builder
+FROM kernel-builder AS vllm-builder
 
 WORKDIR /usr/src
 
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
 COPY server/Makefile-vllm Makefile
 
 # Build specific version of vllm
-RUN make build-vllm
+RUN make build-vllm-cuda
+
+# Build mamba kernels
+FROM kernel-builder AS mamba-builder
+WORKDIR /usr/src
+COPY server/Makefile-selective-scan Makefile
+RUN make build-all
 
 # Text Generation Inference base image
-FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
+FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base
 
 # Conda env
 ENV PATH=/opt/conda/bin:$PATH \
@@ -158,26 +195,41 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
         libssl-dev \
         ca-certificates \
         make \
+        curl \
+        git \
         && rm -rf /var/lib/apt/lists/*
 
 # Copy conda with PyTorch installed
 COPY --from=pytorch-install /opt/conda /opt/conda
 
 # Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 
 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
 
 # Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from eetq kernels builder
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# Copy build artifacts from marlin kernels builder
+COPY --from=marlin-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 
 # Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from mamba builder
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
 
 # Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
@@ -188,23 +240,27 @@ COPY server server
 COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install -r requirements.txt && \
-    pip install ".[bnb, accelerate, quantize]" --no-cache-dir
-
-# Install benchmarker
-COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
-# Install router
-COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
-# Install launcher
-COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
+    pip install -r requirements_cuda.txt && \
+    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
 
+# Deps before the binaries
+# The binaries change on every build given we burn the SHA into them
+# The deps change less often.
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
         build-essential \
         g++ \
         && rm -rf /var/lib/apt/lists/*
 
-# AWS Sagemaker compatbile image
-FROM base as sagemaker
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# AWS Sagemaker compatible image
+FROM base AS sagemaker
 
 COPY sagemaker-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
@@ -214,5 +270,8 @@ ENTRYPOINT ["./entrypoint.sh"]
 # Final image
 FROM base
 
-ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+# CMD ["--json-output"]
diff --git a/Dockerfile_amd b/Dockerfile_amd
new file mode 100644
index 00000000..0aebeee5
--- /dev/null
+++ b/Dockerfile_amd
@@ -0,0 +1,218 @@
+# Rust builder
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo build --profile release-opt
+
+# Text Generation Inference base image for ROCm
+FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update AS base
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    ccache \
+    curl \
+    git \
+    make \
+    libssl-dev \
+    g++ \
+    # Needed to build VLLM & flash.
+    rocthrust-dev \
+    hipsparse-dev \
+    hipblas-dev \
+    hipblaslt-dev \
+    rocblas-dev \
+    hiprand-dev \
+    rocrand-dev \
+    miopen-hip-dev \
+    hipfft-dev \
+    hipcub-dev \
+    hipsolver-dev \
+    rccl-dev \
+    cmake \
+    python3-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Keep in sync with `server/pyproject.toml`
+ARG MAMBA_VERSION=23.1.0-1
+ARG PYTORCH_VERSION='2.3.0'
+ARG ROCM_VERSION='6.0.2'
+ARG PYTHON_VERSION='3.10.10'
+# Automatically set by buildx
+ARG TARGETPLATFORM
+ENV PATH /opt/conda/bin:$PATH
+
+# TGI seems to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
+# Install mamba
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
+         *)              MAMBA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O  "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    mamba init && \
+    rm ~/mambaforge.sh
+
+# Install flash-attention, torch dependencies
+RUN pip install numpy einops ninja --no-cache-dir
+
+RUN conda install intel::mkl-static intel::mkl-include
+RUN pip uninstall -y triton && \
+    git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
+    cd triton/python && \
+    pip install .
+
+RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir
+
+ARG _GLIBCXX_USE_CXX11_ABI="1"
+ARG CMAKE_PREFIX_PATH="/opt/conda"
+ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+ARG BUILD_CAFFE2="0" \
+    BUILD_CAFFE2_OPS="0" \
+    USE_CUDA="0" \
+    USE_ROCM="1" \
+    BUILD_TEST="0" \
+    USE_FBGEMM="0" \
+    USE_NNPACK="0" \
+    USE_QNNPACK="0" \
+    USE_XNNPACK="0" \
+    USE_FLASH_ATTENTION="1" \
+    USE_MEM_EFF_ATTENTION="0"
+
+RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
+
+# Set as recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
+ENV HIP_FORCE_DEV_KERNARG=1
+
+# On MI250 and MI300, performance for flash with Triton FA is slightly better than CK.
+# However, Triton requires tuning for each prompt length, which is prohibitive.
+ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0
+
+FROM base AS kernel-builder
+
+# Build vllm kernels
+FROM kernel-builder AS vllm-builder
+WORKDIR /usr/src
+
+COPY server/Makefile-vllm Makefile
+
+# Build specific version of vllm
+RUN make build-vllm-rocm
+
+# Build Flash Attention v2 kernels
+FROM kernel-builder AS flash-att-v2-builder
+WORKDIR /usr/src
+
+COPY server/Makefile-flash-att-v2 Makefile
+
+# Build specific version of flash attention v2
+RUN make build-flash-attention-v2-rocm
+
+# Build Transformers CUDA kernels (gpt-neox and bloom)
+FROM kernel-builder AS custom-kernels-builder
+WORKDIR /usr/src
+COPY server/custom_kernels/ .
+RUN python setup.py build
+
+# Build exllama kernels
+FROM kernel-builder AS exllama-kernels-builder
+WORKDIR /usr/src
+COPY server/exllama_kernels/ .
+
+RUN python setup.py build
+
+# Build exllama v2 kernels
+FROM kernel-builder AS exllamav2-kernels-builder
+WORKDIR /usr/src
+COPY server/exllamav2_kernels/ .
+
+RUN python setup.py build
+
+FROM base AS base-copy
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+# Copy build artifacts from vllm builder
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from flash attention v2 builder
+COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from custom kernels builder
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from exllama kernels builder
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Copy build artifacts from exllamav2 kernels builder
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements_rocm.txt && \
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+# AWS Sagemaker compatible image (built on base-copy: that stage holds the server, kernels and TGI binaries; plain base does not)
+FROM base-copy AS sagemaker
+
+COPY sagemaker-entrypoint.sh entrypoint.sh
+RUN chmod +x entrypoint.sh
+
+ENTRYPOINT ["./entrypoint.sh"]
+
+# Final image
+FROM base-copy
+
+COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
+
+ENTRYPOINT ["/tgi-entrypoint.sh"]
+CMD ["--json-output"]
diff --git a/Dockerfile_intel b/Dockerfile_intel
new file mode 100644
index 00000000..6a803a32
--- /dev/null
+++ b/Dockerfile_intel
@@ -0,0 +1,177 @@
+ARG PLATFORM=xpu
+
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
+WORKDIR /usr/src
+
+ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
+
+FROM chef AS planner
+COPY Cargo.lock Cargo.lock
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo chef prepare --recipe-path recipe.json
+
+FROM chef AS builder
+
+RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
+    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
+    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
+    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
+    rm -f $PROTOC_ZIP
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+RUN cargo chef cook --profile release-opt --recipe-path recipe.json
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+COPY Cargo.toml Cargo.toml
+COPY rust-toolchain.toml rust-toolchain.toml
+COPY proto proto
+COPY benchmark benchmark
+COPY router router
+COPY launcher launcher
+RUN cargo build --profile release-opt
+
+
+# Text Generation Inference base image for Intel
+
+FROM intel/intel-extension-for-pytorch:2.1.30-xpu AS xpu
+
+USER root
+# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
+RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
+    dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
+
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+
+WORKDIR /usr/src
+RUN wget https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl && pip install torch-2.1.0.post1+cxx11.abi-cp310-cp310-linux_x86_64.whl
+RUN pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b distributed origin/dev/distributed
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements_intel.txt && \
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+
+ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
+ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
+ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
+ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
+ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
+ENV PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ENV CCL_ZE_IPC_EXCHANGE=sockets
+ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
+ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
+
+RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=OFF BUILD_WITH_CPU=OFF USE_XETLA=ON python setup.py install && rm -rf /usr/src/intel-extension-for-pytorch
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+
+# Text Generation Inference base image for Intel-cpu
+FROM ubuntu:22.04 AS cpu
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    curl \
+    ca-certificates \
+    make \
+    g++ \
+    git \
+    wget \
+    cmake
+
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PORT=80
+
+ARG MAMBA_VERSION=23.1.0-1
+ARG PYTHON_VERSION='3.10.10'
+# Automatically set by buildx
+ARG TARGETPLATFORM
+ENV PATH /opt/conda/bin:$PATH
+
+# TGI seems to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
+# Install mamba
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
+         *)              MAMBA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O  "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+RUN conda install -c conda-forge gperftools mkl
+
+RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
+RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
+RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
+RUN pip install triton
+
+WORKDIR /usr/src
+
+RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout eda7a7c42df6f9a64e0de9c2b69304ee02f2c32a
+
+RUN git clone https://github.com/intel/torch-ccl.git && cd torch-ccl && git checkout ccl_torch_dev_0131
+
+RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update --init --recursive && python setup.py install
+
+RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install .
+
+ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so:/opt/conda/lib/libiomp5.so
+ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
+ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
+ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
+ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib
+ENV KMP_BLOCKTIME=1
+ENV KMP_TPAUSE=0
+ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist
+ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist
+ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist
+
+# Install server
+COPY proto proto
+COPY server server
+COPY server/Makefile server/Makefile
+RUN cd server && \
+    make gen-server && \
+    pip install -r requirements_intel.txt && \
+    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+
+# Install benchmarker
+COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
+# Install router
+COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
+# Install launcher
+COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+
+FROM ${PLATFORM} AS final
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
diff --git a/LICENSE b/LICENSE
index 19a34fcf..7d0e8034 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,181 +1,201 @@
-Hugging Face Optimized Inference License 1.0 (HFOILv1.0)
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
 
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 
-This License Agreement governs the use of the Software and its Modifications. It is a
-binding agreement between the Licensor and You.
+   1. Definitions.
 
-This License Agreement shall be referred to as Hugging Face Optimized Inference License
-1.0 or HFOILv1.0. We may publish revised versions of this License Agreement from time to
-time. Each version will be given a distinguished number.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
 
-By downloading, accessing, modifying, distributing or otherwise using the Software, You
-consent to all of the terms and conditions below. So, if You do not agree with those,
-please do not download, access, modify, distribute, or use the Software.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
 
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
 
-1. PERMISSIONS
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
 
-You may use, modify and distribute the Software pursuant to the following terms and
-conditions:
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
 
-Copyright License. Subject to the terms and conditions of this License Agreement and where
-and as applicable, each Contributor hereby grants You a perpetual, worldwide,
-non-exclusive, royalty-free, copyright license to reproduce, prepare, publicly display,
-publicly perform, sublicense under the terms herein, and distribute the Software and
-Modifications of the Software.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
 
-Patent License. Subject to the terms and conditions of this License Agreement and where
-and as applicable, each Contributor hereby grants You a perpetual, worldwide,
-non-exclusive, royalty-free patent license to make, have made, Use, offer to sell, sell,
-import, and otherwise transfer the Software, where such license applies only to those
-patent claims licensable by such Contributor that are necessarily infringed by their
-Contribution(s) alone or by combination of their Contribution(s) with the Software to
-which such Contribution(s) was submitted. If You institute patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Software
-or a Contribution incorporated within the Software constitutes direct or contributory
-patent infringement, then any rights granted to You under this License Agreement for the
-Software shall terminate as of the date such litigation is filed.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
 
-No other rights. All rights not expressly granted herein are retained.
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
 
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
 
-2. RESTRICTIONS
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
 
-You may not distribute the Software as a hosted or managed, and paid service, where the
-service grants users access to any substantial set of the features or functionality of the
-Software. If you wish to do so, You will need to be granted additional rights from the
-Licensor which will be subject to a separate mutually agreed agreement.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
 
-You may not sublicense the Software under any other terms than those listed in this
-License.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
 
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
 
-3. OBLIGATIONS
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
 
-When You modify the Software, You agree to: - attach a notice stating the Modifications of
-the Software You made; and - attach a notice stating that the Modifications of the
-Software are released under this License Agreement.
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
 
-When You distribute the Software or Modifications of the Software, You agree to: - give
-any recipients of the Software a copy of this License Agreement; - retain all Explanatory
-Documentation; and if sharing the Modifications of the Software, add Explanatory
-Documentation documenting the changes made to create the Modifications of the Software; -
-retain all copyright, patent, trademark and attribution notices.
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
 
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
 
-4. MISCELLANEOUS
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
 
-Termination. Licensor reserves the right to restrict Use of the Software in violation of
-this License Agreement, upon which Your licenses will automatically terminate.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
 
-Contributions. Unless You explicitly state otherwise, any Contribution intentionally
-submitted for inclusion in the Software by You to the Licensor shall be under the terms
-and conditions of this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify the terms of any
-separate license agreement you may have executed with Licensor regarding such
-Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
 
-Trademarks and related. Nothing in this License Agreement permits You (i) to make Use of
-Licensors’ trademarks, trade names, or logos, (ii) otherwise suggest endorsement by
-Licensor, or (iii) misrepresent the relationship between the parties; and any rights not
-expressly granted herein are reserved by the Licensors.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
 
-Output You generate. Licensor claims no rights in the Output. You agree not to contravene
-any provision as stated in the License Agreement with your Use of the Output.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
 
-Disclaimer of Warranty. Except as expressly provided otherwise herein, and to the fullest
-extent permitted by law, Licensor provides the Software (and each Contributor provides its
-Contributions) AS IS, and Licensor disclaims all warranties or guarantees of any kind,
-express or implied, whether arising under any law or from any usage in trade, or otherwise
-including but not limited to the implied warranties of merchantability, non-infringement,
-quiet enjoyment, fitness for a particular purpose, or otherwise. You are solely
-responsible for determining the appropriateness of the Software and Modifications of the
-Software for your purposes (including your use or distribution of the Software and
-Modifications of the Software), and assume any risks associated with Your exercise of
-permissions under this License Agreement.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
 
-Limitation of Liability. In no event and under no legal theory, whether in tort (including
-negligence), contract, or otherwise, unless required by applicable law (such as deliberate
-and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to
-You for damages, including any direct, indirect, special, incidental, or consequential
-damages of any character arising as a result of this License Agreement or out of the Use
-or inability to Use the Software (including but not limited to damages for loss of
-goodwill, work stoppage, computer failure or malfunction, model failure or malfunction, or
-any and all other commercial damages or losses), even if such Contributor has been advised
-of the possibility of such damages.
+   END OF TERMS AND CONDITIONS
 
-Accepting Warranty or Additional Liability. While sharing the Software or Modifications of
-the Software thereof, You may choose to offer and charge a fee for, acceptance of support,
-warranty, indemnity, or other liability obligations and/or rights consistent with this
-License Agreement. However, in accepting such obligations, You may act only on Your own
-behalf and on Your sole responsibility, not on behalf of Licensor or any other
-Contributor, and you hereby agree to indemnify, defend, and hold Licensor and each other
-Contributor (and their successors or assigns) harmless for any liability incurred by, or
-claims asserted against, such Licensor or Contributor (and their successors or assigns) by
-reason of your accepting any such warranty or additional liability.
+   APPENDIX: How to apply the Apache License to your work.
 
-Severability. This License Agreement is a license of copyright and patent rights and an
-agreement in contract between You and the Licensor. If any provision of this License
-Agreement is held to be invalid, illegal or unenforceable, the remaining provisions shall
-be unaffected thereby and remain valid as if such provision had not been set forth herein.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
 
+   Copyright 2022 Hugging Face
 
-5. DEFINITIONS
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-“Contribution” refers to any work of authorship, including the original version of the
-Software and any Modifications of the Software that is intentionally submitted to Licensor
-for inclusion in the Software by the copyright owner or by an individual or entity
-authorized to submit on behalf of the copyright owner. For the purposes of this
-definition, “submitted” means any form of electronic, verbal, or written communication
-sent to the Licensor or its representatives, including but not limited to communication on
-electronic mailing lists, source code control systems, and issue tracking systems that are
-managed by, or on behalf of, the Licensor for the purpose of discussing and improving the
-Software, but excluding communication that is conspicuously marked or otherwise designated
-in writing by the copyright owner as “Not a Contribution.”
+       http://www.apache.org/licenses/LICENSE-2.0
 
-“Contributor” refers to Licensor and any individual or entity on behalf of whom a
-Contribution has been received by Licensor and subsequently incorporated within the
-Software.
-
-“Data” refers to a collection of information extracted from the dataset used with the
-Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not
-licensed under this License Agreement.
-
-“Explanatory Documentation” refers to any documentation or related information including
-but not limited to model cards or data cards dedicated to inform the public about the
-characteristics of the Software. Explanatory documentation is not licensed under this
-License.
-
-"License Agreement" refers to these terms and conditions.
-
-“Licensor” refers to the rights owners or entity authorized by the rights owners that are
-granting the terms and conditions of this License Agreement.
-
-“Model” refers to machine-learning based assemblies (including checkpoints), consisting of
-learnt weights and parameters (including optimizer states), corresponding to a model
-architecture as embodied in Software source code. Source code is not licensed under this
-License Agreement.
-
-“Modifications of the Software” refers to all changes to the Software, including without
-limitation derivative works of the Software.
-
-“Output” refers to the results of operating the Software.
-
-“Share” refers to any transmission, reproduction, publication or other sharing of the
-Software or Modifications of the Software to a third party, including providing the
-Softwaire as a hosted service made available by electronic or other remote means,
-including - but not limited to - API-based or web access.
-
-“Software” refers to the software and Model (or parts of either) that Licensor makes
-available under this License Agreement.
-
-“Third Parties” refers to individuals or legal entities that are not under common control
-with Licensor or You.
-
-“Use” refers to anything You or your representatives do with the Software, including but
-not limited to generating any Output, fine tuning, updating, running, training, evaluating
-and/or reparametrizing the Model.
-
-"You" (or "Your")  refers to an individual or Legal Entity exercising permissions granted
-by this License Agreement and/or making Use of the Software for whichever purpose and in
-any field of Use.
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/Makefile b/Makefile
index 7f534c7c..a1399b6d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,12 +1,8 @@
 install-server:
 	cd server && make install
 
-install-custom-kernels:
-	if [ "$$BUILD_EXTENSIONS" = "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need to set the BUILD_EXTENSIONS environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi
-
-install-integration-tests:
-	cd integration-tests && pip install -r requirements.txt
-	cd clients/python && pip install .
+install-server-cpu:
+	cd server && make install-server
 
 install-router:
 	cd router && cargo install --path .
@@ -17,7 +13,10 @@ install-launcher:
 install-benchmark:
 	cd benchmark && cargo install --path .
 
-install: install-server install-router install-launcher install-custom-kernels
+install: install-server install-router install-launcher
+
+
+install-cpu: install-server-cpu install-router install-launcher
 
 server-dev:
 	cd server && make run-dev
@@ -28,6 +27,10 @@ router-dev:
 rust-tests: install-router install-launcher
 	cargo test
 
+install-integration-tests:
+	cd integration-tests && pip install -r requirements.txt
+	cd clients/python && pip install .
+
 integration-tests: install-integration-tests
 	pytest -s -vv -m "not private" integration-tests
 
diff --git a/README.md b/README.md
index 739e656b..4c1c1e29 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 <div align="center">
 
-![image](https://github.com/huggingface/text-generation-inference/assets/3841370/38ba1531-ea0d-4851-b31a-a6d4ddc944b0)
+<a href="https://www.youtube.com/watch?v=jlMAX2Oaht0">
+  <img width=560 height=315 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
+</a>
 
 # Text Generation Inference
 
@@ -18,116 +20,84 @@ to power Hugging Chat, the Inference API and Inference Endpoint.
 
 ## Table of contents
 
-- [Features](#features)
-- [Optimized Architectures](#optimized-architectures)
 - [Get Started](#get-started)
-  - [Docker](#docker)
   - [API Documentation](#api-documentation)
   - [Using a private or gated model](#using-a-private-or-gated-model)
   - [A note on Shared Memory](#a-note-on-shared-memory-shm)
   - [Distributed Tracing](#distributed-tracing)
   - [Local Install](#local-install)
   - [CUDA Kernels](#cuda-kernels)
-- [Run Falcon](#run-falcon)
+- [Optimized architectures](#optimized-architectures)
+- [Run Mistral](#run-a-model)
   - [Run](#run)
   - [Quantization](#quantization)
 - [Develop](#develop)
 - [Testing](#testing)
-- [Other supported hardware](#other-supported-hardware)
 
-## Features
+Text Generation Inference (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and [more](https://huggingface.co/docs/text-generation-inference/supported_models). TGI implements many features, such as:
 
-- Serve the most popular Large Language Models with a simple launcher
+- Simple launcher to serve most popular LLMs
+- Production ready (distributed tracing with OpenTelemetry, Prometheus metrics)
 - Tensor Parallelism for faster inference on multiple GPUs
 - Token streaming using Server-Sent Events (SSE)
-- [Continuous batching of incoming requests](https://github.com/huggingface/text-generation-inference/tree/main/router) for increased total throughput
-- Optimized transformers code for inference using [flash-attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
-- Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323)
+- Continuous batching of incoming requests for increased total throughput
+- Optimized transformers code for inference using [Flash Attention](https://github.com/HazyResearch/flash-attention) and [Paged Attention](https://github.com/vllm-project/vllm) on the most popular architectures
+- Quantization with:
+  - [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
+  - [GPT-Q](https://arxiv.org/abs/2210.17323)
+  - [EETQ](https://github.com/NetEase-FuXi/EETQ)
+  - [AWQ](https://github.com/casper-hansen/AutoAWQ)
 - [Safetensors](https://github.com/huggingface/safetensors) weight loading
 - Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
 - Logits warper (temperature scaling, top-p, top-k, repetition penalty, more details see [transformers.LogitsProcessor](https://huggingface.co/docs/transformers/internal/generation_utils#transformers.LogitsProcessor))
 - Stop sequences
 - Log probabilities
-- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)
-- Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output.
-- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance.
+- [Speculation](https://huggingface.co/docs/text-generation-inference/conceptual/speculation) for ~2x latency improvement
+- [Guidance/JSON](https://huggingface.co/docs/text-generation-inference/conceptual/guidance). Specify output format to speed up inference and make sure the output is valid according to some specs.
+- Custom Prompt Generation: Easily generate text by providing custom prompts to guide the model's output
+- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance
+
+### Hardware support
+
+- [Nvidia](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference)
+- [AMD](https://github.com/huggingface/text-generation-inference/pkgs/container/text-generation-inference) (-rocm)
+- [Inferentia](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference)
+- [Intel GPU](https://github.com/huggingface/text-generation-inference/pull/1475)
+- [Gaudi](https://github.com/huggingface/tgi-gaudi)
+- [Google TPU](https://huggingface.co/docs/optimum-tpu/howto/serving)
 
 
-## Optimized architectures
-
-- [BLOOM](https://huggingface.co/bigscience/bloom)
-- [FLAN-T5](https://huggingface.co/google/flan-t5-xxl)
-- [Galactica](https://huggingface.co/facebook/galactica-120b)
-- [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
-- [Llama](https://github.com/facebookresearch/llama)
-- [OPT](https://huggingface.co/facebook/opt-66b)
-- [SantaCoder](https://huggingface.co/bigcode/santacoder)
-- [Starcoder](https://huggingface.co/bigcode/starcoder)
-- [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b)
-- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
-- [MPT](https://huggingface.co/mosaicml/mpt-30b)
-- [Llama V2](https://huggingface.co/meta-llama)
-
-Other architectures are supported on a best effort basis using:
-
-`AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
-
-or
-
-`AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`
-
-## Get started
+## Get Started
 
 ### Docker
 
-The easiest way of getting started is using the official Docker container:
+For a detailed starting guide, please see the [Quick Tour](https://huggingface.co/docs/text-generation-inference/quicktour). The easiest way of getting started is using the official Docker container:
 
 ```shell
-model=tiiuae/falcon-7b-instruct
-volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+model=HuggingFaceH4/zephyr-7b-beta
+# share a volume with the Docker container to avoid downloading weights every run
+volume=$PWD/data
 
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
-```
-**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar.
-
-To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
-```
-text-generation-launcher --help
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.1.1 --model-id $model
 ```
 
-You can then query the model using either the `/generate` or `/generate_stream` routes:
+And then you can make requests like:
 
-```shell
-curl 127.0.0.1:8080/generate \
-    -X POST \
-    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
-    -H 'Content-Type: application/json'
-```
-
-```shell
+```bash
 curl 127.0.0.1:8080/generate_stream \
     -X POST \
     -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
     -H 'Content-Type: application/json'
 ```
 
-or from Python:
+**Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`. Please note that CPU is not the intended platform for this project, so performance might be subpar.
 
-```shell
-pip install text-generation
+**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.1.1-rocm --model-id $model` instead of the command above.
+
+To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli):
 ```
-
-```python
-from text_generation import Client
-
-client = Client("http://127.0.0.1:8080")
-print(client.generate("What is Deep Learning?", max_new_tokens=20).generated_text)
-
-text = ""
-for response in client.generate_stream("What is Deep Learning?", max_new_tokens=20):
-    if not response.token.special:
-        text += response.token.text
-print(text)
+text-generation-launcher --help
 ```
 
 ### API documentation
@@ -137,14 +107,14 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat
 
 ### Using a private or gated model
 
-You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
+You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
 `text-generation-inference`. This allows you to gain access to protected resources.
 
 For example, if you want to serve the gated Llama V2 model variants:
 
 1. Go to https://huggingface.co/settings/tokens
 2. Copy your cli READ token
-3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
+3. Export `HF_TOKEN=<your cli READ token>`
 
 or with Docker:
 
@@ -153,7 +123,7 @@ model=meta-llama/Llama-2-7b-chat-hf
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 token=<your cli READ token>
 
-docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
+docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
 ```
 
 ### A note on Shared Memory (shm)
@@ -185,7 +155,12 @@ this will impact performance.
 ### Distributed Tracing
 
 `text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
-by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
+by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
+overridden with the `--otlp-service-name` argument.
+
+### Architecture
+
+![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)
 
 ### Local install
 
@@ -197,7 +172,7 @@ Python 3.9, e.g. using `conda`:
 ```shell
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 
-conda create -n text-generation-inference python=3.9
+conda create -n text-generation-inference python=3.11
 conda activate text-generation-inference
 ```
 
@@ -223,7 +198,7 @@ Then run:
 
 ```shell
 BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork with CUDA kernels
-make run-falcon-7b-instruct
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
 ```
 
 **Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
@@ -232,19 +207,26 @@ make run-falcon-7b-instruct
 sudo apt-get install libssl-dev gcc -y
 ```
 
-### CUDA Kernels
+## Optimized architectures
 
-The custom CUDA kernels are only tested on NVIDIA A100s. If you have any installation or runtime issues, you can remove
-the kernels by using the `DISABLE_CUSTOM_KERNELS=True` environment variable.
+TGI works out of the box, serving optimized implementations of all modern models. The supported models can be found in [this list](https://huggingface.co/docs/text-generation-inference/supported_models).
 
-Be aware that the official Docker image has them enabled by default.
+Other architectures are supported on a best-effort basis using:
 
-## Run Falcon
+`AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
+
+or
+
+`AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`
+
+
+
+## Run locally
 
 ### Run
 
 ```shell
-make run-falcon-7b-instruct
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
 ```
 
 ### Quantization
@@ -252,7 +234,7 @@ make run-falcon-7b-instruct
 You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:
 
 ```shell
-make run-falcon-7b-instruct-quantize
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
 ```
 
 4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.
@@ -277,10 +259,3 @@ make rust-tests
 # integration tests
 make integration-tests
 ```
-
-
-## Other supported hardware
-
-TGI is also supported on the following AI hardware accelerators:
-- *Habana first-gen Gaudi and Gaudi2:* checkout [here](https://github.com/huggingface/optimum-habana/tree/main/text-generation-inference) how to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
-
diff --git a/assets/architecture.jpg b/assets/architecture.jpg
deleted file mode 100644
index c4a511c9..00000000
Binary files a/assets/architecture.jpg and /dev/null differ
diff --git a/assets/architecture.png b/assets/architecture.png
new file mode 100644
index 00000000..1bcd1283
Binary files /dev/null and b/assets/architecture.png differ
diff --git a/assets/tgi_grafana.json b/assets/tgi_grafana.json
new file mode 100644
index 00000000..5f5a74ad
--- /dev/null
+++ b/assets/tgi_grafana.json
@@ -0,0 +1,3999 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS_EKS API INFERENCE PROD",
+      "label": "Prometheus EKS API Inference Prod",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    }
+  ],
+  "__elements": {},
+  "__requires": [
+    {
+      "type": "panel",
+      "id": "gauge",
+      "name": "Gauge",
+      "version": ""
+    },
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "10.0.2"
+    },
+    {
+      "type": "panel",
+      "id": "heatmap",
+      "name": "Heatmap",
+      "version": ""
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "timeseries",
+      "name": "Time series",
+      "version": ""
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "target": {
+          "limit": 100,
+          "matchAny": false,
+          "tags": [],
+          "type": "dashboard"
+        },
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 2,
+  "id": 551,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "fieldMinMax": false,
+          "mappings": [],
+          "min": 0,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1000
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 0,
+        "y": 0
+      },
+      "id": 49,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "mean"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showPercentChange": false,
+        "textMode": "auto",
+        "wideLayout": true
+      },
+      "pluginVersion": "10.4.2",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m]))) * 1000) > 0",
+          "hide": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m]))) * 1000) > 0",
+          "hide": true,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "C"
+        },
+        {
+          "datasource": {
+            "name": "Expression",
+            "type": "__expr__",
+            "uid": "__expr__"
+          },
+          "expression": "$B + $C",
+          "hide": false,
+          "refId": "D",
+          "type": "math"
+        }
+      ],
+      "title": "Time to first token",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "min": 0,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 8,
+        "x": 9,
+        "y": 0
+      },
+      "id": 44,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "mean"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showPercentChange": false,
+        "textMode": "auto",
+        "wideLayout": true
+      },
+      "pluginVersion": "10.4.2",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "(histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m]))) * 1000)>0",
+          "instant": false,
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Decode per-token latency",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "min": 0,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 7,
+        "w": 7,
+        "x": 17,
+        "y": 0
+      },
+      "id": 45,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "mean"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showPercentChange": false,
+        "textMode": "auto",
+        "wideLayout": true
+      },
+      "pluginVersion": "10.4.2",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "sum((rate(tgi_request_generated_tokens_sum{container=\"$service\"}[10m]) / rate(tgi_request_generated_tokens_count{container=\"$service\"}[10m]))>0)",
+          "instant": false,
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Throughput (generated tok/s)",
+      "type": "stat"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 7
+      },
+      "id": 48,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Number of tokens per prompt",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 7
+      },
+      "id": 30,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_generated_tokens_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Number of generated tokens per request",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 15
+      },
+      "id": 20,
+      "panels": [],
+      "title": "General",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 30,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 6,
+        "x": 0,
+        "y": 16
+      },
+      "id": 4,
+      "maxDataPoints": 100,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "sum(increase(tgi_request_success{container=\"$service\"}[1m]))",
+          "legendFormat": "Success",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "sum(increase(tgi_request_failure{container=\"$service\"}[1m])) by (err)",
+          "hide": false,
+          "legendFormat": "Error: {{err}}",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Requests",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 13,
+        "w": 9,
+        "x": 6,
+        "y": 16
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Mean Time Per Token quantiles",
+      "type": "timeseries"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#5794F2",
+        "colorScale": "linear",
+        "colorScheme": "interpolateSpectral",
+        "exponent": 0.5,
+        "min": 0,
+        "mode": "opacity"
+      },
+      "dataFormat": "tsbuckets",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 13,
+        "w": 9,
+        "x": 15,
+        "y": 16
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 13,
+      "legend": {
+        "show": false
+      },
+      "maxDataPoints": 25,
+      "options": {
+        "calculate": false,
+        "calculation": {},
+        "cellGap": 2,
+        "cellValues": {},
+        "color": {
+          "exponent": 0.5,
+          "fill": "#5794F2",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 128
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": false
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "showValue": "never",
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "decimals": 1,
+          "reverse": false,
+          "unit": "s"
+        }
+      },
+      "pluginVersion": "10.4.2",
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum(increase(tgi_request_mean_time_per_token_duration_bucket{container=\"$service\"}[5m])) by (le)",
+          "format": "heatmap",
+          "interval": "",
+          "legendFormat": "{{ le }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Mean Time Per Token",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "decimals": 1,
+        "format": "s",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "percentage",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 3,
+        "x": 0,
+        "y": 24
+      },
+      "id": 18,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": false
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "9.1.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "count(tgi_request_count{container=\"$service\"})",
+          "legendFormat": "Replicas",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Number of replicas",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "mappings": [],
+          "thresholds": {
+            "mode": "percentage",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "orange",
+                "value": 70
+              },
+              {
+                "color": "red",
+                "value": 85
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 3,
+        "x": 3,
+        "y": 24
+      },
+      "id": 32,
+      "options": {
+        "minVizHeight": 75,
+        "minVizWidth": 75,
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "showThresholdLabels": false,
+        "showThresholdMarkers": true,
+        "sizing": "auto"
+      },
+      "pluginVersion": "10.4.2",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "sum(tgi_queue_size{container=\"$service\"})",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Queue Size",
+      "type": "gauge"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 29
+      },
+      "id": 26,
+      "panels": [],
+      "title": "Batching",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "bars",
+            "fillOpacity": 50,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "normal"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 5,
+        "w": 6,
+        "x": 0,
+        "y": 30
+      },
+      "id": 29,
+      "maxDataPoints": 40,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": false
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "9.1.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "avg(tgi_batch_current_max_tokens{container=\"$service\"})",
+          "legendFormat": "{{ pod }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Max tokens per batch",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 4,
+        "x": 6,
+        "y": 30
+      },
+      "id": 33,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_skipped_tokens_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Speculated Tokens",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "none"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 5,
+        "x": 10,
+        "y": 30
+      },
+      "id": 46,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_input_length_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Prompt Tokens",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 9,
+        "x": 15,
+        "y": 30
+      },
+      "id": 8,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_duration_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Latency quantiles",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "bars",
+            "fillOpacity": 50,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "normal"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 35
+      },
+      "id": 27,
+      "maxDataPoints": 40,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": false
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "9.1.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "avg(tgi_batch_current_size{container=\"$service\"})",
+          "legendFormat": "{{ pod }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Batch Size",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 30,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 6,
+        "x": 0,
+        "y": 39
+      },
+      "id": 28,
+      "maxDataPoints": 100,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "sum(increase(tgi_batch_concat{container=\"$service\"}[1m])) by (reason)",
+          "hide": false,
+          "legendFormat": "Reason: {{ reason }}",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "Concatenates",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 9,
+        "x": 6,
+        "y": 39
+      },
+      "id": 31,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_request_queue_duration_bucket{container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Queue quantiles",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 48
+      },
+      "id": 22,
+      "panels": [],
+      "title": "Prefill",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 12,
+        "x": 0,
+        "y": 49
+      },
+      "id": 7,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Prefill quantiles",
+      "type": "timeseries"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#5794F2",
+        "colorScale": "linear",
+        "colorScheme": "interpolateSpectral",
+        "exponent": 0.5,
+        "min": 0,
+        "mode": "opacity"
+      },
+      "dataFormat": "tsbuckets",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 12,
+        "x": 12,
+        "y": 49
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 14,
+      "legend": {
+        "show": false
+      },
+      "maxDataPoints": 25,
+      "options": {
+        "calculate": false,
+        "calculation": {},
+        "cellGap": 2,
+        "cellValues": {},
+        "color": {
+          "exponent": 0.5,
+          "fill": "#5794F2",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 128
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": false
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "showValue": "never",
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "decimals": 1,
+          "reverse": false,
+          "unit": "s"
+        }
+      },
+      "pluginVersion": "10.4.2",
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"prefill\", container=\"$service\"}[5m])) by (le)",
+          "format": "heatmap",
+          "interval": "",
+          "legendFormat": "{{ le }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Prefill Latency",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "decimals": 1,
+        "format": "s",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 60
+      },
+      "id": 24,
+      "panels": [],
+      "title": "Decode",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 12,
+        "x": 0,
+        "y": 61
+      },
+      "id": 11,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Decode quantiles",
+      "type": "timeseries"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#5794F2",
+        "colorScale": "linear",
+        "colorScheme": "interpolateSpectral",
+        "exponent": 0.5,
+        "min": 0,
+        "mode": "opacity"
+      },
+      "dataFormat": "tsbuckets",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 12,
+        "x": 12,
+        "y": 61
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 15,
+      "legend": {
+        "show": false
+      },
+      "maxDataPoints": 25,
+      "options": {
+        "calculate": false,
+        "calculation": {},
+        "cellGap": 2,
+        "cellValues": {},
+        "color": {
+          "exponent": 0.5,
+          "fill": "#5794F2",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 128
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": false
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "showValue": "never",
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "decimals": 1,
+          "reverse": false,
+          "unit": "s"
+        }
+      },
+      "pluginVersion": "10.4.2",
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum(increase(tgi_batch_inference_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
+          "format": "heatmap",
+          "interval": "",
+          "legendFormat": "{{ le }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Decode Latency",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "decimals": 1,
+        "format": "s",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 72
+      },
+      "id": 43,
+      "panels": [],
+      "title": "Debug",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 6,
+        "x": 0,
+        "y": 73
+      },
+      "id": 38,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Forward quantiles",
+      "type": "timeseries"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#5794F2",
+        "colorScale": "linear",
+        "colorScheme": "interpolateSpectral",
+        "exponent": 0.5,
+        "min": 0,
+        "mode": "opacity"
+      },
+      "dataFormat": "tsbuckets",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 6,
+        "x": 6,
+        "y": 73
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 35,
+      "legend": {
+        "show": false
+      },
+      "maxDataPoints": 25,
+      "options": {
+        "calculate": false,
+        "calculation": {},
+        "cellGap": 2,
+        "cellValues": {},
+        "color": {
+          "exponent": 0.5,
+          "fill": "#5794F2",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 128
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": false
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "showValue": "never",
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "decimals": 1,
+          "reverse": false,
+          "unit": "s"
+        }
+      },
+      "pluginVersion": "10.4.2",
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum(increase(tgi_batch_forward_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
+          "format": "heatmap",
+          "interval": "",
+          "legendFormat": "{{ le }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Forward Latency",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "decimals": 1,
+        "format": "s",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 6,
+        "x": 12,
+        "y": 73
+      },
+      "id": 34,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Token Decode quantiles",
+      "type": "timeseries"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#5794F2",
+        "colorScale": "linear",
+        "colorScheme": "interpolateSpectral",
+        "exponent": 0.5,
+        "min": 0,
+        "mode": "opacity"
+      },
+      "dataFormat": "tsbuckets",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 6,
+        "x": 18,
+        "y": 73
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 40,
+      "legend": {
+        "show": false
+      },
+      "maxDataPoints": 25,
+      "options": {
+        "calculate": false,
+        "calculation": {},
+        "cellGap": 2,
+        "cellValues": {},
+        "color": {
+          "exponent": 0.5,
+          "fill": "#5794F2",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 128
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": false
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "showValue": "never",
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "decimals": 1,
+          "reverse": false,
+          "unit": "s"
+        }
+      },
+      "pluginVersion": "10.4.2",
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum(increase(tgi_batch_decode_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
+          "format": "heatmap",
+          "interval": "",
+          "legendFormat": "{{ le }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Token Decode Latency",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "decimals": 1,
+        "format": "s",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 6,
+        "x": 0,
+        "y": 84
+      },
+      "id": 42,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Filter Batch quantiles",
+      "type": "timeseries"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#5794F2",
+        "colorScale": "linear",
+        "colorScheme": "interpolateSpectral",
+        "exponent": 0.5,
+        "min": 0,
+        "mode": "opacity"
+      },
+      "dataFormat": "tsbuckets",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 6,
+        "x": 6,
+        "y": 84
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 39,
+      "legend": {
+        "show": false
+      },
+      "maxDataPoints": 25,
+      "options": {
+        "calculate": false,
+        "calculation": {},
+        "cellGap": 2,
+        "cellValues": {},
+        "color": {
+          "exponent": 0.5,
+          "fill": "#5794F2",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 128
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": false
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "showValue": "never",
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "decimals": 1,
+          "reverse": false,
+          "unit": "s"
+        }
+      },
+      "pluginVersion": "10.4.2",
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum(increase(tgi_batch_filter_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
+          "format": "heatmap",
+          "interval": "",
+          "legendFormat": "{{ le }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Filter Batch Latency",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "decimals": 1,
+        "format": "s",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p50"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "green",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p90"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "orange",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "p99"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "fixedColor": "red",
+                  "mode": "fixed"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 6,
+        "x": 12,
+        "y": 84
+      },
+      "id": 36,
+      "options": {
+        "legend": {
+          "calcs": [
+            "min",
+            "max"
+          ],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.5, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "legendFormat": "p50",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p90",
+          "range": true,
+          "refId": "B"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[10m])))",
+          "hide": false,
+          "legendFormat": "p99",
+          "range": true,
+          "refId": "C"
+        }
+      ],
+      "title": "Batch Concat quantiles",
+      "type": "timeseries"
+    },
+    {
+      "cards": {},
+      "color": {
+        "cardColor": "#5794F2",
+        "colorScale": "linear",
+        "colorScheme": "interpolateSpectral",
+        "exponent": 0.5,
+        "min": 0,
+        "mode": "opacity"
+      },
+      "dataFormat": "tsbuckets",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 11,
+        "w": 6,
+        "x": 18,
+        "y": 84
+      },
+      "heatmap": {},
+      "hideZeroBuckets": false,
+      "highlightCards": true,
+      "id": 41,
+      "legend": {
+        "show": false
+      },
+      "maxDataPoints": 25,
+      "options": {
+        "calculate": false,
+        "calculation": {},
+        "cellGap": 2,
+        "cellValues": {},
+        "color": {
+          "exponent": 0.5,
+          "fill": "#5794F2",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Spectral",
+          "steps": 128
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": false
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "showValue": "never",
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "decimals": 1,
+          "reverse": false,
+          "unit": "s"
+        }
+      },
+      "pluginVersion": "10.4.2",
+      "reverseYBuckets": false,
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+          },
+          "editorMode": "code",
+          "exemplar": true,
+          "expr": "sum(increase(tgi_batch_concat_duration_bucket{method=\"decode\", container=\"$service\"}[5m])) by (le)",
+          "format": "heatmap",
+          "interval": "",
+          "legendFormat": "{{ le }}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Batch Concat latency",
+      "tooltip": {
+        "show": true,
+        "showHistogram": false
+      },
+      "type": "heatmap",
+      "xAxis": {
+        "show": true
+      },
+      "yAxis": {
+        "decimals": 1,
+        "format": "s",
+        "logBase": 1,
+        "show": true
+      },
+      "yBucketBound": "auto"
+    }
+  ],
+  "refresh": "",
+  "schemaVersion": 39,
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": false,
+          "text": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1",
+          "value": "gpu-txt-gen-cohereforai-c4ai-command-r-plu-ba7f1"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${DS_PROMETHEUS_EKS API INFERENCE PROD}"
+        },
+        "definition": "label_values(tgi_request_count, container)",
+        "hide": 0,
+        "includeAll": false,
+        "multi": false,
+        "name": "service",
+        "options": [],
+        "query": {
+          "query": "label_values(tgi_request_count, container)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-30m",
+    "to": "now-30s"
+  },
+  "timepicker": {
+    "nowDelay": "30s"
+  },
+  "timezone": "",
+  "title": "Text Generation Inference",
+  "uid": "RHSk7EL4kdqsd",
+  "version": 12,
+  "weekStart": ""
+}
diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml
index 67e04f0a..756460e0 100644
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@@ -14,18 +14,18 @@ name = "text-generation-benchmark"
 path = "src/main.rs"
 
 [dependencies]
-average = "0.13"
-clap = { version = "4.1.4", features = ["derive", "env"] }
-crossterm = "0.26"
+average = "0.14"
+clap = { version = "4.4.5", features = ["derive", "env"] }
+crossterm = "0.27"
 float-ord = "0.3.2"
-serde = {version = "1.0.142", features = ["derive"]}
+serde = {version = "1.0.188", features = ["derive"]}
 serde_json = "1.0"
-tabled = "0.12.0"
+tabled = "0.14.0"
 text-generation-client = { path = "../router/client" }
-thiserror = "1.0.38"
-tokenizers = "0.13.3"
-tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
-tui = {package = "ratatui", version = "0.20", default-features = false, features = ["crossterm"]}
+thiserror = "1.0.48"
+tokenizers = { workspace = true }
+tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
+tui = {package = "ratatui", version = "0.23", default-features = false, features = ["crossterm"]}
 tracing = "0.1.37"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
-
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
+hf-hub = { workspace = true }
diff --git a/benchmark/README.md b/benchmark/README.md
index 7f51a731..17a02a30 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -6,12 +6,12 @@
 
 </div>
 
-A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha) 
+A lightweight benchmarking tool inspired by [oha](https://github.com/hatoo/oha)
 and powered by [tui](https://github.com/tui-rs-revival/ratatui).
 
-## Install 
+## Install
 
-```shell 
+```shell
 make install-benchmark
 ```
 
@@ -27,4 +27,4 @@ Then run the benchmarking tool:
 
 ```shell
 text-generation-benchmark --tokenizer-name bigscience/bloom-560m
-```
\ No newline at end of file
+```
diff --git a/benchmark/src/app.rs b/benchmark/src/app.rs
index 6a9881fb..a0a9313a 100644
--- a/benchmark/src/app.rs
+++ b/benchmark/src/app.rs
@@ -6,7 +6,7 @@ use tokio::sync::mpsc;
 use tui::backend::Backend;
 use tui::layout::{Alignment, Constraint, Direction, Layout};
 use tui::style::{Color, Modifier, Style};
-use tui::text::{Span, Spans};
+use tui::text::{Line, Span};
 use tui::widgets::{
     Axis, BarChart, Block, Borders, Chart, Dataset, Gauge, GraphType, Paragraph, Tabs,
 };
@@ -244,7 +244,7 @@ impl App {
             .batch_size
             .iter()
             .map(|b| {
-                Spans::from(vec![Span::styled(
+                Line::from(vec![Span::styled(
                     format!("Batch: {b}"),
                     Style::default().fg(Color::White),
                 )])
@@ -444,7 +444,7 @@ fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Ga
 }
 
 /// Throughput paragraph
-fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragraph<'a> {
+fn throughput_paragraph<'a>(throughput: &[f64], name: &'static str) -> Paragraph<'a> {
     // Throughput average/high/low texts
     let throughput_texts = statis_spans(throughput, "tokens/secs");
 
@@ -457,7 +457,7 @@ fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragr
 }
 
 /// Latency paragraph
-fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
+fn latency_paragraph<'a>(latency: &mut [f64], name: &'static str) -> Paragraph<'a> {
     // Latency average/high/low texts
     let mut latency_texts = statis_spans(latency, "ms");
 
@@ -466,9 +466,9 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
     let latency_percentiles = crate::utils::percentiles(latency, &[50, 90, 99]);
 
     // Latency p50/p90/p99 texts
-    let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
+    let colors = [Color::LightGreen, Color::LightYellow, Color::LightRed];
     for (i, (name, value)) in latency_percentiles.iter().enumerate() {
-        let span = Spans::from(vec![Span::styled(
+        let span = Line::from(vec![Span::styled(
             format!("{name}:     {value:.2} ms"),
             Style::default().fg(colors[i]),
         )]);
@@ -483,30 +483,30 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
 }
 
 /// Average/High/Low spans
-fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
+fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
     vec![
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
             format!(
                 "Average: {:.2} {unit}",
                 data.iter().sum::<f64>() / data.len() as f64
             ),
             Style::default().fg(Color::LightBlue),
         )]),
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
             format!(
                 "Lowest:  {:.2} {unit}",
                 data.iter()
                     .min_by(|a, b| a.total_cmp(b))
-                    .unwrap_or(&std::f64::NAN)
+                    .unwrap_or(&f64::NAN)
             ),
             Style::default().fg(Color::Reset),
         )]),
-        Spans::from(vec![Span::styled(
+        Line::from(vec![Span::styled(
             format!(
                 "Highest: {:.2} {unit}",
                 data.iter()
                     .max_by(|a, b| a.total_cmp(b))
-                    .unwrap_or(&std::f64::NAN)
+                    .unwrap_or(&f64::NAN)
             ),
             Style::default().fg(Color::Reset),
         )]),
@@ -543,7 +543,7 @@ fn latency_histogram<'a>(
 
 /// Latency/Throughput chart
 fn latency_throughput_chart<'a>(
-    latency_throughput: &'a Vec<(f64, f64)>,
+    latency_throughput: &'a [(f64, f64)],
     batch_sizes: &'a [u32],
     zoom: bool,
     name: &'static str,
@@ -555,17 +555,17 @@ fn latency_throughput_chart<'a>(
     let min_latency: f64 = *latency_iter
         .clone()
         .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     let max_latency: f64 = *latency_iter
         .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     let min_throughput: f64 = *throughput_iter
         .clone()
         .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     let max_throughput: f64 = *throughput_iter
         .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
 
     // Char min max values
     let min_x = if zoom {
diff --git a/benchmark/src/event.rs b/benchmark/src/event.rs
index 91ce8400..07482aed 100644
--- a/benchmark/src/event.rs
+++ b/benchmark/src/event.rs
@@ -11,7 +11,7 @@ pub(crate) enum Event {
     /// Key press.
     Key(event::KeyEvent),
     /// Terminal resize.
-    Resize(u16, u16),
+    Resize,
 }
 
 pub(crate) async fn terminal_event_task(
@@ -47,8 +47,8 @@ async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {
         if event::poll(Duration::from_secs(0)).expect("no events available") {
             match event::read().expect("unable to read event") {
                 event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()),
-                event::Event::Resize(w, h) => {
-                    event_sender.send(Event::Resize(w, h)).await.unwrap_or(())
+                event::Event::Resize(_w, _h) => {
+                    event_sender.send(Event::Resize).await.unwrap_or(())
                 }
                 _ => (),
             }
diff --git a/benchmark/src/generation.rs b/benchmark/src/generation.rs
index 67afa04e..5e739703 100644
--- a/benchmark/src/generation.rs
+++ b/benchmark/src/generation.rs
@@ -1,8 +1,9 @@
 use std::time::{Duration, Instant};
-use text_generation_client::{
-    Batch, CachedBatch, ClientError, NextTokenChooserParameters, Request, ShardedClient,
+use text_generation_client::v3::{
+    Batch, CachedBatch, NextTokenChooserParameters, Request, ShardedClient,
     StoppingCriteriaParameters,
 };
+use text_generation_client::{Chunk, ClientError, Input};
 use tokenizers::{Tokenizer, TruncationDirection};
 use tokio::sync::{broadcast, mpsc};
 
@@ -142,6 +143,9 @@ async fn prefill(
         .map(|id| Request {
             id: id.into(),
             prefill_logprobs: false,
+            input_chunks: Some(Input {
+                chunks: vec![Chunk::Text(sequence.clone()).into()],
+            }),
             inputs: sequence.clone(),
             truncate: sequence_length,
             parameters: Some(parameters.clone()),
@@ -151,6 +155,9 @@ async fn prefill(
                 ignore_eos_token: true, // Will not stop even if a eos token is generated
             }),
             top_n_tokens: top_n_tokens.unwrap_or(0),
+            blocks: vec![],
+            slots: vec![],
+            adapter_id: None,
         })
         .collect();
 
@@ -159,11 +166,12 @@ async fn prefill(
         requests,
         size: batch_size,
         max_tokens: batch_size * (sequence_length + decode_length),
+        max_blocks: 0,
     };
 
     // Run prefill
     let start_time = Instant::now();
-    let (_, decode_batch) = client.prefill(batch.clone()).await?;
+    let (_, decode_batch, _) = client.prefill(batch.clone()).await?;
 
     // Get latency
     let latency = start_time.elapsed();
diff --git a/benchmark/src/lib.rs b/benchmark/src/lib.rs
index 433c6f67..c33d64e6 100644
--- a/benchmark/src/lib.rs
+++ b/benchmark/src/lib.rs
@@ -8,7 +8,7 @@ use crate::app::App;
 use crate::event::Event;
 use crossterm::ExecutableCommand;
 use std::io;
-use text_generation_client::{NextTokenChooserParameters, ShardedClient};
+use text_generation_client::v3::{GrammarType, NextTokenChooserParameters, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::sync::{broadcast, mpsc};
 use tui::backend::CrosstermBackend;
@@ -30,10 +30,11 @@ pub async fn run(
     top_p: Option<f32>,
     typical_p: Option<f32>,
     repetition_penalty: Option<f32>,
+    frequency_penalty: Option<f32>,
     watermark: bool,
     do_sample: bool,
     client: ShardedClient,
-) -> Result<(), crossterm::ErrorKind> {
+) -> Result<(), std::io::Error> {
     let parameters = NextTokenChooserParameters {
         temperature: temperature.unwrap_or(1.0),
         top_k: top_k.unwrap_or(0),
@@ -42,7 +43,10 @@ pub async fn run(
         do_sample,
         seed: 0,
         repetition_penalty: repetition_penalty.unwrap_or(1.0),
+        frequency_penalty: frequency_penalty.unwrap_or(0.0),
         watermark,
+        grammar: String::new(),
+        grammar_type: GrammarType::None as i32,
     };
 
     // Initialize terminal properties
@@ -140,6 +144,7 @@ pub async fn run(
         top_p,
         typical_p,
         repetition_penalty,
+        frequency_penalty,
         watermark,
         do_sample,
     );
diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs
index 97c8af1c..2ee3d7c5 100644
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@@ -4,7 +4,7 @@
 /// and: https://github.com/orhun/rust-tui-template
 use clap::Parser;
 use std::path::Path;
-use text_generation_client::ShardedClient;
+use text_generation_client::v3::ShardedClient;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
@@ -84,6 +84,11 @@ struct Args {
     #[clap(long, env)]
     repetition_penalty: Option<f32>,
 
+    /// Generation parameter in case you want to specifically test/debug particular
+    /// decoding strategies, for full doc refer to the `text-generation-server`
+    #[clap(long, env)]
+    frequency_penalty: Option<f32>,
+
     /// Generation parameter in case you want to specifically test/debug particular
     /// decoding strategies, for full doc refer to the `text-generation-server`
     #[clap(long, env)]
@@ -119,6 +124,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         top_p,
         typical_p,
         repetition_penalty,
+        frequency_penalty,
         watermark,
         do_sample,
         master_shard_uds_path,
@@ -141,7 +147,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
             tracing::info!("Downloading tokenizer");
 
             // Parse Huggingface hub token
-            let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+            let auth_token = std::env::var("HF_TOKEN")
+                .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+                .ok();
 
             // Download and instantiate tokenizer
             // We need to download it outside of the Tokio runtime
@@ -187,6 +195,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                 top_p,
                 typical_p,
                 repetition_penalty,
+                frequency_penalty,
                 watermark,
                 do_sample,
                 sharded_client,
diff --git a/benchmark/src/table.rs b/benchmark/src/table.rs
index 9e36717b..1585a25f 100644
--- a/benchmark/src/table.rs
+++ b/benchmark/src/table.rs
@@ -15,6 +15,7 @@ pub(crate) fn parameters_table(
     top_p: Option<f32>,
     typical_p: Option<f32>,
     repetition_penalty: Option<f32>,
+    frequency_penalty: Option<f32>,
     watermark: bool,
     do_sample: bool,
 ) -> Table {
@@ -33,6 +34,7 @@ pub(crate) fn parameters_table(
     builder.push_record(["Top P", &format!("{top_p:?}")]);
     builder.push_record(["Typical P", &format!("{typical_p:?}")]);
     builder.push_record(["Repetition Penalty", &format!("{repetition_penalty:?}")]);
+    builder.push_record(["Frequency Penalty", &format!("{frequency_penalty:?}")]);
     builder.push_record(["Watermark", &watermark.to_string()]);
     builder.push_record(["Do Sample", &do_sample.to_string()]);
 
@@ -149,22 +151,22 @@ fn add_throuhgputs(
     }
 }
 
-fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) {
+fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
     let average = data.iter().sum::<f64>() / data.len() as f64;
     let min = data
         .iter()
         .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     let max = data
         .iter()
         .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
     (average, *min, *max)
 }
 
-fn px(data: &Vec<f64>, p: u32) -> f64 {
+fn px(data: &[f64], p: u32) -> f64 {
     let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
-    *data.get(i).unwrap_or(&std::f64::NAN)
+    *data.get(i).unwrap_or(&f64::NAN)
 }
 
 fn format_value(value: f64, unit: &'static str) -> String {
diff --git a/benchmark/src/utils.rs b/benchmark/src/utils.rs
index d096d655..20469991 100644
--- a/benchmark/src/utils.rs
+++ b/benchmark/src/utils.rs
@@ -37,7 +37,7 @@ pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<String, f
         .iter()
         .map(|&p| {
             let i = (f64::from(p) / 100.0 * values.len() as f64) as usize;
-            (format!("p{p}"), *values.get(i).unwrap_or(&std::f64::NAN))
+            (format!("p{p}"), *values.get(i).unwrap_or(&f64::NAN))
         })
         .collect()
 }
diff --git a/clients/python/.gitignore b/clients/python/.gitignore
index 5758ba92..5a8ecaa7 100644
--- a/clients/python/.gitignore
+++ b/clients/python/.gitignore
@@ -155,4 +155,4 @@ dmypy.json
 cython_debug/
 
 transformers
-safetensors
\ No newline at end of file
+safetensors
diff --git a/clients/python/Makefile b/clients/python/Makefile
index 8b4334bd..42720875 100644
--- a/clients/python/Makefile
+++ b/clients/python/Makefile
@@ -3,4 +3,4 @@ unit-tests:
 
 install:
 	pip install pip --upgrade
-	pip install -e .
\ No newline at end of file
+	pip install -e .
diff --git a/clients/python/README.md b/clients/python/README.md
index 4e0e564c..bf37508e 100644
--- a/clients/python/README.md
+++ b/clients/python/README.md
@@ -107,7 +107,19 @@ print(text)
 ### Types
 
 ```python
-# Request Parameters
+# enum for grammar type
+class GrammarType(Enum):
+    Json = "json"
+    Regex = "regex"
+
+
+# Grammar type and value
+class Grammar:
+    # Grammar type
+    type: GrammarType
+    # Grammar value
+    value: Union[str, dict]
+
 class Parameters:
     # Activate logits sampling
     do_sample: bool
@@ -116,6 +128,10 @@ class Parameters:
     # The parameter for repetition penalty. 1.0 means no penalty.
     # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
     repetition_penalty: Optional[float]
+    # The parameter for frequency penalty. 0.0 means no penalty.
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float]
     # Whether to prepend the prompt to the generated text
     return_full_text: bool
     # Stop generating tokens if a member of `stop_sequences` is generated
@@ -138,8 +154,22 @@ class Parameters:
     best_of: Optional[int]
     # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
     watermark: bool
+    # Get generation details
+    details: bool
     # Get decoder input token logprobs and ids
     decoder_input_details: bool
+    # Return the N most likely tokens at each step
+    top_n_tokens: Optional[int]
+    # grammar to use for generation
+    grammar: Optional[Grammar]
+
+class Request:
+    # Prompt
+    inputs: str
+    # Generation parameters
+    parameters: Optional[Parameters]
+    # Whether to stream output tokens
+    stream: bool
 
 # Decoder input tokens
 class InputToken:
@@ -159,7 +189,7 @@ class Token:
     # Token text
     text: str
     # Logprob
-    logprob: float
+    logprob: Optional[float]
     # Is the token a special token
     # Can be used to ignore tokens when concatenating
     special: bool
@@ -189,6 +219,8 @@ class BestOfSequence:
     prefill: List[InputToken]
     # Generated tokens
     tokens: List[Token]
+    # Most likely tokens
+    top_tokens: Optional[List[List[Token]]]
 
 
 # `generate` details
@@ -203,6 +235,8 @@ class Details:
     prefill: List[InputToken]
     # Generated tokens
     tokens: List[Token]
+    # Most likely tokens
+    top_tokens: Optional[List[List[Token]]]
     # Additional sequences when using the `best_of` parameter
     best_of_sequences: Optional[List[BestOfSequence]]
 
@@ -229,6 +263,8 @@ class StreamDetails:
 class StreamResponse:
     # Generated token
     token: Token
+    # Most likely tokens
+    top_tokens: Optional[List[Token]]
     # Complete generated text
     # Only available when the generation is finished
     generated_text: Optional[str]
@@ -240,4 +276,4 @@ class StreamResponse:
 class DeployedModel:
     model_id: str
     sha: str
-```
\ No newline at end of file
+```
diff --git a/clients/python/poetry.lock b/clients/python/poetry.lock
index e038ad9b..148d9906 100644
--- a/clients/python/poetry.lock
+++ b/clients/python/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -124,6 +124,20 @@ files = [
 [package.dependencies]
 frozenlist = ">=1.1.0"
 
+[[package]]
+name = "annotated-types"
+version = "0.5.0"
+description = "Reusable constraint types to use with typing.Annotated"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "annotated_types-0.5.0-py3-none-any.whl", hash = "sha256:58da39888f92c276ad970249761ebea80ba544b77acddaa1a4d6cf78287d45fd"},
+    {file = "annotated_types-0.5.0.tar.gz", hash = "sha256:47cdc3490d9ac1506ce92c7aaa76c579dc3509ff11e098fc867e5130ab7be802"},
+]
+
+[package.dependencies]
+typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""}
+
 [[package]]
 name = "async-timeout"
 version = "4.0.3"
@@ -693,55 +707,140 @@ files = [
 
 [[package]]
 name = "pydantic"
-version = "1.10.12"
-description = "Data validation and settings management using python type hints"
+version = "2.5.3"
+description = "Data validation using Python type hints"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pydantic-1.10.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1fcb59f2f355ec350073af41d927bf83a63b50e640f4dbaa01053a28b7a7718"},
-    {file = "pydantic-1.10.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7ccf02d7eb340b216ec33e53a3a629856afe1c6e0ef91d84a4e6f2fb2ca70fe"},
-    {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fb2aa3ab3728d950bcc885a2e9eff6c8fc40bc0b7bb434e555c215491bcf48b"},
-    {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:771735dc43cf8383959dc9b90aa281f0b6092321ca98677c5fb6125a6f56d58d"},
-    {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca48477862372ac3770969b9d75f1bf66131d386dba79506c46d75e6b48c1e09"},
-    {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a5e7add47a5b5a40c49b3036d464e3c7802f8ae0d1e66035ea16aa5b7a3923ed"},
-    {file = "pydantic-1.10.12-cp310-cp310-win_amd64.whl", hash = "sha256:e4129b528c6baa99a429f97ce733fff478ec955513630e61b49804b6cf9b224a"},
-    {file = "pydantic-1.10.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0d191db0f92dfcb1dec210ca244fdae5cbe918c6050b342d619c09d31eea0cc"},
-    {file = "pydantic-1.10.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:795e34e6cc065f8f498c89b894a3c6da294a936ee71e644e4bd44de048af1405"},
-    {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69328e15cfda2c392da4e713443c7dbffa1505bc9d566e71e55abe14c97ddc62"},
-    {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2031de0967c279df0d8a1c72b4ffc411ecd06bac607a212892757db7462fc494"},
-    {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ba5b2e6fe6ca2b7e013398bc7d7b170e21cce322d266ffcd57cca313e54fb246"},
-    {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a7bac939fa326db1ab741c9d7f44c565a1d1e80908b3797f7f81a4f86bc8d33"},
-    {file = "pydantic-1.10.12-cp311-cp311-win_amd64.whl", hash = "sha256:87afda5539d5140cb8ba9e8b8c8865cb5b1463924d38490d73d3ccfd80896b3f"},
-    {file = "pydantic-1.10.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:549a8e3d81df0a85226963611950b12d2d334f214436a19537b2efed61b7639a"},
-    {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:598da88dfa127b666852bef6d0d796573a8cf5009ffd62104094a4fe39599565"},
-    {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba5c4a8552bff16c61882db58544116d021d0b31ee7c66958d14cf386a5b5350"},
-    {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c79e6a11a07da7374f46970410b41d5e266f7f38f6a17a9c4823db80dadf4303"},
-    {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab26038b8375581dc832a63c948f261ae0aa21f1d34c1293469f135fa92972a5"},
-    {file = "pydantic-1.10.12-cp37-cp37m-win_amd64.whl", hash = "sha256:e0a16d274b588767602b7646fa05af2782576a6cf1022f4ba74cbb4db66f6ca8"},
-    {file = "pydantic-1.10.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a9dfa722316f4acf4460afdf5d41d5246a80e249c7ff475c43a3a1e9d75cf62"},
-    {file = "pydantic-1.10.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a73f489aebd0c2121ed974054cb2759af8a9f747de120acd2c3394cf84176ccb"},
-    {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b30bcb8cbfccfcf02acb8f1a261143fab622831d9c0989707e0e659f77a18e0"},
-    {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcfb5296d7877af406ba1547dfde9943b1256d8928732267e2653c26938cd9c"},
-    {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2f9a6fab5f82ada41d56b0602606a5506aab165ca54e52bc4545028382ef1c5d"},
-    {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dea7adcc33d5d105896401a1f37d56b47d443a2b2605ff8a969a0ed5543f7e33"},
-    {file = "pydantic-1.10.12-cp38-cp38-win_amd64.whl", hash = "sha256:1eb2085c13bce1612da8537b2d90f549c8cbb05c67e8f22854e201bde5d98a47"},
-    {file = "pydantic-1.10.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef6c96b2baa2100ec91a4b428f80d8f28a3c9e53568219b6c298c1125572ebc6"},
-    {file = "pydantic-1.10.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c076be61cd0177a8433c0adcb03475baf4ee91edf5a4e550161ad57fc90f523"},
-    {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d5a58feb9a39f481eda4d5ca220aa8b9d4f21a41274760b9bc66bfd72595b86"},
-    {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5f805d2d5d0a41633651a73fa4ecdd0b3d7a49de4ec3fadf062fe16501ddbf1"},
-    {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1289c180abd4bd4555bb927c42ee42abc3aee02b0fb2d1223fb7c6e5bef87dbe"},
-    {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d1197e462e0364906cbc19681605cb7c036f2475c899b6f296104ad42b9f5fb"},
-    {file = "pydantic-1.10.12-cp39-cp39-win_amd64.whl", hash = "sha256:fdbdd1d630195689f325c9ef1a12900524dceb503b00a987663ff4f58669b93d"},
-    {file = "pydantic-1.10.12-py3-none-any.whl", hash = "sha256:b749a43aa51e32839c9d71dc67eb1e4221bb04af1033a32e3923d46f9effa942"},
-    {file = "pydantic-1.10.12.tar.gz", hash = "sha256:0fe8a415cea8f340e7a9af9c54fc71a649b43e8ca3cc732986116b3cb135d303"},
+    {file = "pydantic-2.5.3-py3-none-any.whl", hash = "sha256:d0caf5954bee831b6bfe7e338c32b9e30c85dfe080c843680783ac2b631673b4"},
+    {file = "pydantic-2.5.3.tar.gz", hash = "sha256:b3ef57c62535b0941697cce638c08900d87fcb67e29cfa99e8a68f747f393f7a"},
 ]
 
 [package.dependencies]
-typing-extensions = ">=4.2.0"
+annotated-types = ">=0.4.0"
+importlib-metadata = {version = "*", markers = "python_version == \"3.7\""}
+pydantic-core = "2.14.6"
+typing-extensions = ">=4.6.1"
 
 [package.extras]
-dotenv = ["python-dotenv (>=0.10.4)"]
-email = ["email-validator (>=1.0.3)"]
+email = ["email-validator (>=2.0.0)"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.14.6"
+description = ""
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pydantic_core-2.14.6-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:72f9a942d739f09cd42fffe5dc759928217649f070056f03c70df14f5770acf9"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6a31d98c0d69776c2576dda4b77b8e0c69ad08e8b539c25c7d0ca0dc19a50d6c"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5aa90562bc079c6c290f0512b21768967f9968e4cfea84ea4ff5af5d917016e4"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:370ffecb5316ed23b667d99ce4debe53ea664b99cc37bfa2af47bc769056d534"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f85f3843bdb1fe80e8c206fe6eed7a1caeae897e496542cee499c374a85c6e08"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9862bf828112e19685b76ca499b379338fd4c5c269d897e218b2ae8fcb80139d"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:036137b5ad0cb0004c75b579445a1efccd072387a36c7f217bb8efd1afbe5245"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:92879bce89f91f4b2416eba4429c7b5ca22c45ef4a499c39f0c5c69257522c7c"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0c08de15d50fa190d577e8591f0329a643eeaed696d7771760295998aca6bc66"},
+    {file = "pydantic_core-2.14.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:36099c69f6b14fc2c49d7996cbf4f87ec4f0e66d1c74aa05228583225a07b590"},
+    {file = "pydantic_core-2.14.6-cp310-none-win32.whl", hash = "sha256:7be719e4d2ae6c314f72844ba9d69e38dff342bc360379f7c8537c48e23034b7"},
+    {file = "pydantic_core-2.14.6-cp310-none-win_amd64.whl", hash = "sha256:36fa402dcdc8ea7f1b0ddcf0df4254cc6b2e08f8cd80e7010d4c4ae6e86b2a87"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:dea7fcd62915fb150cdc373212141a30037e11b761fbced340e9db3379b892d4"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffff855100bc066ff2cd3aa4a60bc9534661816b110f0243e59503ec2df38421"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b027c86c66b8627eb90e57aee1f526df77dc6d8b354ec498be9a757d513b92b"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:00b1087dabcee0b0ffd104f9f53d7d3eaddfaa314cdd6726143af6bc713aa27e"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:75ec284328b60a4e91010c1acade0c30584f28a1f345bc8f72fe8b9e46ec6a96"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e1f4744eea1501404b20b0ac059ff7e3f96a97d3e3f48ce27a139e053bb370b"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2602177668f89b38b9f84b7b3435d0a72511ddef45dc14446811759b82235a1"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6c8edaea3089bf908dd27da8f5d9e395c5b4dc092dbcce9b65e7156099b4b937"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:478e9e7b360dfec451daafe286998d4a1eeaecf6d69c427b834ae771cad4b622"},
+    {file = "pydantic_core-2.14.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b6ca36c12a5120bad343eef193cc0122928c5c7466121da7c20f41160ba00ba2"},
+    {file = "pydantic_core-2.14.6-cp311-none-win32.whl", hash = "sha256:2b8719037e570639e6b665a4050add43134d80b687288ba3ade18b22bbb29dd2"},
+    {file = "pydantic_core-2.14.6-cp311-none-win_amd64.whl", hash = "sha256:78ee52ecc088c61cce32b2d30a826f929e1708f7b9247dc3b921aec367dc1b23"},
+    {file = "pydantic_core-2.14.6-cp311-none-win_arm64.whl", hash = "sha256:a19b794f8fe6569472ff77602437ec4430f9b2b9ec7a1105cfd2232f9ba355e6"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:667aa2eac9cd0700af1ddb38b7b1ef246d8cf94c85637cbb03d7757ca4c3fdec"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cdee837710ef6b56ebd20245b83799fce40b265b3b406e51e8ccc5b85b9099b7"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c5bcf3414367e29f83fd66f7de64509a8fd2368b1edf4351e862910727d3e51"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:26a92ae76f75d1915806b77cf459811e772d8f71fd1e4339c99750f0e7f6324f"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a983cca5ed1dd9a35e9e42ebf9f278d344603bfcb174ff99a5815f953925140a"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cb92f9061657287eded380d7dc455bbf115430b3aa4741bdc662d02977e7d0af"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4ace1e220b078c8e48e82c081e35002038657e4b37d403ce940fa679e57113b"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef633add81832f4b56d3b4c9408b43d530dfca29e68fb1b797dcb861a2c734cd"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e90d6cc4aad2cc1f5e16ed56e46cebf4877c62403a311af20459c15da76fd91"},
+    {file = "pydantic_core-2.14.6-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e8a5ac97ea521d7bde7621d86c30e86b798cdecd985723c4ed737a2aa9e77d0c"},
+    {file = "pydantic_core-2.14.6-cp312-none-win32.whl", hash = "sha256:f27207e8ca3e5e021e2402ba942e5b4c629718e665c81b8b306f3c8b1ddbb786"},
+    {file = "pydantic_core-2.14.6-cp312-none-win_amd64.whl", hash = "sha256:b3e5fe4538001bb82e2295b8d2a39356a84694c97cb73a566dc36328b9f83b40"},
+    {file = "pydantic_core-2.14.6-cp312-none-win_arm64.whl", hash = "sha256:64634ccf9d671c6be242a664a33c4acf12882670b09b3f163cd00a24cffbd74e"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:24368e31be2c88bd69340fbfe741b405302993242ccb476c5c3ff48aeee1afe0"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e33b0834f1cf779aa839975f9d8755a7c2420510c0fa1e9fa0497de77cd35d2c"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6af4b3f52cc65f8a0bc8b1cd9676f8c21ef3e9132f21fed250f6958bd7223bed"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d15687d7d7f40333bd8266f3814c591c2e2cd263fa2116e314f60d82086e353a"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:095b707bb287bfd534044166ab767bec70a9bba3175dcdc3371782175c14e43c"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94fc0e6621e07d1e91c44e016cc0b189b48db053061cc22d6298a611de8071bb"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ce830e480f6774608dedfd4a90c42aac4a7af0a711f1b52f807130c2e434c06"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a306cdd2ad3a7d795d8e617a58c3a2ed0f76c8496fb7621b6cd514eb1532cae8"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:2f5fa187bde8524b1e37ba894db13aadd64faa884657473b03a019f625cee9a8"},
+    {file = "pydantic_core-2.14.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:438027a975cc213a47c5d70672e0d29776082155cfae540c4e225716586be75e"},
+    {file = "pydantic_core-2.14.6-cp37-none-win32.whl", hash = "sha256:f96ae96a060a8072ceff4cfde89d261837b4294a4f28b84a28765470d502ccc6"},
+    {file = "pydantic_core-2.14.6-cp37-none-win_amd64.whl", hash = "sha256:e646c0e282e960345314f42f2cea5e0b5f56938c093541ea6dbf11aec2862391"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:db453f2da3f59a348f514cfbfeb042393b68720787bbef2b4c6068ea362c8149"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3860c62057acd95cc84044e758e47b18dcd8871a328ebc8ccdefd18b0d26a21b"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36026d8f99c58d7044413e1b819a67ca0e0b8ebe0f25e775e6c3d1fabb3c38fb"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8ed1af8692bd8d2a29d702f1a2e6065416d76897d726e45a1775b1444f5928a7"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:314ccc4264ce7d854941231cf71b592e30d8d368a71e50197c905874feacc8a8"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:982487f8931067a32e72d40ab6b47b1628a9c5d344be7f1a4e668fb462d2da42"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dbe357bc4ddda078f79d2a36fc1dd0494a7f2fad83a0a684465b6f24b46fe80"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2f6ffc6701a0eb28648c845f4945a194dc7ab3c651f535b81793251e1185ac3d"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7f5025db12fc6de7bc1104d826d5aee1d172f9ba6ca936bf6474c2148ac336c1"},
+    {file = "pydantic_core-2.14.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dab03ed811ed1c71d700ed08bde8431cf429bbe59e423394f0f4055f1ca0ea60"},
+    {file = "pydantic_core-2.14.6-cp38-none-win32.whl", hash = "sha256:dfcbebdb3c4b6f739a91769aea5ed615023f3c88cb70df812849aef634c25fbe"},
+    {file = "pydantic_core-2.14.6-cp38-none-win_amd64.whl", hash = "sha256:99b14dbea2fdb563d8b5a57c9badfcd72083f6006caf8e126b491519c7d64ca8"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:4ce8299b481bcb68e5c82002b96e411796b844d72b3e92a3fbedfe8e19813eab"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b9a9d92f10772d2a181b5ca339dee066ab7d1c9a34ae2421b2a52556e719756f"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd9e98b408384989ea4ab60206b8e100d8687da18b5c813c11e92fd8212a98e0"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4f86f1f318e56f5cbb282fe61eb84767aee743ebe32c7c0834690ebea50c0a6b"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86ce5fcfc3accf3a07a729779d0b86c5d0309a4764c897d86c11089be61da160"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dcf1978be02153c6a31692d4fbcc2a3f1db9da36039ead23173bc256ee3b91b"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eedf97be7bc3dbc8addcef4142f4b4164066df0c6f36397ae4aaed3eb187d8ab"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5f916acf8afbcab6bacbb376ba7dc61f845367901ecd5e328fc4d4aef2fcab0"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8a14c192c1d724c3acbfb3f10a958c55a2638391319ce8078cb36c02283959b9"},
+    {file = "pydantic_core-2.14.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0348b1dc6b76041516e8a854ff95b21c55f5a411c3297d2ca52f5528e49d8411"},
+    {file = "pydantic_core-2.14.6-cp39-none-win32.whl", hash = "sha256:de2a0645a923ba57c5527497daf8ec5df69c6eadf869e9cd46e86349146e5975"},
+    {file = "pydantic_core-2.14.6-cp39-none-win_amd64.whl", hash = "sha256:aca48506a9c20f68ee61c87f2008f81f8ee99f8d7f0104bff3c47e2d148f89d9"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d5c28525c19f5bb1e09511669bb57353d22b94cf8b65f3a8d141c389a55dec95"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:78d0768ee59baa3de0f4adac9e3748b4b1fffc52143caebddfd5ea2961595277"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b93785eadaef932e4fe9c6e12ba67beb1b3f1e5495631419c784ab87e975670"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a874f21f87c485310944b2b2734cd6d318765bcbb7515eead33af9641816506e"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89f4477d915ea43b4ceea6756f63f0288941b6443a2b28c69004fe07fde0d0d"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:172de779e2a153d36ee690dbc49c6db568d7b33b18dc56b69a7514aecbcf380d"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:dfcebb950aa7e667ec226a442722134539e77c575f6cfaa423f24371bb8d2e94"},
+    {file = "pydantic_core-2.14.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:55a23dcd98c858c0db44fc5c04fc7ed81c4b4d33c653a7c45ddaebf6563a2f66"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:4241204e4b36ab5ae466ecec5c4c16527a054c69f99bba20f6f75232a6a534e2"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e574de99d735b3fc8364cba9912c2bec2da78775eba95cbb225ef7dda6acea24"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1302a54f87b5cd8528e4d6d1bf2133b6aa7c6122ff8e9dc5220fbc1e07bffebd"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8e81e4b55930e5ffab4a68db1af431629cf2e4066dbdbfef65348b8ab804ea8"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c99462ffc538717b3e60151dfaf91125f637e801f5ab008f81c402f1dff0cd0f"},
+    {file = "pydantic_core-2.14.6-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e4cf2d5829f6963a5483ec01578ee76d329eb5caf330ecd05b3edd697e7d768a"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:cf10b7d58ae4a1f07fccbf4a0a956d705356fea05fb4c70608bb6fa81d103cda"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:399ac0891c284fa8eb998bcfa323f2234858f5d2efca3950ae58c8f88830f145"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c6a5c79b28003543db3ba67d1df336f253a87d3112dac3a51b94f7d48e4c0e1"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:599c87d79cab2a6a2a9df4aefe0455e61e7d2aeede2f8577c1b7c0aec643ee8e"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43e166ad47ba900f2542a80d83f9fc65fe99eb63ceec4debec160ae729824052"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3a0b5db001b98e1c649dd55afa928e75aa4087e587b9524a4992316fa23c9fba"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:747265448cb57a9f37572a488a57d873fd96bf51e5bb7edb52cfb37124516da4"},
+    {file = "pydantic_core-2.14.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:7ebe3416785f65c28f4f9441e916bfc8a54179c8dea73c23023f7086fa601c5d"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:86c963186ca5e50d5c8287b1d1c9d3f8f024cbe343d048c5bd282aec2d8641f2"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:e0641b506486f0b4cd1500a2a65740243e8670a2549bb02bc4556a83af84ae03"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71d72ca5eaaa8d38c8df16b7deb1a2da4f650c41b58bb142f3fb75d5ad4a611f"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27e524624eace5c59af499cd97dc18bb201dc6a7a2da24bfc66ef151c69a5f2a"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3dde6cac75e0b0902778978d3b1646ca9f438654395a362cb21d9ad34b24acf"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:00646784f6cd993b1e1c0e7b0fdcbccc375d539db95555477771c27555e3c556"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:23598acb8ccaa3d1d875ef3b35cb6376535095e9405d91a3d57a8c7db5d29341"},
+    {file = "pydantic_core-2.14.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7f41533d7e3cf9520065f610b41ac1c76bc2161415955fbcead4981b22c7611e"},
+    {file = "pydantic_core-2.14.6.tar.gz", hash = "sha256:1fd0c1d395372843fba13a51c28e3bb9d59bd7aebfeb17358ffaaa1e4dbbe948"},
+]
+
+[package.dependencies]
+typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
 
 [[package]]
 name = "pytest"
@@ -816,6 +915,7 @@ files = [
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+    {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
     {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
     {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
     {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
@@ -823,8 +923,16 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+    {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
     {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
     {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
     {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
     {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
@@ -841,6 +949,7 @@ files = [
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+    {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
     {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
     {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
     {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
@@ -848,6 +957,7 @@ files = [
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
     {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
     {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
     {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
@@ -929,13 +1039,13 @@ files = [
 
 [[package]]
 name = "urllib3"
-version = "2.0.4"
+version = "2.0.5"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "urllib3-2.0.4-py3-none-any.whl", hash = "sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4"},
-    {file = "urllib3-2.0.4.tar.gz", hash = "sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11"},
+    {file = "urllib3-2.0.5-py3-none-any.whl", hash = "sha256:ef16afa8ba34a1f989db38e1dbbe0c302e4289a47856990d0682e374563ce35e"},
+    {file = "urllib3-2.0.5.tar.gz", hash = "sha256:13abf37382ea2ce6fb744d4dad67838eec857c9f4f57009891805e0b5e123594"},
 ]
 
 [package.extras]
@@ -1050,4 +1160,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.7"
-content-hash = "0db2f97d52c557dd7f90c55b4ad5bbe308c957c5f7f99fec53c57e0a13822cb4"
+content-hash = "b7fab8703967f2616ea59a98a437cd30f97f0c8d2a06e399d688814a2a2c64f8"
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index a52bdd81..2925085b 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation"
-version = "0.6.0"
+version = "0.7.0"
 description = "Hugging Face Text Generation Python Client"
 license = "Apache-2.0"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
@@ -12,7 +12,7 @@ repository = "https://github.com/huggingface/text-generation-inference"
 
 [tool.poetry.dependencies]
 python = "^3.7"
-pydantic = "^1.10"
+pydantic = "> 2, < 3"
 aiohttp = "^3.8"
 huggingface-hub = ">= 0.12, < 1.0"
 
diff --git a/clients/python/tests/conftest.py b/clients/python/tests/conftest.py
index 48734f0d..17bb73b5 100644
--- a/clients/python/tests/conftest.py
+++ b/clients/python/tests/conftest.py
@@ -9,6 +9,11 @@ def flan_t5_xxl():
     return "google/flan-t5-xxl"
 
 
+@pytest.fixture
+def llama_7b():
+    return "meta-llama/Llama-2-7b-chat-hf"  # model id that `llama_7b_url` appends to `base_url`
+
+
 @pytest.fixture
 def fake_model():
     return "fake/model"
@@ -34,6 +39,11 @@ def flan_t5_xxl_url(base_url, flan_t5_xxl):
     return f"{base_url}/{flan_t5_xxl}"
 
 
+@pytest.fixture
+def llama_7b_url(base_url, llama_7b):
+    return f"{base_url}/{llama_7b}"  # `base_url` and the `llama_7b` id joined with "/"
+
+
 @pytest.fixture
 def fake_url(base_url, fake_model):
     return f"{base_url}/{fake_model}"
diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py
index 1e25e1b1..8aed865b 100644
--- a/clients/python/tests/test_client.py
+++ b/clients/python/tests/test_client.py
@@ -5,24 +5,24 @@ from text_generation.errors import NotFoundError, ValidationError
 from text_generation.types import FinishReason, InputToken
 
 
-def test_generate(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     response = client.generate("test", max_new_tokens=1, decoder_input_details=True)
 
-    assert response.generated_text == ""
+    assert response.generated_text == "_"
     assert response.details.finish_reason == FinishReason.Length
     assert response.details.generated_tokens == 1
     assert response.details.seed is None
-    assert len(response.details.prefill) == 1
-    assert response.details.prefill[0] == InputToken(id=0, text="<pad>", logprob=None)
+    assert len(response.details.prefill) == 2
+    assert response.details.prefill[0] == InputToken(id=1, text="<s>", logprob=None)
     assert len(response.details.tokens) == 1
-    assert response.details.tokens[0].id == 3
-    assert response.details.tokens[0].text == " "
+    assert response.details.tokens[0].id == 29918
+    assert response.details.tokens[0].text == "_"
     assert not response.details.tokens[0].special
 
 
-def test_generate_best_of(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_best_of(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     response = client.generate(
         "test", max_new_tokens=1, best_of=2, do_sample=True, decoder_input_details=True
     )
@@ -39,14 +39,14 @@ def test_generate_not_found(fake_url, hf_headers):
         client.generate("test")
 
 
-def test_generate_validation_error(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_validation_error(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     with pytest.raises(ValidationError):
         client.generate("test", max_new_tokens=10_000)
 
 
-def test_generate_stream(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_stream(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     responses = [
         response for response in client.generate_stream("test", max_new_tokens=1)
     ]
@@ -54,7 +54,7 @@ def test_generate_stream(flan_t5_xxl_url, hf_headers):
     assert len(responses) == 1
     response = responses[0]
 
-    assert response.generated_text == ""
+    assert response.generated_text == "_"
     assert response.details.finish_reason == FinishReason.Length
     assert response.details.generated_tokens == 1
     assert response.details.seed is None
@@ -66,34 +66,37 @@ def test_generate_stream_not_found(fake_url, hf_headers):
         list(client.generate_stream("test"))
 
 
-def test_generate_stream_validation_error(flan_t5_xxl_url, hf_headers):
-    client = Client(flan_t5_xxl_url, hf_headers)
+def test_generate_stream_validation_error(llama_7b_url, hf_headers):
+    client = Client(llama_7b_url, hf_headers)
     with pytest.raises(ValidationError):
         list(client.generate_stream("test", max_new_tokens=10_000))
 
 
 @pytest.mark.asyncio
-async def test_generate_async(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     response = await client.generate(
         "test", max_new_tokens=1, decoder_input_details=True
     )
 
-    assert response.generated_text == ""
+    assert response.generated_text == "_"
     assert response.details.finish_reason == FinishReason.Length
     assert response.details.generated_tokens == 1
     assert response.details.seed is None
-    assert len(response.details.prefill) == 1
-    assert response.details.prefill[0] == InputToken(id=0, text="<pad>", logprob=None)
+    assert len(response.details.prefill) == 2
+    assert response.details.prefill[0] == InputToken(id=1, text="<s>", logprob=None)
+    assert response.details.prefill[1] == InputToken(
+        id=1243, text="test", logprob=-10.96875
+    )
     assert len(response.details.tokens) == 1
-    assert response.details.tokens[0].id == 3
-    assert response.details.tokens[0].text == " "
+    assert response.details.tokens[0].id == 29918
+    assert response.details.tokens[0].text == "_"
     assert not response.details.tokens[0].special
 
 
 @pytest.mark.asyncio
-async def test_generate_async_best_of(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async_best_of(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     response = await client.generate(
         "test", max_new_tokens=1, best_of=2, do_sample=True, decoder_input_details=True
     )
@@ -112,15 +115,15 @@ async def test_generate_async_not_found(fake_url, hf_headers):
 
 
 @pytest.mark.asyncio
-async def test_generate_async_validation_error(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_async_validation_error(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     with pytest.raises(ValidationError):
         await client.generate("test", max_new_tokens=10_000)
 
 
 @pytest.mark.asyncio
-async def test_generate_stream_async(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_stream_async(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     responses = [
         response async for response in client.generate_stream("test", max_new_tokens=1)
     ]
@@ -128,7 +131,7 @@ async def test_generate_stream_async(flan_t5_xxl_url, hf_headers):
     assert len(responses) == 1
     response = responses[0]
 
-    assert response.generated_text == ""
+    assert response.generated_text == "_"
     assert response.details.finish_reason == FinishReason.Length
     assert response.details.generated_tokens == 1
     assert response.details.seed is None
@@ -143,8 +146,8 @@ async def test_generate_stream_async_not_found(fake_url, hf_headers):
 
 
 @pytest.mark.asyncio
-async def test_generate_stream_async_validation_error(flan_t5_xxl_url, hf_headers):
-    client = AsyncClient(flan_t5_xxl_url, hf_headers)
+async def test_generate_stream_async_validation_error(llama_7b_url, hf_headers):
+    client = AsyncClient(llama_7b_url, hf_headers)
     with pytest.raises(ValidationError):
         async for _ in client.generate_stream("test", max_new_tokens=10_000):
             pass
diff --git a/clients/python/text_generation/__init__.py b/clients/python/text_generation/__init__.py
index 46109833..d7a09c9e 100644
--- a/clients/python/text_generation/__init__.py
+++ b/clients/python/text_generation/__init__.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.3.0"
+__version__ = "0.7.0"
+
+DEPRECATION_WARNING = (
+    "`text_generation` clients are deprecated and will be removed in the near future. "
+    "Please use the `InferenceClient` from the `huggingface_hub` package instead."
+)
 
 from text_generation.client import Client, AsyncClient
 from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient
diff --git a/clients/python/text_generation/client.py b/clients/python/text_generation/client.py
index 015613c2..12966747 100644
--- a/clients/python/text_generation/client.py
+++ b/clients/python/text_generation/client.py
@@ -1,18 +1,32 @@
 import json
 import requests
+import warnings
 
 from aiohttp import ClientSession, ClientTimeout
 from pydantic import ValidationError
-from typing import Dict, Optional, List, AsyncIterator, Iterator
+from typing import Dict, Optional, List, AsyncIterator, Iterator, Union
 
+from text_generation import DEPRECATION_WARNING
 from text_generation.types import (
     StreamResponse,
     Response,
     Request,
     Parameters,
+    Grammar,
+    CompletionRequest,
+    Completion,
+    CompletionComplete,
+    ChatRequest,
+    ChatCompletionChunk,
+    ChatComplete,
+    Message,
+    Tool,
 )
 from text_generation.errors import parse_error
 
+# Always show DeprecationWarning (NOTE: `simplefilter` changes the process-wide warning filters)
+warnings.simplefilter("always", DeprecationWarning)
+
 
 class Client:
     """Client to make calls to a text-generation-inference instance
@@ -53,11 +67,222 @@ class Client:
             timeout (`int`):
                 Timeout in seconds
         """
+        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
         self.base_url = base_url
         self.headers = headers
         self.cookies = cookies
         self.timeout = timeout
 
+    def completion(
+        self,
+        prompt: str,
+        frequency_penalty: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        stop: Optional[List[str]] = None,
+    ):
+        """
+        Given a prompt, generate a response synchronously
+
+        Args:
+            prompt (`str`):
+                Prompt
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            repetition_penalty (`float`):
+                The parameter for repetition penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            seed (`int`):
+                Random sampling seed
+            stream (`bool`):
+                If `True`, return an iterator of `CompletionComplete` chunks instead of a single `Completion`
+            temperature (`float`):
+                The value used to modulate the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+        """
+        request = CompletionRequest(
+            model="tgi",
+            prompt=prompt,
+            frequency_penalty=frequency_penalty,
+            max_tokens=max_tokens,
+            repetition_penalty=repetition_penalty,
+            seed=seed,
+            stream=stream,
+            temperature=temperature,
+            top_p=top_p,
+            stop=stop,
+        )
+        if not stream:
+            resp = requests.post(
+                f"{self.base_url}/v1/completions",
+                json=request.dict(),
+                headers=self.headers,
+                cookies=self.cookies,
+                timeout=self.timeout,
+            )
+            payload = resp.json()
+            if resp.status_code != 200:
+                raise parse_error(resp.status_code, payload)
+            return Completion(**payload)
+        else:
+            return self._completion_stream_response(request)
+
+    def _completion_stream_response(self, request):
+        """Stream `/v1/completions`, yielding one `CompletionComplete` per `data:` line."""
+        resp = requests.post(
+            f"{self.base_url}/v1/completions",
+            json=request.dict(),
+            headers=self.headers,
+            cookies=self.cookies,
+            timeout=self.timeout,
+            stream=True,
+        )
+        for byte_payload in resp.iter_lines():
+            payload = byte_payload.decode("utf-8")
+            if not payload.startswith("data:"):
+                continue
+            # Slice the prefix off: `lstrip("data:")` strips a character set, not a prefix
+            json_payload = json.loads(payload[len("data:") :])
+            try:
+                yield CompletionComplete(**json_payload)
+            except ValidationError:
+                # `requests` exposes `status_code`; `resp.status` would raise AttributeError
+                raise parse_error(resp.status_code, json_payload)
+
+    def chat(
+        self,
+        messages: List[Message],
+        repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        stream: bool = False,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        tools: Optional[List[Tool]] = None,
+        tool_prompt: Optional[str] = None,
+        tool_choice: Optional[str] = None,
+        stop: Optional[List[str]] = None,
+    ):
+        """
+        Given a list of messages, generate a response synchronously
+
+        Args:
+            messages (`List[Message]`):
+                List of messages
+            repetition_penalty (`float`):
+                The parameter for repetition penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            logit_bias (`List[float]`):
+                Adjust the likelihood of specified tokens
+            logprobs (`bool`):
+                Include log probabilities in the response
+            top_logprobs (`int`):
+                Include the `n` most likely tokens at each step
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            n (`int`):
+                Generate `n` completions
+            presence_penalty (`float`):
+                The parameter for presence penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            stream (`bool`):
+                If `True`, return an iterator of `ChatCompletionChunk` chunks instead of a single `ChatComplete`
+            seed (`int`):
+                Random sampling seed
+            temperature (`float`):
+                The value used to modulate the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            tools (`List[Tool]`):
+                List of tools to use
+            tool_prompt (`str`):
+                A prompt to be appended before the tools
+            tool_choice (`str`):
+                The tool to use
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+
+        """
+        request = ChatRequest(
+            model="tgi",
+            messages=messages,
+            repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            stream=stream,
+            seed=seed,
+            temperature=temperature,
+            top_p=top_p,
+            tools=tools,
+            tool_prompt=tool_prompt,
+            tool_choice=tool_choice,
+            stop=stop,
+        )
+        if not stream:
+            resp = requests.post(
+                f"{self.base_url}/v1/chat/completions",
+                json=request.dict(),
+                headers=self.headers,
+                cookies=self.cookies,
+                timeout=self.timeout,
+            )
+            payload = resp.json()
+            if resp.status_code != 200:
+                raise parse_error(resp.status_code, payload)
+            return ChatComplete(**payload)
+        else:
+            return self._chat_stream_response(request)
+
+    def _chat_stream_response(self, request):
+        """Stream `/v1/chat/completions`, yielding one `ChatCompletionChunk` per `data:` line."""
+        resp = requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            json=request.dict(),
+            headers=self.headers,
+            cookies=self.cookies,
+            timeout=self.timeout,
+            stream=True,
+        )
+        for byte_payload in resp.iter_lines():
+            payload = byte_payload.decode("utf-8")
+            if not payload.startswith("data:"):
+                continue
+            # Slice the prefix off: `lstrip("data:")` strips a character set, not a prefix
+            json_payload = json.loads(payload[len("data:") :])
+            try:
+                yield ChatCompletionChunk(**json_payload)
+            except ValidationError:
+                # `requests` exposes `status_code`; `resp.status` would raise AttributeError
+                raise parse_error(resp.status_code, json_payload)
+
     def generate(
         self,
         prompt: str,
@@ -65,6 +290,7 @@ class Client:
         max_new_tokens: int = 20,
         best_of: Optional[int] = None,
         repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
         return_full_text: bool = False,
         seed: Optional[int] = None,
         stop_sequences: Optional[List[str]] = None,
@@ -76,6 +302,7 @@ class Client:
         watermark: bool = False,
         decoder_input_details: bool = False,
         top_n_tokens: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
     ) -> Response:
         """
         Given a prompt, generate the following text
@@ -92,6 +319,10 @@ class Client:
             repetition_penalty (`float`):
                 The parameter for repetition penalty. 1.0 means no penalty. See [this
                 paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 1.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
             return_full_text (`bool`):
                 Whether to prepend the prompt to the generated text
             seed (`int`):
@@ -116,6 +347,9 @@ class Client:
                 Return the decoder input token logprobs and ids
             top_n_tokens (`int`):
                 Return the `n` most likely tokens at each step
+            grammar (`Grammar`):
+                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
+                of the text to match a regular expression or JSON schema.
 
         Returns:
             Response: generated response
@@ -127,6 +361,7 @@ class Client:
             do_sample=do_sample,
             max_new_tokens=max_new_tokens,
             repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
             return_full_text=return_full_text,
             seed=seed,
             stop=stop_sequences if stop_sequences is not None else [],
@@ -137,7 +372,8 @@ class Client:
             typical_p=typical_p,
             watermark=watermark,
             decoder_input_details=decoder_input_details,
-            top_n_tokens=top_n_tokens
+            top_n_tokens=top_n_tokens,
+            grammar=grammar,
         )
         request = Request(inputs=prompt, stream=False, parameters=parameters)
 
@@ -159,6 +395,7 @@ class Client:
         do_sample: bool = False,
         max_new_tokens: int = 20,
         repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
         return_full_text: bool = False,
         seed: Optional[int] = None,
         stop_sequences: Optional[List[str]] = None,
@@ -169,6 +406,7 @@ class Client:
         typical_p: Optional[float] = None,
         watermark: bool = False,
         top_n_tokens: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
     ) -> Iterator[StreamResponse]:
         """
         Given a prompt, generate the following stream of tokens
@@ -183,6 +421,10 @@ class Client:
             repetition_penalty (`float`):
                 The parameter for repetition penalty. 1.0 means no penalty. See [this
                 paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 1.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
             return_full_text (`bool`):
                 Whether to prepend the prompt to the generated text
             seed (`int`):
@@ -205,6 +447,9 @@ class Client:
                 Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
             top_n_tokens (`int`):
                 Return the `n` most likely tokens at each step
+            grammar (`Grammar`):
+                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
+                of the text to match a regular expression or JSON schema.
 
         Returns:
             Iterator[StreamResponse]: stream of generated tokens
@@ -217,6 +462,7 @@ class Client:
             do_sample=do_sample,
             max_new_tokens=max_new_tokens,
             repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
             return_full_text=return_full_text,
             seed=seed,
             stop=stop_sequences if stop_sequences is not None else [],
@@ -227,6 +473,7 @@ class Client:
             typical_p=typical_p,
             watermark=watermark,
             top_n_tokens=top_n_tokens,
+            grammar=grammar,
         )
         request = Request(inputs=prompt, stream=True, parameters=parameters)
 
@@ -303,10 +550,219 @@ class AsyncClient:
             timeout (`int`):
                 Timeout in seconds
         """
+        warnings.warn(DEPRECATION_WARNING, DeprecationWarning)
         self.base_url = base_url
         self.headers = headers
         self.cookies = cookies
-        self.timeout = ClientTimeout(timeout * 60)
+        self.timeout = ClientTimeout(timeout)
+
+    async def completion(
+        self,
+        prompt: str,
+        frequency_penalty: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stream: bool = False,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        stop: Optional[List[str]] = None,
+    ) -> Union[Completion, AsyncIterator[CompletionComplete]]:
+        """
+        Given a prompt, generate a response asynchronously
+
+        Args:
+            prompt (`str`):
+                Prompt
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            repetition_penalty (`float`):
+                The parameter for repetition penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            seed (`int`):
+                Random sampling seed
+            stream (`bool`):
+                If `True`, return an async iterator of `CompletionComplete` chunks instead of a single `Completion`
+            temperature (`float`):
+                The value used to modulate the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+        """
+        request = CompletionRequest(
+            model="tgi",
+            prompt=prompt,
+            frequency_penalty=frequency_penalty,
+            max_tokens=max_tokens,
+            repetition_penalty=repetition_penalty,
+            seed=seed,
+            stream=stream,
+            temperature=temperature,
+            top_p=top_p,
+            stop=stop,
+        )
+        if not stream:
+            return await self._completion_single_response(request)
+        else:
+            return self._completion_stream_response(request)
+
+    async def _completion_single_response(self, request):
+        """POST `request` to `/v1/completions` and build a `Completion` from the JSON body."""
+        url = f"{self.base_url}/v1/completions"
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(url, json=request.dict()) as resp:
+                body = await resp.json()
+                if resp.status != 200:
+                    raise parse_error(resp.status, body)
+                return Completion(**body)
+
+    async def _completion_stream_response(self, request):
+        """Stream `/v1/completions`, yielding one `CompletionComplete` per `data:` line."""
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(
+                f"{self.base_url}/v1/completions", json=request.dict()
+            ) as resp:
+                async for byte_payload in resp.content:
+                    payload = byte_payload.decode("utf-8")
+                    if not payload.startswith("data:"):
+                        continue
+                    # Slice the prefix off: `lstrip("data:")` strips a character set
+                    json_payload = json.loads(payload[len("data:") :])
+                    try:
+                        yield CompletionComplete(**json_payload)
+                    except ValidationError:
+                        raise parse_error(resp.status, json_payload)
+
+    async def chat(
+        self,
+        messages: List[Message],
+        repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[List[float]] = None,
+        logprobs: Optional[bool] = None,
+        top_logprobs: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        stream: bool = False,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        tools: Optional[List[Tool]] = None,
+        tool_prompt: Optional[str] = None,
+        tool_choice: Optional[str] = None,
+        stop: Optional[List[str]] = None,
+    ) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
+        """
+        Given a list of messages, generate a response asynchronously
+
+        Args:
+            messages (`List[Message]`):
+                List of messages
+            repetition_penalty (`float`):
+                The parameter for repetition penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 0.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
+            logit_bias (`List[float]`):
+                Adjust the likelihood of specified tokens
+            logprobs (`bool`):
+                Include log probabilities in the response
+            top_logprobs (`int`):
+                Include the `n` most likely tokens at each step
+            max_tokens (`int`):
+                Maximum number of generated tokens
+            n (`int`):
+                Generate `n` completions
+            presence_penalty (`float`):
+                The parameter for presence penalty. 0.0 means no penalty. See [this
+                paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            stream (`bool`):
+                If `True`, return an async iterator of `ChatCompletionChunk` chunks instead of a single `ChatComplete`
+            seed (`int`):
+                Random sampling seed
+            temperature (`float`):
+                The value used to modulate the logits distribution.
+            top_p (`float`):
+                If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
+                higher are kept for generation
+            tools (`List[Tool]`):
+                List of tools to use
+            tool_prompt (`str`):
+                A prompt to be appended before the tools
+            tool_choice (`str`):
+                The tool to use
+            stop (`List[str]`):
+                Stop generating tokens if a member of `stop` is generated
+
+        """
+        request = ChatRequest(
+            model="tgi",
+            messages=messages,
+            repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            top_logprobs=top_logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            stream=stream,
+            seed=seed,
+            temperature=temperature,
+            top_p=top_p,
+            tools=tools,
+            tool_prompt=tool_prompt,
+            tool_choice=tool_choice,
+            stop=stop,
+        )
+        if not stream:
+            return await self._chat_single_response(request)
+        else:
+            return self._chat_stream_response(request)
+
+    async def _chat_single_response(self, request):
+        """POST `request` to `/v1/chat/completions` and build a `ChatComplete` from the JSON body."""
+        url = f"{self.base_url}/v1/chat/completions"
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(url, json=request.dict()) as resp:
+                body = await resp.json()
+                if resp.status != 200:
+                    raise parse_error(resp.status, body)
+                return ChatComplete(**body)
+
+    async def _chat_stream_response(self, request):
+        """Stream `/v1/chat/completions`, yielding one `ChatCompletionChunk` per `data:` line."""
+        async with ClientSession(
+            headers=self.headers, cookies=self.cookies, timeout=self.timeout
+        ) as session:
+            async with session.post(
+                f"{self.base_url}/v1/chat/completions", json=request.dict()
+            ) as resp:
+                async for byte_payload in resp.content:
+                    payload = byte_payload.decode("utf-8")
+                    if not payload.startswith("data:"):
+                        continue
+                    # Slice the prefix off: `lstrip("data:")` strips a character set
+                    json_payload = json.loads(payload[len("data:") :])
+                    try:
+                        yield ChatCompletionChunk(**json_payload)
+                    except ValidationError:
+                        raise parse_error(resp.status, json_payload)
 
     async def generate(
         self,
@@ -315,6 +771,7 @@ class AsyncClient:
         max_new_tokens: int = 20,
         best_of: Optional[int] = None,
         repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
         return_full_text: bool = False,
         seed: Optional[int] = None,
         stop_sequences: Optional[List[str]] = None,
@@ -326,6 +783,7 @@ class AsyncClient:
         watermark: bool = False,
         decoder_input_details: bool = False,
         top_n_tokens: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
     ) -> Response:
         """
         Given a prompt, generate the following text asynchronously
@@ -342,6 +800,10 @@ class AsyncClient:
             repetition_penalty (`float`):
                 The parameter for repetition penalty. 1.0 means no penalty. See [this
                 paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 1.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
             return_full_text (`bool`):
                 Whether to prepend the prompt to the generated text
             seed (`int`):
@@ -366,10 +828,14 @@ class AsyncClient:
                 Return the decoder input token logprobs and ids
             top_n_tokens (`int`):
                 Return the `n` most likely tokens at each step
+            grammar (`Grammar`):
+                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
+                of the text to match a regular expression or JSON schema.
 
         Returns:
             Response: generated response
         """
+
         # Validate parameters
         parameters = Parameters(
             best_of=best_of,
@@ -378,6 +844,7 @@ class AsyncClient:
             do_sample=do_sample,
             max_new_tokens=max_new_tokens,
             repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
             return_full_text=return_full_text,
             seed=seed,
             stop=stop_sequences if stop_sequences is not None else [],
@@ -388,6 +855,7 @@ class AsyncClient:
             typical_p=typical_p,
             watermark=watermark,
             top_n_tokens=top_n_tokens,
+            grammar=grammar,
         )
         request = Request(inputs=prompt, stream=False, parameters=parameters)
 
@@ -407,6 +875,7 @@ class AsyncClient:
         do_sample: bool = False,
         max_new_tokens: int = 20,
         repetition_penalty: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
         return_full_text: bool = False,
         seed: Optional[int] = None,
         stop_sequences: Optional[List[str]] = None,
@@ -417,6 +886,7 @@ class AsyncClient:
         typical_p: Optional[float] = None,
         watermark: bool = False,
         top_n_tokens: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
     ) -> AsyncIterator[StreamResponse]:
         """
         Given a prompt, generate the following stream of tokens asynchronously
@@ -431,6 +901,10 @@ class AsyncClient:
             repetition_penalty (`float`):
                 The parameter for repetition penalty. 1.0 means no penalty. See [this
                 paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            frequency_penalty (`float`):
+                The parameter for frequency penalty. 1.0 means no penalty
+                Penalize new tokens based on their existing frequency in the text so far,
+                decreasing the model's likelihood to repeat the same line verbatim.
             return_full_text (`bool`):
                 Whether to prepend the prompt to the generated text
             seed (`int`):
@@ -453,6 +927,9 @@ class AsyncClient:
                 Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
             top_n_tokens (`int`):
                 Return the `n` most likely tokens at each step
+            grammar (`Grammar`):
+                Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
+                of the text to match a regular expression or JSON schema.
 
         Returns:
             AsyncIterator[StreamResponse]: stream of generated tokens
@@ -465,6 +942,7 @@ class AsyncClient:
             do_sample=do_sample,
             max_new_tokens=max_new_tokens,
             repetition_penalty=repetition_penalty,
+            frequency_penalty=frequency_penalty,
             return_full_text=return_full_text,
             seed=seed,
             stop=stop_sequences if stop_sequences is not None else [],
@@ -475,6 +953,7 @@ class AsyncClient:
             typical_p=typical_p,
             watermark=watermark,
             top_n_tokens=top_n_tokens,
+            grammar=grammar,
         )
         request = Request(inputs=prompt, stream=True, parameters=parameters)
 
@@ -482,7 +961,6 @@ class AsyncClient:
             headers=self.headers, cookies=self.cookies, timeout=self.timeout
         ) as session:
             async with session.post(self.base_url, json=request.dict()) as resp:
-
                 if resp.status != 200:
                     raise parse_error(resp.status, await resp.json())
 
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index 38f75253..a56edaca 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -1,10 +1,198 @@
 from enum import Enum
-from pydantic import BaseModel, validator
-from typing import Optional, List
+from pydantic import BaseModel, field_validator, ConfigDict
+from typing import Optional, List, Union, Any
 
 from text_generation.errors import ValidationError
 
 
+# Enum of grammar kinds; the `str` mixin makes members compare equal to their string value
+class GrammarType(str, Enum):
+    Json = "json"
+    Regex = "regex"
+
+
+# Grammar type and value, passed to generation through `Parameters.grammar`
+class Grammar(BaseModel):
+    # Grammar type
+    type: GrammarType
+    # Grammar value (string or dict); `Parameters.valid_grammar` rejects an empty one
+    value: Union[str, dict]
+
+
+class ToolCall(BaseModel):  # all three fields are required (no defaults declared)
+    # Id of the tool call
+    id: int
+    # Type of the tool call
+    type: str
+    # Function details of the tool call (free-form dict, not validated further here)
+    function: dict
+
+
+class Message(BaseModel):  # element type of `ChatRequest.messages` and of a choice's `message`
+    # Role of the message sender
+    role: str
+    # Content of the message
+    content: Optional[str] = None
+    # Optional name of the message sender
+    name: Optional[str] = None
+    # Tool calls associated with the chat completion (typed `Any`: accepted as-is)
+    tool_calls: Optional[Any] = None
+
+
+class Tool(BaseModel):  # element type of `ChatRequest.tools`
+    # Type of the tool
+    type: str
+    # Function details of the tool (free-form dict, not validated further here)
+    function: dict
+
+
+class Function(BaseModel):  # function part of a `ChoiceDeltaToolCall`
+    name: Optional[str] = None  # defaulted: pydantic v2 treats a bare Optional as required
+    arguments: str
+
+
+class ChoiceDeltaToolCall(BaseModel):  # tool-call payload carried by `ChoiceDelta.tool_calls`
+    index: int
+    id: str  # NOTE(review): a str here while `ToolCall.id` is an int; confirm which is intended
+    type: str
+    function: Function
+
+
+class ChoiceDelta(BaseModel):  # incremental message content of one streamed `Choice`
+    role: str
+    content: Optional[str] = None
+    tool_calls: Optional[ChoiceDeltaToolCall] = None  # defaulted so deltas without it validate
+
+
+class Choice(BaseModel):  # element type of `ChatCompletionChunk.choices`
+    index: int
+    delta: ChoiceDelta
+    logprobs: Optional[dict] = None
+    finish_reason: Optional[str] = None
+
+
+class CompletionRequest(BaseModel):  # NOTE(review): no caller visible here; presumably the /v1/completions body
+    # Model identifier
+    model: str
+    # Prompt
+    prompt: str
+    # The parameter for repetition penalty. 1.0 means no penalty.
+    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+    repetition_penalty: Optional[float] = None
+    # The parameter for frequency penalty. 1.0 means no penalty.
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float] = None
+    # Maximum number of tokens to generate
+    max_tokens: Optional[int] = None
+    # Flag to indicate streaming response
+    stream: bool = False
+    # Random sampling seed
+    seed: Optional[int] = None
+    # Sampling temperature
+    temperature: Optional[float] = None
+    # Top-p value for nucleus sampling
+    top_p: Optional[float] = None
+    # Stop generating tokens if a member of `stop` is generated
+    stop: Optional[List[str]] = None
+
+
+class CompletionComplete(BaseModel):
+    # Index of the completion choice
+    index: int
+    # Text of the completion choice
+    text: str
+    # Log probabilities for the completion (defaulted: a bare Optional is required in pydantic v2)
+    logprobs: Optional[Any] = None
+    # Reason for completion
+    finish_reason: str
+
+
+class Completion(BaseModel):
+    # Completion details; every field below is required (none declares a default)
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[CompletionComplete]
+
+
+class ChatRequest(BaseModel):  # presumably what the client posts to /v1/chat/completions; confirm in `chat`
+    # Model identifier
+    model: str
+    # List of messages in the conversation
+    messages: List[Message]
+    # The parameter for repetition penalty. 1.0 means no penalty.
+    # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+    repetition_penalty: Optional[float] = None
+    # The parameter for frequency penalty. 1.0 means no penalty.
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float] = None
+    # Bias values for token selection
+    logit_bias: Optional[List[float]] = None
+    # Whether to return log probabilities
+    logprobs: Optional[bool] = None
+    # Number of most likely tokens to return at each position
+    top_logprobs: Optional[int] = None
+    # Maximum number of tokens to generate
+    max_tokens: Optional[int] = None
+    # Number of chat completion choices to generate
+    n: Optional[int] = None
+    # Penalty for presence of new tokens
+    presence_penalty: Optional[float] = None
+    # Flag to indicate streaming response
+    stream: bool = False
+    # Random sampling seed
+    seed: Optional[int] = None
+    # Sampling temperature
+    temperature: Optional[float] = None
+    # Top-p value for nucleus sampling
+    top_p: Optional[float] = None
+    # List of tools to be used
+    tools: Optional[List[Tool]] = None
+    # A prompt to be appended before the tools
+    tool_prompt: Optional[str] = None
+    # Choice of tool to be used
+    tool_choice: Optional[str] = None
+    # Stop generating tokens if a member of `stop` is generated
+    stop: Optional[List[str]] = None
+
+
+class ChatCompletionComplete(BaseModel):
+    # Index of the chat completion
+    index: int
+    # Message associated with the chat completion
+    message: Message
+    # Log probabilities for the chat completion (defaulted: a bare Optional is required in pydantic v2)
+    logprobs: Optional[Any] = None
+    # Reason for completion
+    finish_reason: str
+    # Usage details of the chat completion
+    usage: Optional[Any] = None
+
+
+class ChatComplete(BaseModel):
+    # Chat completion details; the async client builds this from the non-streaming JSON reply
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[ChatCompletionComplete]
+    usage: Any  # typed `Any` but still required: no default is declared
+
+
+class ChatCompletionChunk(BaseModel):  # one streamed event; the async client builds one per "data:" line
+    id: str
+    object: str
+    created: int
+    model: str
+    system_fingerprint: str
+    choices: List[Choice]
+
+
 class Parameters(BaseModel):
     # Activate logits sampling
     do_sample: bool = False
@@ -13,26 +201,30 @@ class Parameters(BaseModel):
     # The parameter for repetition penalty. 1.0 means no penalty.
     # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
     repetition_penalty: Optional[float] = None
+    # The parameter for frequency penalty. 1.0 means no penalty
+    # Penalize new tokens based on their existing frequency in the text so far,
+    # decreasing the model's likelihood to repeat the same line verbatim.
+    frequency_penalty: Optional[float] = None
     # Whether to prepend the prompt to the generated text
     return_full_text: bool = False
     # Stop generating tokens if a member of `stop_sequences` is generated
     stop: List[str] = []
     # Random sampling seed
-    seed: Optional[int]
+    seed: Optional[int] = None
     # The value used to module the logits distribution.
-    temperature: Optional[float]
+    temperature: Optional[float] = None
     # The number of highest probability vocabulary tokens to keep for top-k-filtering.
-    top_k: Optional[int]
+    top_k: Optional[int] = None
     # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
     # higher are kept for generation.
-    top_p: Optional[float]
+    top_p: Optional[float] = None
     # truncate inputs tokens to the given size
-    truncate: Optional[int]
+    truncate: Optional[int] = None
     # Typical Decoding mass
     # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information
-    typical_p: Optional[float]
+    typical_p: Optional[float] = None
     # Generate best_of sequences and return the one if the highest token logprobs
-    best_of: Optional[int]
+    best_of: Optional[int] = None
     # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
     watermark: bool = False
     # Get generation details
@@ -40,100 +232,119 @@ class Parameters(BaseModel):
     # Get decoder input token logprobs and ids
     decoder_input_details: bool = False
     # Return the N most likely tokens at each step
-    top_n_tokens: Optional[int]
+    top_n_tokens: Optional[int] = None
+    # grammar to use for generation
+    grammar: Optional[Grammar] = None
 
-    @validator("best_of")
+    @field_validator("best_of")
     def valid_best_of(cls, field_value, values):
         if field_value is not None:
             if field_value <= 0:
                 raise ValidationError("`best_of` must be strictly positive")
-            if field_value > 1 and values["seed"] is not None:
+            if field_value > 1 and values.data["seed"] is not None:
                 raise ValidationError("`seed` must not be set when `best_of` is > 1")
             sampling = (
-                values["do_sample"]
-                | (values["temperature"] is not None)
-                | (values["top_k"] is not None)
-                | (values["top_p"] is not None)
-                | (values["typical_p"] is not None)
+                values.data["do_sample"]
+                | (values.data["temperature"] is not None)
+                | (values.data["top_k"] is not None)
+                | (values.data["top_p"] is not None)
+                | (values.data["typical_p"] is not None)
             )
             if field_value > 1 and not sampling:
                 raise ValidationError("you must use sampling when `best_of` is > 1")
 
         return field_value
 
-    @validator("repetition_penalty")
+    @field_validator("repetition_penalty")
     def valid_repetition_penalty(cls, v):
         if v is not None and v <= 0:
             raise ValidationError("`repetition_penalty` must be strictly positive")
         return v
 
-    @validator("seed")
+    @field_validator("frequency_penalty")
+    def valid_frequency_penalty(cls, v):  # None passes through; any value <= 0 is rejected
+        if v is not None and v <= 0:  # NOTE(review): confirm this matches the range the server accepts
+            raise ValidationError("`frequency_penalty` must be strictly positive")
+        return v
+
+    @field_validator("seed")
     def valid_seed(cls, v):
         if v is not None and v < 0:
             raise ValidationError("`seed` must be positive")
         return v
 
-    @validator("temperature")
+    @field_validator("temperature")
     def valid_temp(cls, v):
         if v is not None and v <= 0:
             raise ValidationError("`temperature` must be strictly positive")
         return v
 
-    @validator("top_k")
+    @field_validator("top_k")
     def valid_top_k(cls, v):
         if v is not None and v <= 0:
             raise ValidationError("`top_k` must be strictly positive")
         return v
 
-    @validator("top_p")
+    @field_validator("top_p")
     def valid_top_p(cls, v):
         if v is not None and (v <= 0 or v >= 1.0):
             raise ValidationError("`top_p` must be > 0.0 and < 1.0")
         return v
 
-    @validator("truncate")
+    @field_validator("truncate")
     def valid_truncate(cls, v):
         if v is not None and v <= 0:
             raise ValidationError("`truncate` must be strictly positive")
         return v
 
-    @validator("typical_p")
+    @field_validator("typical_p")
     def valid_typical_p(cls, v):
         if v is not None and (v <= 0 or v >= 1.0):
             raise ValidationError("`typical_p` must be > 0.0 and < 1.0")
         return v
 
-    @validator("top_n_tokens")
+    @field_validator("top_n_tokens")
     def valid_top_n_tokens(cls, v):
         if v is not None and v <= 0:
             raise ValidationError("`top_n_tokens` must be strictly positive")
         return v
 
+    @field_validator("grammar")
+    def valid_grammar(cls, v):
+        empty = v is not None and not v.value  # only a present-but-empty grammar can fail
+        if empty and v.type == GrammarType.Regex:
+            raise ValidationError("`value` cannot be empty for `regex` grammar")
+        if empty and v.type == GrammarType.Json:
+            raise ValidationError("`value` cannot be empty for `json` grammar")
+        return v
+
 
 class Request(BaseModel):
     # Prompt
     inputs: str
     # Generation parameters
-    parameters: Optional[Parameters]
+    parameters: Optional[Parameters] = None
     # Whether to stream output tokens
     stream: bool = False
 
-    @validator("inputs")
+    @field_validator("inputs")
     def valid_input(cls, v):
         if not v:
             raise ValidationError("`inputs` cannot be empty")
         return v
 
-    @validator("stream")
+    @field_validator("stream")
     def valid_best_of_stream(cls, field_value, values):
-        parameters = values["parameters"]
+        parameters = values.data["parameters"]
         if (
             parameters is not None
             and parameters.best_of is not None
             and parameters.best_of > 1
             and field_value
         ):
-            raise ValidationError("`best_of` != 1 is not supported when `stream` == True")
+            raise ValidationError(
+                "`best_of` != 1 is not supported when `stream` == True"
+            )
         return field_value
 
 
@@ -145,7 +356,7 @@ class InputToken(BaseModel):
     text: str
     # Logprob
     # Optional since the logprob of the first token cannot be computed
-    logprob: Optional[float]
+    logprob: Optional[float] = None
 
 
 # Generated tokens
@@ -155,7 +366,7 @@ class Token(BaseModel):
     # Token text
     text: str
     # Logprob
-    logprob: float
+    logprob: Optional[float] = None
     # Is the token a special token
     # Can be used to ignore tokens when concatenating
     special: bool
@@ -180,13 +391,13 @@ class BestOfSequence(BaseModel):
     # Number of generated tokens
     generated_tokens: int
     # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None
     # Decoder input tokens, empty if decoder_input_details is False
     prefill: List[InputToken]
     # Generated tokens
     tokens: List[Token]
     # Most likely tokens
-    top_tokens: Optional[List[List[Token]]]
+    top_tokens: Optional[List[List[Token]]] = None
 
 
 # `generate` details
@@ -196,15 +407,15 @@ class Details(BaseModel):
     # Number of generated tokens
     generated_tokens: int
     # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None
     # Decoder input tokens, empty if decoder_input_details is False
     prefill: List[InputToken]
     # Generated tokens
     tokens: List[Token]
     # Most likely tokens
-    top_tokens: Optional[List[List[Token]]]
+    top_tokens: Optional[List[List[Token]]] = None
     # Additional sequences when using the `best_of` parameter
-    best_of_sequences: Optional[List[BestOfSequence]]
+    best_of_sequences: Optional[List[BestOfSequence]] = None
 
 
 # `generate` return value
@@ -222,7 +433,7 @@ class StreamDetails(BaseModel):
     # Number of generated tokens
     generated_tokens: int
     # Sampling seed if sampling was activated
-    seed: Optional[int]
+    seed: Optional[int] = None
 
 
 # `generate_stream` return value
@@ -230,16 +441,20 @@ class StreamResponse(BaseModel):
     # Generated token
     token: Token
     # Most likely tokens
-    top_tokens: Optional[List[Token]]
+    top_tokens: Optional[List[Token]] = None
     # Complete generated text
     # Only available when the generation is finished
-    generated_text: Optional[str]
+    generated_text: Optional[str] = None
     # Generation details
     # Only available when the generation is finished
-    details: Optional[StreamDetails]
+    details: Optional[StreamDetails] = None
 
 
 # Inference API currently deployed model
 class DeployedModel(BaseModel):
+    # Disable warning for use of `model_` prefix in `model_id`. Be mindful about adding members
+    # with model_ prefixes, since this disables guardrails for colliding fields:
+    # https://github.com/pydantic/pydantic/issues/9177
+    model_config = ConfigDict(protected_namespaces=())
     model_id: str
     sha: str
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 00000000..fb2ff198
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,10 @@
+Documentation available at: https://huggingface.co/docs/text-generation-inference
+
+## Release
+
+When making a release, please update the latest version in the documentation with:
+```
+export OLD_VERSION="2\.0\.3"
+export NEW_VERSION="2\.0\.4"
+find . -name '*.md' -exec sed -i -e "s/$OLD_VERSION/$NEW_VERSION/g" {} \;
+```
diff --git a/docs/index.html b/docs/index.html
index 16d143d8..f582d3ce 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -27,4 +27,4 @@
             }
         </script>
     </body>
-</html>
\ No newline at end of file
+</html>
diff --git a/docs/openapi.json b/docs/openapi.json
index 5974c58d..9c9a8b1a 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.0.2"
+    "version": "2.1.2-dev0"
   },
   "paths": {
     "/": {
@@ -19,7 +19,6 @@
           "Text Generation Inference"
         ],
         "summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
-        "description": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
         "operationId": "compat_generate",
         "requestBody": {
           "content": {
@@ -108,7 +107,6 @@
           "Text Generation Inference"
         ],
         "summary": "Generate tokens",
-        "description": "Generate tokens",
         "operationId": "generate",
         "requestBody": {
           "content": {
@@ -192,7 +190,6 @@
           "Text Generation Inference"
         ],
         "summary": "Generate a stream of token using Server-Sent Events",
-        "description": "Generate a stream of token using Server-Sent Events",
         "operationId": "generate_stream",
         "requestBody": {
           "content": {
@@ -276,7 +273,6 @@
           "Text Generation Inference"
         ],
         "summary": "Health check method",
-        "description": "Health check method",
         "operationId": "health",
         "responses": {
           "200": {
@@ -305,7 +301,6 @@
           "Text Generation Inference"
         ],
         "summary": "Text Generation Inference endpoint info",
-        "description": "Text Generation Inference endpoint info",
         "operationId": "get_model_info",
         "responses": {
           "200": {
@@ -327,7 +322,6 @@
           "Text Generation Inference"
         ],
         "summary": "Prometheus metrics scrape endpoint",
-        "description": "Prometheus metrics scrape endpoint",
         "operationId": "metrics",
         "responses": {
           "200": {
@@ -342,6 +336,226 @@
           }
         }
       }
+    },
+    "/tokenize": {
+      "post": {
+        "tags": [
+          "Text Generation Inference"
+        ],
+        "summary": "Tokenize inputs",
+        "operationId": "tokenize",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/GenerateRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Tokenized ids",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/TokenizeResponse"
+                }
+              }
+            }
+          },
+          "404": {
+            "description": "No tokenizer found",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "No fast tokenizer available"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/v1/chat/completions": {
+      "post": {
+        "tags": [
+          "Text Generation Inference"
+        ],
+        "summary": "Generate tokens",
+        "operationId": "chat_completions",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/ChatRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Generated Chat Completion",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ChatCompletion"
+                }
+              },
+              "text/event-stream": {
+                "schema": {
+                  "$ref": "#/components/schemas/ChatCompletionChunk"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Input validation error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Input validation error"
+                }
+              }
+            }
+          },
+          "424": {
+            "description": "Generation Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Request failed during generation"
+                }
+              }
+            }
+          },
+          "429": {
+            "description": "Model is overloaded",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Model is overloaded"
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "Incomplete generation",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Incomplete generation"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/v1/completions": {
+      "post": {
+        "tags": [
+          "Text Generation Inference"
+        ],
+        "summary": "Generate tokens",
+        "operationId": "completions",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/CompletionRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Generated Chat Completion",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/Completion"
+                }
+              },
+              "text/event-stream": {
+                "schema": {
+                  "$ref": "#/components/schemas/CompletionCompleteChunk"
+                }
+              }
+            }
+          },
+          "422": {
+            "description": "Input validation error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Input validation error"
+                }
+              }
+            }
+          },
+          "424": {
+            "description": "Generation Error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Request failed during generation"
+                }
+              }
+            }
+          },
+          "429": {
+            "description": "Model is overloaded",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Model is overloaded"
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "Incomplete generation",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "error": "Incomplete generation"
+                }
+              }
+            }
+          }
+        }
+      }
     }
   },
   "components": {
@@ -367,7 +581,7 @@
             "type": "integer",
             "format": "int32",
             "example": 1,
-            "minimum": 0.0
+            "minimum": 0
           },
           "prefill": {
             "type": "array",
@@ -380,13 +594,387 @@
             "format": "int64",
             "example": 42,
             "nullable": true,
-            "minimum": 0.0
+            "minimum": 0
           },
           "tokens": {
             "type": "array",
             "items": {
               "$ref": "#/components/schemas/Token"
             }
+          },
+          "top_tokens": {
+            "type": "array",
+            "items": {
+              "type": "array",
+              "items": {
+                "$ref": "#/components/schemas/Token"
+              }
+            }
+          }
+        }
+      },
+      "ChatCompletion": {
+        "type": "object",
+        "required": [
+          "id",
+          "created",
+          "model",
+          "system_fingerprint",
+          "choices",
+          "usage"
+        ],
+        "properties": {
+          "choices": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/ChatCompletionComplete"
+            }
+          },
+          "created": {
+            "type": "integer",
+            "format": "int64",
+            "example": "1706270835",
+            "minimum": 0
+          },
+          "id": {
+            "type": "string"
+          },
+          "model": {
+            "type": "string",
+            "example": "mistralai/Mistral-7B-Instruct-v0.2"
+          },
+          "system_fingerprint": {
+            "type": "string"
+          },
+          "usage": {
+            "$ref": "#/components/schemas/Usage"
+          }
+        }
+      },
+      "ChatCompletionChoice": {
+        "type": "object",
+        "required": [
+          "index",
+          "delta"
+        ],
+        "properties": {
+          "delta": {
+            "$ref": "#/components/schemas/ChatCompletionDelta"
+          },
+          "finish_reason": {
+            "type": "string",
+            "nullable": true
+          },
+          "index": {
+            "type": "integer",
+            "format": "int32",
+            "minimum": 0
+          },
+          "logprobs": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/ChatCompletionLogprobs"
+              }
+            ],
+            "nullable": true
+          }
+        }
+      },
+      "ChatCompletionChunk": {
+        "type": "object",
+        "required": [
+          "id",
+          "created",
+          "model",
+          "system_fingerprint",
+          "choices"
+        ],
+        "properties": {
+          "choices": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/ChatCompletionChoice"
+            }
+          },
+          "created": {
+            "type": "integer",
+            "format": "int64",
+            "example": "1706270978",
+            "minimum": 0
+          },
+          "id": {
+            "type": "string"
+          },
+          "model": {
+            "type": "string",
+            "example": "mistralai/Mistral-7B-Instruct-v0.2"
+          },
+          "system_fingerprint": {
+            "type": "string"
+          }
+        }
+      },
+      "ChatCompletionComplete": {
+        "type": "object",
+        "required": [
+          "index",
+          "message",
+          "finish_reason"
+        ],
+        "properties": {
+          "finish_reason": {
+            "type": "string"
+          },
+          "index": {
+            "type": "integer",
+            "format": "int32",
+            "minimum": 0
+          },
+          "logprobs": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/ChatCompletionLogprobs"
+              }
+            ],
+            "nullable": true
+          },
+          "message": {
+            "$ref": "#/components/schemas/OutputMessage"
+          }
+        }
+      },
+      "ChatCompletionDelta": {
+        "oneOf": [
+          {
+            "$ref": "#/components/schemas/TextMessage"
+          },
+          {
+            "$ref": "#/components/schemas/ToolCallDelta"
+          }
+        ]
+      },
+      "ChatCompletionLogprob": {
+        "type": "object",
+        "required": [
+          "token",
+          "logprob",
+          "top_logprobs"
+        ],
+        "properties": {
+          "logprob": {
+            "type": "number",
+            "format": "float"
+          },
+          "token": {
+            "type": "string"
+          },
+          "top_logprobs": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/ChatCompletionTopLogprob"
+            }
+          }
+        }
+      },
+      "ChatCompletionLogprobs": {
+        "type": "object",
+        "required": [
+          "content"
+        ],
+        "properties": {
+          "content": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/ChatCompletionLogprob"
+            }
+          }
+        }
+      },
+      "ChatCompletionTopLogprob": {
+        "type": "object",
+        "required": [
+          "token",
+          "logprob"
+        ],
+        "properties": {
+          "logprob": {
+            "type": "number",
+            "format": "float"
+          },
+          "token": {
+            "type": "string"
+          }
+        }
+      },
+      "ChatRequest": {
+        "type": "object",
+        "required": [
+          "model",
+          "messages"
+        ],
+        "properties": {
+          "frequency_penalty": {
+            "type": "number",
+            "format": "float",
+            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
+            "example": "1.0",
+            "nullable": true
+          },
+          "logit_bias": {
+            "type": "array",
+            "items": {
+              "type": "number",
+              "format": "float"
+            },
+            "description": "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.",
+            "nullable": true
+          },
+          "logprobs": {
+            "type": "boolean",
+            "description": "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message.",
+            "example": "false",
+            "nullable": true
+          },
+          "max_tokens": {
+            "type": "integer",
+            "format": "int32",
+            "description": "The maximum number of tokens that can be generated in the chat completion.",
+            "example": "32",
+            "nullable": true,
+            "minimum": 0
+          },
+          "messages": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/Message"
+            },
+            "description": "A list of messages comprising the conversation so far.",
+            "example": "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]"
+          },
+          "model": {
+            "type": "string",
+            "description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
+            "example": "mistralai/Mistral-7B-Instruct-v0.2"
+          },
+          "n": {
+            "type": "integer",
+            "format": "int32",
+            "description": "UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs.",
+            "example": "2",
+            "nullable": true,
+            "minimum": 0
+          },
+          "presence_penalty": {
+            "type": "number",
+            "format": "float",
+            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics",
+            "example": 0.1,
+            "nullable": true
+          },
+          "response_format": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/GrammarType"
+              }
+            ],
+            "default": "null",
+            "nullable": true
+          },
+          "seed": {
+            "type": "integer",
+            "format": "int64",
+            "example": 42,
+            "nullable": true,
+            "minimum": 0
+          },
+          "stop": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "description": "Up to 4 sequences where the API will stop generating further tokens.",
+            "example": "null",
+            "nullable": true
+          },
+          "stream": {
+            "type": "boolean"
+          },
+          "temperature": {
+            "type": "number",
+            "format": "float",
+            "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.",
+            "example": 1.0,
+            "nullable": true
+          },
+          "tool_choice": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/ToolType"
+              }
+            ],
+            "nullable": true
+          },
+          "tool_prompt": {
+            "type": "string",
+            "description": "A prompt to be appended before the tools",
+            "example": "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\"",
+            "nullable": true
+          },
+          "tools": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/Tool"
+            },
+            "description": "A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of\nfunctions the model may generate JSON inputs for.",
+            "example": "null",
+            "nullable": true
+          },
+          "top_logprobs": {
+            "type": "integer",
+            "format": "int32",
+            "description": "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.",
+            "example": "5",
+            "nullable": true,
+            "minimum": 0
+          },
+          "top_p": {
+            "type": "number",
+            "format": "float",
+            "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
+            "example": 0.95,
+            "nullable": true
+          }
+        }
+      },
+      "Chunk": {
+        "type": "object",
+        "required": [
+          "id",
+          "created",
+          "choices",
+          "model",
+          "system_fingerprint"
+        ],
+        "properties": {
+          "choices": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/CompletionComplete"
+            }
+          },
+          "created": {
+            "type": "integer",
+            "format": "int64",
+            "minimum": 0
+          },
+          "id": {
+            "type": "string"
+          },
+          "model": {
+            "type": "string"
+          },
+          "system_fingerprint": {
+            "type": "string"
           }
         }
       },
@@ -409,6 +997,222 @@
           }
         }
       },
+      "Completion": {
+        "oneOf": [
+          {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/Chunk"
+              },
+              {
+                "type": "object",
+                "required": [
+                  "object"
+                ],
+                "properties": {
+                  "object": {
+                    "type": "string",
+                    "enum": [
+                      "text_completion"
+                    ]
+                  }
+                }
+              }
+            ]
+          },
+          {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/CompletionFinal"
+              },
+              {
+                "type": "object",
+                "required": [
+                  "object"
+                ],
+                "properties": {
+                  "object": {
+                    "type": "string",
+                    "enum": [
+                      "text_completion"
+                    ]
+                  }
+                }
+              }
+            ]
+          }
+        ],
+        "discriminator": {
+          "propertyName": "object"
+        }
+      },
+      "CompletionComplete": {
+        "type": "object",
+        "required": [
+          "index",
+          "text",
+          "finish_reason"
+        ],
+        "properties": {
+          "finish_reason": {
+            "type": "string"
+          },
+          "index": {
+            "type": "integer",
+            "format": "int32",
+            "minimum": 0
+          },
+          "logprobs": {
+            "type": "array",
+            "items": {
+              "type": "number",
+              "format": "float"
+            },
+            "nullable": true
+          },
+          "text": {
+            "type": "string"
+          }
+        }
+      },
+      "CompletionFinal": {
+        "type": "object",
+        "required": [
+          "id",
+          "created",
+          "model",
+          "system_fingerprint",
+          "choices",
+          "usage"
+        ],
+        "properties": {
+          "choices": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/CompletionComplete"
+            }
+          },
+          "created": {
+            "type": "integer",
+            "format": "int64",
+            "example": "1706270835",
+            "minimum": 0
+          },
+          "id": {
+            "type": "string"
+          },
+          "model": {
+            "type": "string",
+            "example": "mistralai/Mistral-7B-Instruct-v0.2"
+          },
+          "system_fingerprint": {
+            "type": "string"
+          },
+          "usage": {
+            "$ref": "#/components/schemas/Usage"
+          }
+        }
+      },
+      "CompletionRequest": {
+        "type": "object",
+        "required": [
+          "model",
+          "prompt"
+        ],
+        "properties": {
+          "frequency_penalty": {
+            "type": "number",
+            "format": "float",
+            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
+            "example": "1.0",
+            "nullable": true
+          },
+          "max_tokens": {
+            "type": "integer",
+            "format": "int32",
+            "description": "The maximum number of tokens that can be generated in the chat completion.",
+            "default": "32",
+            "nullable": true,
+            "minimum": 0
+          },
+          "model": {
+            "type": "string",
+            "description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
+            "example": "mistralai/Mistral-7B-Instruct-v0.2"
+          },
+          "prompt": {
+            "$ref": "#/components/schemas/Prompt"
+          },
+          "repetition_penalty": {
+            "type": "number",
+            "format": "float",
+            "nullable": true
+          },
+          "seed": {
+            "type": "integer",
+            "format": "int64",
+            "example": 42,
+            "nullable": true,
+            "minimum": 0
+          },
+          "stop": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "description": "Up to 4 sequences where the API will stop generating further tokens.",
+            "example": "null",
+            "nullable": true
+          },
+          "stream": {
+            "type": "boolean"
+          },
+          "suffix": {
+            "type": "string",
+            "description": "The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.\nPlease see the completion_template field in the model's tokenizer_config.json file for completion template.",
+            "nullable": true
+          },
+          "temperature": {
+            "type": "number",
+            "format": "float",
+            "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.",
+            "example": 1.0,
+            "nullable": true
+          },
+          "top_p": {
+            "type": "number",
+            "format": "float",
+            "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
+            "example": 0.95,
+            "nullable": true
+          }
+        }
+      },
+      "DeltaToolCall": {
+        "type": "object",
+        "required": [
+          "index",
+          "id",
+          "type",
+          "function"
+        ],
+        "properties": {
+          "function": {
+            "$ref": "#/components/schemas/Function"
+          },
+          "id": {
+            "type": "string"
+          },
+          "index": {
+            "type": "integer",
+            "format": "int32",
+            "minimum": 0
+          },
+          "type": {
+            "type": "string"
+          }
+        }
+      },
       "Details": {
         "type": "object",
         "required": [
@@ -432,7 +1236,7 @@
             "type": "integer",
             "format": "int32",
             "example": 1,
-            "minimum": 0.0
+            "minimum": 0
           },
           "prefill": {
             "type": "array",
@@ -445,13 +1249,22 @@
             "format": "int64",
             "example": 42,
             "nullable": true,
-            "minimum": 0.0
+            "minimum": 0
           },
           "tokens": {
             "type": "array",
             "items": {
               "$ref": "#/components/schemas/Token"
             }
+          },
+          "top_tokens": {
+            "type": "array",
+            "items": {
+              "type": "array",
+              "items": {
+                "$ref": "#/components/schemas/Token"
+              }
+            }
           }
         }
       },
@@ -476,50 +1289,115 @@
           "length",
           "eos_token",
           "stop_sequence"
-        ]
+        ],
+        "example": "Length"
+      },
+      "Function": {
+        "type": "object",
+        "required": [
+          "arguments"
+        ],
+        "properties": {
+          "arguments": {
+            "type": "string"
+          },
+          "name": {
+            "type": "string",
+            "nullable": true
+          }
+        }
+      },
+      "FunctionDefinition": {
+        "type": "object",
+        "required": [
+          "name",
+          "arguments"
+        ],
+        "properties": {
+          "arguments": {},
+          "description": {
+            "type": "string",
+            "nullable": true
+          },
+          "name": {
+            "type": "string"
+          }
+        }
       },
       "GenerateParameters": {
         "type": "object",
         "properties": {
+          "adapter_id": {
+            "type": "string",
+            "description": "Lora adapter id",
+            "default": "null",
+            "example": "null",
+            "nullable": true
+          },
           "best_of": {
             "type": "integer",
+            "description": "Generate best_of sequences and return the one with the highest token logprobs.",
             "default": "null",
             "example": 1,
             "nullable": true,
-            "minimum": 0.0,
-            "exclusiveMinimum": 0.0
+            "minimum": 0,
+            "exclusiveMinimum": 0
           },
           "decoder_input_details": {
             "type": "boolean",
-            "default": "true"
+            "description": "Whether to return decoder input token logprobs and ids.",
+            "default": "false"
           },
           "details": {
             "type": "boolean",
+            "description": "Whether to return generation details.",
             "default": "true"
           },
           "do_sample": {
             "type": "boolean",
+            "description": "Activate logits sampling.",
             "default": "false",
             "example": true
           },
+          "frequency_penalty": {
+            "type": "number",
+            "format": "float",
+            "description": "The parameter for frequency penalty. 1.0 means no penalty\nPenalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
+            "default": "null",
+            "example": 0.1,
+            "nullable": true,
+            "exclusiveMinimum": -2
+          },
+          "grammar": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/GrammarType"
+              }
+            ],
+            "default": "null",
+            "nullable": true
+          },
           "max_new_tokens": {
             "type": "integer",
             "format": "int32",
-            "default": "20",
-            "minimum": 0.0,
-            "exclusiveMaximum": 512.0,
-            "exclusiveMinimum": 0.0
+            "description": "Maximum number of tokens to generate.",
+            "default": "100",
+            "example": "20",
+            "nullable": true,
+            "minimum": 0
           },
           "repetition_penalty": {
             "type": "number",
             "format": "float",
+            "description": "The parameter for repetition penalty. 1.0 means no penalty.\nSee [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.",
             "default": "null",
             "example": 1.03,
             "nullable": true,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
           },
           "return_full_text": {
             "type": "boolean",
+            "description": "Whether to prepend the prompt to the generated text",
             "default": "null",
             "example": false,
             "nullable": true
@@ -527,17 +1405,19 @@
           "seed": {
             "type": "integer",
             "format": "int64",
+            "description": "Random sampling seed.",
             "default": "null",
             "example": "null",
             "nullable": true,
-            "minimum": 0.0,
-            "exclusiveMinimum": 0.0
+            "minimum": 0,
+            "exclusiveMinimum": 0
           },
           "stop": {
             "type": "array",
             "items": {
               "type": "string"
             },
+            "description": "Stop generating tokens if a member of `stop` is generated.",
             "example": [
               "photographer"
             ],
@@ -546,46 +1426,62 @@
           "temperature": {
             "type": "number",
             "format": "float",
+            "description": "The value used to modulate the logits distribution.",
             "default": "null",
             "example": 0.5,
             "nullable": true,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
           },
           "top_k": {
             "type": "integer",
             "format": "int32",
+            "description": "The number of highest probability vocabulary tokens to keep for top-k-filtering.",
             "default": "null",
             "example": 10,
             "nullable": true,
-            "exclusiveMinimum": 0.0
+            "exclusiveMinimum": 0
+          },
+          "top_n_tokens": {
+            "type": "integer",
+            "format": "int32",
+            "description": "The number of highest probability vocabulary tokens to keep for top-n-filtering.",
+            "default": "null",
+            "example": 5,
+            "nullable": true,
+            "minimum": 0,
+            "exclusiveMinimum": 0
           },
           "top_p": {
             "type": "number",
             "format": "float",
+            "description": "Top-p value for nucleus sampling.",
             "default": "null",
             "example": 0.95,
             "nullable": true,
-            "maximum": 1.0,
-            "exclusiveMinimum": 0.0
+            "maximum": 1,
+            "exclusiveMinimum": 0
           },
           "truncate": {
             "type": "integer",
+            "description": "Truncate input tokens to the given size.",
             "default": "null",
             "example": "null",
             "nullable": true,
-            "minimum": 0.0
+            "minimum": 0
           },
           "typical_p": {
             "type": "number",
             "format": "float",
+            "description": "Typical Decoding mass\nSee [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.",
             "default": "null",
             "example": 0.95,
             "nullable": true,
-            "maximum": 1.0,
-            "exclusiveMinimum": 0.0
+            "maximum": 1,
+            "exclusiveMinimum": 0
           },
           "watermark": {
             "type": "boolean",
+            "description": "Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).",
             "default": "false",
             "example": true
           }
@@ -626,6 +1522,49 @@
           }
         }
       },
+      "GrammarType": {
+        "oneOf": [
+          {
+            "type": "object",
+            "required": [
+              "type",
+              "value"
+            ],
+            "properties": {
+              "type": {
+                "type": "string",
+                "enum": [
+                  "json"
+                ]
+              },
+              "value": {
+                "description": "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions."
+              }
+            }
+          },
+          {
+            "type": "object",
+            "required": [
+              "type",
+              "value"
+            ],
+            "properties": {
+              "type": {
+                "type": "string",
+                "enum": [
+                  "regex"
+                ]
+              },
+              "value": {
+                "type": "string"
+              }
+            }
+          }
+        ],
+        "discriminator": {
+          "propertyName": "type"
+        }
+      },
       "Info": {
         "type": "object",
         "required": [
@@ -635,12 +1574,14 @@
           "max_concurrent_requests",
           "max_best_of",
           "max_stop_sequences",
-          "max_input_length",
+          "max_input_tokens",
           "max_total_tokens",
           "waiting_served_ratio",
           "max_batch_total_tokens",
           "max_waiting_tokens",
           "validation_workers",
+          "max_client_batch_size",
+          "router",
           "version"
         ],
         "properties": {
@@ -649,42 +1590,53 @@
             "example": "null",
             "nullable": true
           },
+          "max_batch_size": {
+            "type": "integer",
+            "example": "null",
+            "nullable": true,
+            "minimum": 0
+          },
           "max_batch_total_tokens": {
             "type": "integer",
             "format": "int32",
             "example": "32000",
-            "minimum": 0.0
+            "minimum": 0
           },
           "max_best_of": {
             "type": "integer",
             "example": "2",
-            "minimum": 0.0
+            "minimum": 0
+          },
+          "max_client_batch_size": {
+            "type": "integer",
+            "example": "32",
+            "minimum": 0
           },
           "max_concurrent_requests": {
             "type": "integer",
             "description": "Router Parameters",
             "example": "128",
-            "minimum": 0.0
+            "minimum": 0
           },
-          "max_input_length": {
+          "max_input_tokens": {
             "type": "integer",
             "example": "1024",
-            "minimum": 0.0
+            "minimum": 0
           },
           "max_stop_sequences": {
             "type": "integer",
             "example": "4",
-            "minimum": 0.0
+            "minimum": 0
           },
           "max_total_tokens": {
             "type": "integer",
             "example": "2048",
-            "minimum": 0.0
+            "minimum": 0
           },
           "max_waiting_tokens": {
             "type": "integer",
             "example": "20",
-            "minimum": 0.0
+            "minimum": 0
           },
           "model_device_type": {
             "type": "string",
@@ -709,6 +1661,11 @@
             "example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
             "nullable": true
           },
+          "router": {
+            "type": "string",
+            "description": "Router Info",
+            "example": "text-generation-router"
+          },
           "sha": {
             "type": "string",
             "example": "null",
@@ -717,11 +1674,10 @@
           "validation_workers": {
             "type": "integer",
             "example": "2",
-            "minimum": 0.0
+            "minimum": 0
           },
           "version": {
             "type": "string",
-            "description": "Router Info",
             "example": "0.5.0"
           },
           "waiting_served_ratio": {
@@ -731,6 +1687,27 @@
           }
         }
       },
+      "Message": {
+        "type": "object",
+        "required": [
+          "role",
+          "content"
+        ],
+        "properties": {
+          "content": {
+            "$ref": "#/components/schemas/MessageContent"
+          },
+          "name": {
+            "type": "string",
+            "example": "\"David\"",
+            "nullable": true
+          },
+          "role": {
+            "type": "string",
+            "example": "user"
+          }
+        }
+      },
       "PrefillToken": {
         "type": "object",
         "required": [
@@ -743,7 +1720,7 @@
             "type": "integer",
             "format": "int32",
             "example": 0,
-            "minimum": 0.0
+            "minimum": 0
           },
           "logprob": {
             "type": "number",
@@ -757,6 +1734,43 @@
           }
         }
       },
+      "Prompt": {
+        "type": "array",
+        "items": {
+          "type": "string"
+        }
+      },
+      "SimpleToken": {
+        "type": "object",
+        "required": [
+          "id",
+          "text",
+          "start",
+          "stop"
+        ],
+        "properties": {
+          "id": {
+            "type": "integer",
+            "format": "int32",
+            "example": 0,
+            "minimum": 0
+          },
+          "start": {
+            "type": "integer",
+            "example": 0,
+            "minimum": 0
+          },
+          "stop": {
+            "type": "integer",
+            "example": 2,
+            "minimum": 0
+          },
+          "text": {
+            "type": "string",
+            "example": "test"
+          }
+        }
+      },
       "StreamDetails": {
         "type": "object",
         "required": [
@@ -771,20 +1785,21 @@
             "type": "integer",
             "format": "int32",
             "example": 1,
-            "minimum": 0.0
+            "minimum": 0
           },
           "seed": {
             "type": "integer",
             "format": "int64",
             "example": 42,
             "nullable": true,
-            "minimum": 0.0
+            "minimum": 0
           }
         }
       },
       "StreamResponse": {
         "type": "object",
         "required": [
+          "index",
           "token"
         ],
         "properties": {
@@ -794,6 +1809,7 @@
                 "$ref": "#/components/schemas/StreamDetails"
               }
             ],
+            "default": "null",
             "nullable": true
           },
           "generated_text": {
@@ -802,8 +1818,19 @@
             "example": "test",
             "nullable": true
           },
+          "index": {
+            "type": "integer",
+            "format": "int32",
+            "minimum": 0
+          },
           "token": {
             "$ref": "#/components/schemas/Token"
+          },
+          "top_tokens": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/Token"
+            }
           }
         }
       },
@@ -820,7 +1847,7 @@
             "type": "integer",
             "format": "int32",
             "example": 0,
-            "minimum": 0.0
+            "minimum": 0
           },
           "logprob": {
             "type": "number",
@@ -837,6 +1864,95 @@
             "example": "test"
           }
         }
+      },
+      "TokenizeResponse": {
+        "type": "array",
+        "items": {
+          "$ref": "#/components/schemas/SimpleToken"
+        }
+      },
+      "Tool": {
+        "type": "object",
+        "required": [
+          "type",
+          "function"
+        ],
+        "properties": {
+          "function": {
+            "$ref": "#/components/schemas/FunctionDefinition"
+          },
+          "type": {
+            "type": "string",
+            "example": "function"
+          }
+        }
+      },
+      "ToolCall": {
+        "type": "object",
+        "required": [
+          "id",
+          "type",
+          "function"
+        ],
+        "properties": {
+          "function": {
+            "$ref": "#/components/schemas/FunctionDefinition"
+          },
+          "id": {
+            "type": "string"
+          },
+          "type": {
+            "type": "string"
+          }
+        }
+      },
+      "ToolType": {
+        "oneOf": [
+          {
+            "type": "object",
+            "default": null,
+            "nullable": true
+          },
+          {
+            "type": "string"
+          },
+          {
+            "type": "object",
+            "required": [
+              "function"
+            ],
+            "properties": {
+              "function": {
+                "$ref": "#/components/schemas/FunctionName"
+              }
+            }
+          }
+        ]
+      },
+      "Usage": {
+        "type": "object",
+        "required": [
+          "prompt_tokens",
+          "completion_tokens",
+          "total_tokens"
+        ],
+        "properties": {
+          "completion_tokens": {
+            "type": "integer",
+            "format": "int32",
+            "minimum": 0
+          },
+          "prompt_tokens": {
+            "type": "integer",
+            "format": "int32",
+            "minimum": 0
+          },
+          "total_tokens": {
+            "type": "integer",
+            "format": "int32",
+            "minimum": 0
+          }
+        }
       }
     }
   },
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 5ba470bd..c9b4efd9 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -3,10 +3,22 @@
     title: Text Generation Inference
   - local: quicktour
     title: Quick Tour
+  - local: installation_nvidia
+    title: Using TGI with Nvidia GPUs
+  - local: installation_amd
+    title: Using TGI with AMD GPUs
+  - local: installation_gaudi
+    title: Using TGI with Intel Gaudi
+  - local: installation_inferentia
+    title: Using TGI with AWS Inferentia
   - local: installation
-    title: Installation
+    title: Installation from source
   - local: supported_models
     title: Supported Models and Hardware
+  - local: messages_api
+    title: Messages API
+  - local: architecture
+    title: Internal Architecture
   title: Getting started
 - sections:
   - local: basic_tutorials/consuming_tgi
@@ -17,8 +29,40 @@
     title: Serving Private & Gated Models
   - local: basic_tutorials/using_cli
     title: Using TGI CLI
+  - local: basic_tutorials/launcher
+    title: All TGI CLI options
+  - local: basic_tutorials/non_core_models
+    title: Non-core Model Serving
+  - local: basic_tutorials/safety
+    title: Safety
+  - local: basic_tutorials/using_guidance
+    title: Using Guidance, JSON, tools
+  - local: basic_tutorials/visual_language_models
+    title: Visual Language Models
+  - local: basic_tutorials/monitoring
+    title: Monitoring TGI with Prometheus and Grafana
+  - local: basic_tutorials/train_medusa
+    title: Train Medusa
   title: Tutorials
 - sections:
   - local: conceptual/streaming
     title: Streaming
+  - local: conceptual/quantization
+    title: Quantization
+  - local: conceptual/tensor_parallelism
+    title: Tensor Parallelism
+  - local: conceptual/paged_attention
+    title: PagedAttention
+  - local: conceptual/safetensors
+    title: Safetensors
+  - local: conceptual/flash_attention
+    title: Flash Attention
+  - local: conceptual/speculation
+    title: Speculation (Medusa, ngram)
+  - local: conceptual/guidance
+    title: How Guidance Works (via outlines)
+  - local: conceptual/lora
+    title: LoRA (Low-Rank Adaptation)
+
+
   title: Conceptual Guides
diff --git a/docs/source/architecture.md b/docs/source/architecture.md
new file mode 100644
index 00000000..a8418817
--- /dev/null
+++ b/docs/source/architecture.md
@@ -0,0 +1,231 @@
+# Text Generation Inference Architecture
+
+This document aims at describing the architecture of Text Generation Inference (TGI), by describing the call flow between the separate components.
+
+A high-level architecture diagram can be seen here:
+
+![TGI architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png)
+
+As the diagram shows, there are three separate components:
+
+- **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.
+- **The model server**, responsible for receiving the gRPC requests and processing the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
+- **The launcher** is a helper that will be able to launch one or several model servers (if the model is sharded), and it launches the router with the compatible arguments.
+
+The router and the model server can run on two different machines; they do not need to be deployed together.
+
+## The Router
+
+This component is a Rust web server binary that accepts HTTP requests using the custom [HTTP API](https://huggingface.github.io/text-generation-inference/), as well as OpenAI's [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api).
+The router receives the API calls and handles the "batches" logic (an introduction to batching can be found [here](https://github.com/huggingface/text-generation-inference/blob/main/router/README.md)).
+It uses different strategies to reduce latency between requests and responses, especially oriented to decoding latency. It will use queues, schedulers, and block allocators to achieve that and produce batched requests that will then be sent to the model server.
+
+### Router's command line
+
+The router command line is the way to pass parameters to it (it does not rely on a configuration file):
+
+```
+Text Generation Webserver
+
+Usage: text-generation-router [OPTIONS]
+
+Options:
+      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
+          [env: MAX_CONCURRENT_REQUESTS=] [default: 128]
+      --max-best-of <MAX_BEST_OF>
+          [env: MAX_BEST_OF=] [default: 2]
+      --max-stop-sequences <MAX_STOP_SEQUENCES>
+          [env: MAX_STOP_SEQUENCES=] [default: 4]
+      --max-top-n-tokens <MAX_TOP_N_TOKENS>
+          [env: MAX_TOP_N_TOKENS=] [default: 5]
+      --max-input-tokens <MAX_INPUT_TOKENS>
+          [env: MAX_INPUT_TOKENS=] [default: 1024]
+      --max-total-tokens <MAX_TOTAL_TOKENS>
+          [env: MAX_TOTAL_TOKENS=] [default: 2048]
+      --waiting-served-ratio <WAITING_SERVED_RATIO>
+          [env: WAITING_SERVED_RATIO=] [default: 1.2]
+      --max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
+          [env: MAX_BATCH_PREFILL_TOKENS=] [default: 4096]
+      --max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
+          [env: MAX_BATCH_TOTAL_TOKENS=]
+      --max-waiting-tokens <MAX_WAITING_TOKENS>
+          [env: MAX_WAITING_TOKENS=] [default: 20]
+      --max-batch-size <MAX_BATCH_SIZE>
+          [env: MAX_BATCH_SIZE=]
+      --hostname <HOSTNAME>
+          [env: HOSTNAME=] [default: 0.0.0.0]
+  -p, --port <PORT>
+          [env: PORT=] [default: 3000]
+      --master-shard-uds-path <MASTER_SHARD_UDS_PATH>
+          [env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0]
+      --tokenizer-name <TOKENIZER_NAME>
+          [env: TOKENIZER_NAME=] [default: bigscience/bloom]
+      --tokenizer-config-path <TOKENIZER_CONFIG_PATH>
+          [env: TOKENIZER_CONFIG_PATH=]
+      --revision <REVISION>
+          [env: REVISION=]
+      --validation-workers <VALIDATION_WORKERS>
+          [env: VALIDATION_WORKERS=] [default: 2]
+      --json-output
+          [env: JSON_OUTPUT=]
+      --otlp-endpoint <OTLP_ENDPOINT>
+          [env: OTLP_ENDPOINT=]
+      --otlp-service-name <OTLP_SERVICE_NAME>
+          [env: OTLP_SERVICE_NAME=]
+      --cors-allow-origin <CORS_ALLOW_ORIGIN>
+          [env: CORS_ALLOW_ORIGIN=]
+      --ngrok
+          [env: NGROK=]
+      --ngrok-authtoken <NGROK_AUTHTOKEN>
+          [env: NGROK_AUTHTOKEN=]
+      --ngrok-edge <NGROK_EDGE>
+          [env: NGROK_EDGE=]
+      --messages-api-enabled
+          [env: MESSAGES_API_ENABLED=]
+      --disable-grammar-support
+          [env: DISABLE_GRAMMAR_SUPPORT=]
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          [env: MAX_CLIENT_BATCH_SIZE=] [default: 4]
+  -h, --help
+          Print help
+  -V, --version
+          Print version
+```
+
+## The Model Server
+
+The model server is a Python server that starts a gRPC service, loads a given model, performs sharding to provide [tensor parallelism](https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism), and stays alive while waiting for new requests.
+The model server supports models instantiated using PyTorch and optimized for inference mainly on CUDA/ROCm.
+
+### Model Server Variants
+
+Several variants of the model server exist that are actively supported by Hugging Face:
+
+- By default, the model server will attempt building [a server optimized for Nvidia GPUs with CUDA](https://huggingface.co/docs/text-generation-inference/installation_nvidia). The code for this version is hosted in the [main TGI repository](https://github.com/huggingface/text-generation-inference).
+- A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
+- The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a [forked repository](https://github.com/huggingface/tgi-gaudi), often resynchronized with the main TGI repository.
+- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
+- A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).
+
+Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.
+
+### Command Line Interface
+
+The official command line interface (CLI) for the server supports three subcommands, `download-weights`, `quantize` and `serve`:
+
+- `download-weights` will download weights from the hub and, in some variants, it will convert weights to a format that is adapted to the given implementation;
+- `quantize` will quantize a model using the `gptq` package. This feature is not available or supported on all variants;
+- `serve` will start the server that loads a model (or a model shard), receives gRPC calls from the router, performs an inference and provides a formatted response to the given request.
+
+Serve's command line parameters on the TGI repository are these:
+
+```
+ Usage: cli.py serve [OPTIONS] MODEL_ID
+
+╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────╮
+│ *    model_id      TEXT  [default: None] [required]                                                      │
+╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────╮
+│ --revision                                       TEXT                        [default: None]             │
+│ --sharded              --no-sharded                                          [default: no-sharded]       │
+│ --quantize                                       [bitsandbytes|bitsandbytes  [default: None]             │
+│                                                  -nf4|bitsandbytes-fp4|gptq                              │
+│                                                  |awq|eetq|exl2|fp8]                                     │
+│ --speculate                                      INTEGER                     [default: None]             │
+│ --dtype                                          [float16|bfloat16]          [default: None]             │
+│ --trust-remote-code    --no-trust-remote-code                                [default:                   │
+│                                                                              no-trust-remote-code]       │
+│ --uds-path                                       PATH                        [default:                   │
+│                                                                              /tmp/text-generation-serve… │
+│ --logger-level                                   TEXT                        [default: INFO]             │
+│ --json-output          --no-json-output                                      [default: no-json-output]   │
+│ --otlp-endpoint                                  TEXT                        [default: None]             │
+│ --otlp-service-name                              TEXT                        [default:                   │
+│                                                                              text-generation-inference...│
+│ --help                                                                       Show this message and exit. │
+╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
+```
+
+Note that some variants might support different parameters, and they could possibly accept more options that can be passed on using environment variables.
+
+## Call Flow
+
+Once both components are initialized, the weights are downloaded, and the model server is up and running, the router and the model server exchange data and info through gRPC calls. There are currently two supported schemas, [v2](https://github.com/huggingface/text-generation-inference/blob/main/proto/generate.proto) and [v3](https://github.com/huggingface/text-generation-inference/blob/main/proto/v3/generate.proto). These two versions are almost identical, except for:
+
+- input chunks support, for text and image data,
+- paged attention support
+
+Here's a diagram that displays the exchanges that follow the router and model server startup.
+
+```mermaid
+sequenceDiagram
+
+    Router->>Model Server: service discovery
+    Model Server-->>Router: urls for other shards
+
+    Router->>Model Server: get model info
+    Model Server-->>Router: shard info
+
+    Router->>Model Server: health check
+    Model Server-->>Router: health OK
+
+    Router->>Model Server: warmup(max_input_tokens, max_batch_prefill_tokens, max_total_tokens, max_batch_size)
+    Model Server-->>Router: warmup result
+```
+
+After these are done, the router is ready to receive generate calls from multiple clients. Here's an example.
+
+```mermaid
+sequenceDiagram
+    participant Client 1
+    participant Client 2
+    participant Client 3
+    participant Router
+    participant Model Server
+
+    Client 1->>Router: generate_stream
+    Router->>Model Server: prefill(batch1)
+    Model Server-->>Router: generations, cached_batch1, timings
+    Router-->>Client 1: token 1
+
+    Router->>Model Server: decode(cached_batch1)
+    Model Server-->>Router: generations, cached_batch1, timings
+    Router-->>Client 1: token 2
+
+    Router->>Model Server: decode(cached_batch1)
+    Model Server-->>Router: generations, cached_batch1, timings
+    Router-->>Client 1: token 3
+
+    Client 2->>Router: generate_stream
+    Router->>Model Server: prefill(batch2)
+    Note right of Model Server: This stops previous batch, that is restarted
+    Model Server-->>Router: generations, cached_batch2, timings
+    Router-->>Client 2: token 1'
+
+    Router->>Model Server: decode(cached_batch1, cached_batch2)
+    Model Server-->>Router: generations, cached_batch1, timings
+    Router-->>Client 1: token 4
+    Router-->>Client 2: token 2'
+
+    Note left of Client 1: Client 1 leaves
+    Router->>Model Server: filter_batch(cached_batch1, request_ids_to_keep=batch2)
+    Model Server-->>Router: filtered batch
+
+    Router->>Model Server: decode(cached_batch2)
+    Model Server-->>Router: generations, cached_batch2, timings
+    Router-->>Client 2: token 3'
+
+    Client 3->>Router: generate_stream
+    Note right of Model Server: This stops previous batch, that is restarted
+    Router->>Model Server: prefill(batch3)
+    Note left of Client 1: Client 3 leaves without receiving any batch
+    Router->>Model Server: clear_cache(batch3)
+    Note right of Model Server: This stops previous batch, that is restarted
+
+    Router->>Model Server: decode(cached_batch3)
+    Note right of Model Server: Last token (stopping criteria)
+    Model Server-->>Router: generations, cached_batch3, timings
+    Router-->>Client 2: token 4'
+
+
+```
diff --git a/docs/source/basic_tutorials/consuming_tgi.md b/docs/source/basic_tutorials/consuming_tgi.md
index 540f4b13..4829ec7c 100644
--- a/docs/source/basic_tutorials/consuming_tgi.md
+++ b/docs/source/basic_tutorials/consuming_tgi.md
@@ -23,7 +23,7 @@ You can simply install `huggingface-hub` package with pip.
 pip install huggingface-hub
 ```
 
-Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python. 
+Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
 
 ```python
 from huggingface_hub import InferenceClient
@@ -83,8 +83,8 @@ Gradio is a Python library that helps you build web applications for your machin
 pip install huggingface-hub gradio
 ```
 
-Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client). 
- 
+Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
+
 ```python
 import gradio as gr
 from huggingface_hub import InferenceClient
@@ -110,30 +110,30 @@ gr.ChatInterface(
 ).queue().launch()
 ```
 
-The UI looks like this 👇 
+The UI looks like this 👇
 
 <div class="flex justify-center">
-    <img 
-        class="block dark:hidden" 
+    <img
+        class="block dark:hidden"
         src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi.png"
     />
-    <img 
-        class="hidden dark:block" 
+    <img
+        class="hidden dark:block"
         src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi-dark.png"
     />
 </div>
 
-You can try the demo directly here 👇 
+You can try the demo directly here 👇
 
 <div class="block dark:hidden">
-	<iframe 
+	<iframe
         src="https://merve-gradio-tgi-2.hf.space?__theme=light"
         width="850"
         height="750"
     ></iframe>
 </div>
 <div class="hidden dark:block">
-    <iframe 
+    <iframe
         src="https://merve-gradio-tgi-2.hf.space?__theme=dark"
         width="850"
         height="750"
@@ -152,4 +152,4 @@ You can read more about how to customize a `ChatInterface` [here](https://www.gr
 
 ## API documentation
 
-You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference). 
+You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).
diff --git a/docs/source/basic_tutorials/gated_model_access.md b/docs/source/basic_tutorials/gated_model_access.md
index 08c76de2..ef3a1db7 100644
--- a/docs/source/basic_tutorials/gated_model_access.md
+++ b/docs/source/basic_tutorials/gated_model_access.md
@@ -2,13 +2,13 @@
 
 If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)
 
-If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
+If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
 
 ```
-export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
+export HF_TOKEN=<YOUR READ TOKEN>
 ```
 
-If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
+If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
 
 ```bash
 model=meta-llama/Llama-2-7b-chat-hf
@@ -17,8 +17,8 @@ token=<your READ token>
 
 docker run --gpus all \
     --shm-size 1g \
-    -e HUGGING_FACE_HUB_TOKEN=$token \
+    -e HF_TOKEN=$token \
     -p 8080:80 \
-    -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.1 \
+    -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
     --model-id $model
 ```
diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/basic_tutorials/launcher.md
new file mode 100644
index 00000000..5e40146f
--- /dev/null
+++ b/docs/source/basic_tutorials/launcher.md
@@ -0,0 +1,439 @@
+# Text-generation-launcher arguments
+
+<!-- WRAP CODE BLOCKS -->
+
+```shell
+Text Generation Launcher
+
+Usage: text-generation-launcher [OPTIONS]
+
+Options:
+```
+## MODEL_ID
+```shell
+      --model-id <MODEL_ID>
+          The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`. Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of transformers
+          
+          [env: MODEL_ID=]
+          [default: bigscience/bloom-560m]
+
+```
+## REVISION
+```shell
+      --revision <REVISION>
+          The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
+          
+          [env: REVISION=]
+
+```
+## VALIDATION_WORKERS
+```shell
+      --validation-workers <VALIDATION_WORKERS>
+          The number of tokenizer workers used for payload validation and truncation inside the router
+          
+          [env: VALIDATION_WORKERS=]
+          [default: 2]
+
+```
+## SHARDED
+```shell
+      --sharded <SHARDED>
+          Whether to shard the model across multiple GPUs. By default text-generation-inference will use all available GPUs to run the model. Setting it to `false` deactivates `num_shard`
+          
+          [env: SHARDED=]
+          [possible values: true, false]
+
+```
+## NUM_SHARD
+```shell
+      --num-shard <NUM_SHARD>
+          The number of shards to use if you don't want to use all GPUs on a given machine. You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num-shard 2` and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num-shard 2` to launch 2 copies with 2 shards each on a given machine with 4 GPUs for instance
+          
+          [env: NUM_SHARD=]
+
+```
+## QUANTIZE
+```shell
+      --quantize <QUANTIZE>
+          Whether you want the model to be quantized
+          
+          [env: QUANTIZE=]
+
+          Possible values:
+          - awq:              4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
+          - eetq:             8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
+          - exl2:             Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
+          - gptq:             4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
+          - marlin:           4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>
+          - bitsandbytes:     Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
+          - bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
+          - bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for your model
+          - fp8:              [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above. This dtype has native ops and should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations
+
+```
+## SPECULATE
+```shell
+      --speculate <SPECULATE>
+          The number of input_ids to speculate on. If using a medusa model, the heads will be picked up automatically. Otherwise, it will use n-gram speculation, which is relatively free in terms of compute, but the speedup heavily depends on the task
+          
+          [env: SPECULATE=]
+
+```
+## DTYPE
+```shell
+      --dtype <DTYPE>
+          The dtype to be forced upon the model. This option cannot be used with `--quantize`
+          
+          [env: DTYPE=]
+          [possible values: float16, bfloat16]
+
+```
+## TRUST_REMOTE_CODE
+```shell
+      --trust-remote-code
+          Whether you want to execute hub modelling code. Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision
+          
+          [env: TRUST_REMOTE_CODE=]
+
+```
+## MAX_CONCURRENT_REQUESTS
+```shell
+      --max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
+          The maximum amount of concurrent requests for this particular deployment. Having a low limit will refuse clients requests instead of having them wait for too long and is usually good to handle backpressure correctly
+          
+          [env: MAX_CONCURRENT_REQUESTS=]
+          [default: 128]
+
+```
+## MAX_BEST_OF
+```shell
+      --max-best-of <MAX_BEST_OF>
+          This is the maximum allowed value for clients to set `best_of`. Best of makes `n` generations at the same time, and return the best in terms of overall log probability over the entire generated sequence
+          
+          [env: MAX_BEST_OF=]
+          [default: 2]
+
+```
+## MAX_STOP_SEQUENCES
+```shell
+      --max-stop-sequences <MAX_STOP_SEQUENCES>
+          This is the maximum allowed value for clients to set `stop_sequences`. Stop sequences are used to allow the model to stop on more than just the EOS token, and enable more complex "prompting" where users can preprompt the model in a specific way and define their "own" stop token aligned with their prompt
+          
+          [env: MAX_STOP_SEQUENCES=]
+          [default: 4]
+
+```
+## MAX_TOP_N_TOKENS
+```shell
+      --max-top-n-tokens <MAX_TOP_N_TOKENS>
+          This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like classification or ranking
+          
+          [env: MAX_TOP_N_TOKENS=]
+          [default: 5]
+
+```
+## MAX_INPUT_TOKENS
+```shell
+      --max-input-tokens <MAX_INPUT_TOKENS>
+          This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle. Default to min(max_position_embeddings - 1, 4095)
+          
+          [env: MAX_INPUT_TOKENS=]
+
+```
+## MAX_INPUT_LENGTH
+```shell
+      --max-input-length <MAX_INPUT_LENGTH>
+          Legacy version of [`Args::max_input_tokens`]
+          
+          [env: MAX_INPUT_LENGTH=]
+
+```
+## MAX_TOTAL_TOKENS
+```shell
+      --max-total-tokens <MAX_TOTAL_TOKENS>
+          This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. With a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be. Default to min(max_position_embeddings, 4096)
+          
+          [env: MAX_TOTAL_TOKENS=]
+
+```
+## WAITING_SERVED_RATIO
+```shell
+      --waiting-served-ratio <WAITING_SERVED_RATIO>
+          This represents the ratio of waiting queries vs running queries where you want to start considering pausing the running queries to include the waiting ones into the same batch. `waiting_served_ratio=1.2` Means when 12 queries are waiting and there's only 10 queries left in the current batch we check if we can fit those 12 waiting queries into the batching strategy, and if yes, then batching happens delaying the 10 running queries by a `prefill` run.
+          
+          This setting is only applied if there is room in the batch as defined by `max_batch_total_tokens`.
+          
+          [env: WAITING_SERVED_RATIO=]
+          [default: 0.3]
+
+```
+## MAX_BATCH_PREFILL_TOKENS
+```shell
+      --max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
+          Limits the number of tokens for the prefill operation. Since this operation takes the most memory and is compute bound, it is interesting to limit the number of requests that can be sent. Default to `max_input_tokens + 50` to give a bit of room
+          
+          [env: MAX_BATCH_PREFILL_TOKENS=]
+
+```
+## MAX_BATCH_TOTAL_TOKENS
+```shell
+      --max-batch-total-tokens <MAX_BATCH_TOTAL_TOKENS>
+          **IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
+          
+          This represents the total amount of potential tokens within a batch. When using padding (not recommended) this would be equivalent of `batch_size` * `max_total_tokens`.
+          
+          However in the non-padded (flash attention) version this can be much finer.
+          
+          For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
+          
+          Overall this number should be the largest possible amount that fits the remaining memory (after the model is loaded). Since the actual memory overhead depends on other parameters like if you're using quantization, flash attention or the model implementation, text-generation-inference cannot infer this number automatically.
+          
+          [env: MAX_BATCH_TOTAL_TOKENS=]
+
+```
+## MAX_WAITING_TOKENS
+```shell
+      --max-waiting-tokens <MAX_WAITING_TOKENS>
+          This setting defines how many tokens can be passed before forcing the waiting queries to be put on the batch (if the size of the batch allows for it). New queries require 1 `prefill` forward, which is different from `decode` and therefore you need to pause the running batch in order to run `prefill` to create the correct values for the waiting queries to be able to join the batch.
+          
+          With a value too small, queries will always "steal" the compute to run `prefill` and running queries will be delayed by a lot.
+          
+          With a value too big, waiting queries could wait for a very long time before being allowed a slot in the running batch. If your server is busy that means that requests that could run in ~2s on an empty server could end up running in ~20s because the query had to wait for 18s.
+          
+          This number is expressed in number of tokens to make it a bit more "model" agnostic, but what should really matter is the overall latency for end users.
+          
+          [env: MAX_WAITING_TOKENS=]
+          [default: 20]
+
+```
+## MAX_BATCH_SIZE
+```shell
+      --max-batch-size <MAX_BATCH_SIZE>
+          Enforce a maximum number of requests per batch. Specific flag for hardware targets that do not support unpadded inference
+          
+          [env: MAX_BATCH_SIZE=]
+
+```
+## CUDA_GRAPHS
+```shell
+      --cuda-graphs <CUDA_GRAPHS>
+          Specify the batch sizes to compute cuda graphs for. Use "0" to disable. Default = "1,2,4,8,16,32"
+          
+          [env: CUDA_GRAPHS=]
+
+```
+## HOSTNAME
+```shell
+      --hostname <HOSTNAME>
+          The IP address to listen on
+          
+          [env: HOSTNAME=]
+          [default: 0.0.0.0]
+
+```
+## PORT
+```shell
+  -p, --port <PORT>
+          The port to listen on
+          
+          [env: PORT=]
+          [default: 3000]
+
+```
+## SHARD_UDS_PATH
+```shell
+      --shard-uds-path <SHARD_UDS_PATH>
+          The name of the socket for gRPC communication between the webserver and the shards
+          
+          [env: SHARD_UDS_PATH=]
+          [default: /tmp/text-generation-server]
+
+```
+## MASTER_ADDR
+```shell
+      --master-addr <MASTER_ADDR>
+          The address the master shard will listen on. (setting used by torch distributed)
+          
+          [env: MASTER_ADDR=]
+          [default: localhost]
+
+```
+## MASTER_PORT
+```shell
+      --master-port <MASTER_PORT>
+          The port the master shard will listen on. (setting used by torch distributed)
+          
+          [env: MASTER_PORT=]
+          [default: 29500]
+
+```
+## HUGGINGFACE_HUB_CACHE
+```shell
+      --huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+          
+          [env: HUGGINGFACE_HUB_CACHE=]
+
+```
+## WEIGHTS_CACHE_OVERRIDE
+```shell
+      --weights-cache-override <WEIGHTS_CACHE_OVERRIDE>
+          The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
+          
+          [env: WEIGHTS_CACHE_OVERRIDE=]
+
+```
+## DISABLE_CUSTOM_KERNELS
+```shell
+      --disable-custom-kernels
+          For some models (like bloom), text-generation-inference implemented custom cuda kernels to speed up inference. Those kernels were only tested on A100. Use this flag to disable them if you're running on different hardware and encounter issues
+          
+          [env: DISABLE_CUSTOM_KERNELS=]
+
+```
+## CUDA_MEMORY_FRACTION
+```shell
+      --cuda-memory-fraction <CUDA_MEMORY_FRACTION>
+          Limit the CUDA available memory. The allowed value equals the total visible memory multiplied by cuda-memory-fraction
+          
+          [env: CUDA_MEMORY_FRACTION=]
+          [default: 1.0]
+
+```
+## ROPE_SCALING
+```shell
+      --rope-scaling <ROPE_SCALING>
+          Rope scaling will only be used for RoPE models and allow rescaling the position rotary to accommodate for larger prompts.
+          
+          Goes together with `rope_factor`.
+          
+          `--rope-factor 2.0` gives linear scaling with a factor of 2.0; `--rope-scaling dynamic` gives dynamic scaling with a factor of 1.0; `--rope-scaling linear` gives linear scaling with a factor of 1.0 (Nothing will be changed basically)
+          
+          `--rope-scaling linear --rope-factor` fully describes the scaling you want
+          
+          [env: ROPE_SCALING=]
+          [possible values: linear, dynamic]
+
+```
+## ROPE_FACTOR
+```shell
+      --rope-factor <ROPE_FACTOR>
+          Rope scaling will only be used for RoPE models. See `rope_scaling`
+          
+          [env: ROPE_FACTOR=]
+
+```
+## JSON_OUTPUT
+```shell
+      --json-output
+          Outputs the logs in JSON format (useful for telemetry)
+          
+          [env: JSON_OUTPUT=]
+
+```
+## OTLP_ENDPOINT
+```shell
+      --otlp-endpoint <OTLP_ENDPOINT>
+          [env: OTLP_ENDPOINT=]
+
+```
+## OTLP_SERVICE_NAME
+```shell
+      --otlp-service-name <OTLP_SERVICE_NAME>
+          [env: OTLP_SERVICE_NAME=]
+          [default: text-generation-inference.router]
+
+```
+## CORS_ALLOW_ORIGIN
+```shell
+      --cors-allow-origin <CORS_ALLOW_ORIGIN>
+          [env: CORS_ALLOW_ORIGIN=]
+
+```
+## WATERMARK_GAMMA
+```shell
+      --watermark-gamma <WATERMARK_GAMMA>
+          [env: WATERMARK_GAMMA=]
+
+```
+## WATERMARK_DELTA
+```shell
+      --watermark-delta <WATERMARK_DELTA>
+          [env: WATERMARK_DELTA=]
+
+```
+## NGROK
+```shell
+      --ngrok
+          Enable ngrok tunneling
+          
+          [env: NGROK=]
+
+```
+## NGROK_AUTHTOKEN
+```shell
+      --ngrok-authtoken <NGROK_AUTHTOKEN>
+          ngrok authentication token
+          
+          [env: NGROK_AUTHTOKEN=]
+
+```
+## NGROK_EDGE
+```shell
+      --ngrok-edge <NGROK_EDGE>
+          ngrok edge
+          
+          [env: NGROK_EDGE=]
+
+```
+## TOKENIZER_CONFIG_PATH
+```shell
+      --tokenizer-config-path <TOKENIZER_CONFIG_PATH>
+          The path to the tokenizer config file. This path is used to load the tokenizer configuration which may include a `chat_template`. If not provided, the default config will be used from the model hub
+          
+          [env: TOKENIZER_CONFIG_PATH=]
+
+```
+## DISABLE_GRAMMAR_SUPPORT
+```shell
+      --disable-grammar-support
+          Disable outlines grammar constrained generation. This is a feature that allows you to generate text that follows a specific grammar
+          
+          [env: DISABLE_GRAMMAR_SUPPORT=]
+
+```
+## ENV
+```shell
+  -e, --env
+          Display a lot of information about your runtime environment
+
+```
+## MAX_CLIENT_BATCH_SIZE
+```shell
+      --max-client-batch-size <MAX_CLIENT_BATCH_SIZE>
+          Control the maximum number of inputs that a client can send in a single request
+          
+          [env: MAX_CLIENT_BATCH_SIZE=]
+          [default: 4]
+
+```
+## LORA_ADAPTERS
+```shell
+      --lora-adapters <LORA_ADAPTERS>
+          Lora Adapters a list of adapter ids i.e. `repo/adapter1,repo/adapter2` to load during startup that will be available to callers via the `adapter_id` field in a request
+          
+          [env: LORA_ADAPTERS=]
+
+```
+## HELP
+```shell
+  -h, --help
+          Print help (see a summary with '-h')
+
+```
+## VERSION
+```shell
+  -V, --version
+          Print version
+
+```
diff --git a/docs/source/basic_tutorials/monitoring.md b/docs/source/basic_tutorials/monitoring.md
new file mode 100644
index 00000000..509b0aff
--- /dev/null
+++ b/docs/source/basic_tutorials/monitoring.md
@@ -0,0 +1,75 @@
+# Monitoring TGI server with Prometheus and Grafana dashboard
+
+TGI server deployment can easily be monitored through a Grafana dashboard, consuming a Prometheus data collection. Examples of inspectable metrics are statistics on the effective batch sizes used by TGI, prefill/decode latencies, number of generated tokens, etc.
+
+In this tutorial, we look at how to set up a local Grafana dashboard to monitor TGI usage.
+
+![Grafana dashboard for TGI](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/grafana.png)
+
+## Setup on the server machine
+
+First, on your server machine, TGI needs to be launched as usual. TGI exposes [multiple](https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527) metrics that can be collected by a Prometheus monitoring server.
+
+In the rest of this tutorial, we assume that TGI was launched through Docker with `--network host`.
+
+On the server where TGI is hosted, a Prometheus server needs to be installed and launched. To do so, please follow [Prometheus installation instructions](https://prometheus.io/download/#prometheus). For example, at the time of writing on a Linux machine:
+
+```
+wget https://github.com/prometheus/prometheus/releases/download/v2.52.0/prometheus-2.52.0.linux-amd64.tar.gz
+tar -xvzf prometheus-2.52.0.linux-amd64.tar.gz
+cd prometheus
+```
+
+Prometheus needs to be configured to listen on TGI's port. To do so, in Prometheus configuration file `prometheus.yml`, one needs to edit the lines:
+```
+    static_configs:
+      - targets: ["0.0.0.0:80"]
+```
+to use the correct IP address and port.
+
+We suggest trying `curl 0.0.0.0:80/generate -X POST -d '{"inputs":"hey chatbot, how are","parameters":{"max_new_tokens":15}}' -H 'Content-Type: application/json'` on the server side to make sure you configure the correct IP and port.
+
+Once Prometheus is configured, Prometheus server can be launched on the same machine where TGI is launched:
+```
+./prometheus --config.file="prometheus.yml"
+```
+
+In this guide, Prometheus monitoring data will be consumed on a local computer. Hence, we need to forward Prometheus port (by default 9090) to the local computer. To do so, we can for example:
+* Use ssh [local port forwarding](https://www.ssh.com/academy/ssh/tunneling-example)
+* Use ngrok port tunneling
+
+For simplicity, we will use [Ngrok](https://ngrok.com/docs/) in this guide to tunnel Prometheus port from the TGI server to the outside world.
+
+For that, you should follow the steps at https://dashboard.ngrok.com/get-started/setup/linux, and once Ngrok is installed, use:
+```bash
+ngrok http http://0.0.0.0:9090
+```
+
+As a sanity check, one can make sure that Prometheus server can be accessed at the URL given by Ngrok (in the style of https://d661-4-223-164-145.ngrok-free.app) from a local machine.
+
+## Setup on the monitoring machine
+
+Monitoring is typically done on another machine than the server one. We use a Grafana dashboard to monitor TGI's server usage.
+
+Two options are available:
+* Use Grafana Cloud for a hosted dashboard solution (https://grafana.com/products/cloud/).
+* Self-host a grafana dashboard.
+
+In this tutorial, for simplicity, we will self-host the dashboard. We recommend installing Grafana Open-source edition following [the official install instructions](https://grafana.com/grafana/download?platform=linux&edition=oss), using the available Linux binaries. For example:
+
+```bash
+wget https://dl.grafana.com/oss/release/grafana-11.0.0.linux-amd64.tar.gz
+tar -zxvf grafana-11.0.0.linux-amd64.tar.gz
+cd grafana-11.0.0
+./bin/grafana-server
+```
+
+Once the Grafana server is launched, the Grafana interface is available at http://localhost:3000. One needs to log in with the `admin` username and `admin` password.
+
+Once logged in, the Prometheus data source for Grafana needs to be configured, in the option `Add your first data source`. There, a Prometheus data source needs to be added with the Ngrok address we got earlier, that exposes Prometheus port (example: https://d661-4-223-164-145.ngrok-free.app).
+
+Once Prometheus data source is configured, we can finally create our dashboard! From home, go to `Create your first dashboard` and then `Import dashboard`. There, we will use the recommended dashboard template [tgi_grafana.json](https://github.com/huggingface/text-generation-inference/blob/main/assets/tgi_grafana.json) for a dashboard ready to be used, but you may configure your own dashboard as you like.
+
+Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).
+
+Load your dashboard configuration, and your TGI dashboard should be ready to go!
diff --git a/docs/source/basic_tutorials/non_core_models.md b/docs/source/basic_tutorials/non_core_models.md
new file mode 100644
index 00000000..2badaff0
--- /dev/null
+++ b/docs/source/basic_tutorials/non_core_models.md
@@ -0,0 +1,24 @@
+# Non-core Model Serving
+
+TGI supports various LLM architectures (see full list [here](../supported_models)). If you wish to serve a model that is not one of the supported models, TGI will fall back to the `transformers` implementation of that model. This means you will be unable to use some of the features introduced by TGI, such as tensor-parallel sharding or flash attention. However, you can still get many benefits of TGI, such as continuous batching or streaming outputs.
+
+You can serve these models using the same Docker command-line invocation as with fully supported models 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
+```
+
+If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
+```
+
+Finally, if the model is not on Hugging Face Hub but on your local machine, you can pass the path to the folder that contains your model like below 👇
+
+```bash
+# Make sure your model is in the $volume directory
+docker run --shm-size 1g -p 8080:80 -v $volume:/data  ghcr.io/huggingface/text-generation-inference:latest --model-id /data/<PATH-TO-FOLDER>
+```
+
+You can refer to [transformers docs on custom models](https://huggingface.co/docs/transformers/main/en/custom_models) for more information.
diff --git a/docs/source/basic_tutorials/preparing_model.md b/docs/source/basic_tutorials/preparing_model.md
index 65a2a197..71ca5598 100644
--- a/docs/source/basic_tutorials/preparing_model.md
+++ b/docs/source/basic_tutorials/preparing_model.md
@@ -1,15 +1,15 @@
 # Preparing the Model
 
-Text Generation Inference improves the model in several aspects. 
+Text Generation Inference improves the model in several aspects.
 
 ## Quantization
 
-TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes) and [GPT-Q](https://arxiv.org/abs/2210.17323) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes` or `gptq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq).
+TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq). When using AWQ quantization, you need to point to one of the models [here](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to the [quantization guide](./../conceptual/quantization).
 
 
 ## RoPE Scaling
 
-RoPE scaling can be used to increase the sequence length of the model during the inference time without necessarily fine-tuning it. To enable RoPE scaling, simply pass `--rope-scaling`, `--max-input-length` and `--rope-factors` flags when running through CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide maximum input length for extension. 
+RoPE scaling can be used to increase the sequence length of the model during the inference time without necessarily fine-tuning it. To enable RoPE scaling, simply pass `--rope-scaling`, `--max-input-length` and `--rope-factor` flags when running through CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide maximum input length for extension.
 
 <Tip>
 
@@ -19,4 +19,4 @@ We recommend using `dynamic` RoPE scaling.
 
 ## Safetensors
 
-[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there's no `pytorch` weights, TGI will convert the weights to `safetensors` format. 
+[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there are no `safetensors` weights, TGI will convert the `pytorch` weights to `safetensors` format.
diff --git a/docs/source/basic_tutorials/safety.md b/docs/source/basic_tutorials/safety.md
new file mode 100644
index 00000000..0b865db4
--- /dev/null
+++ b/docs/source/basic_tutorials/safety.md
@@ -0,0 +1,31 @@
+# Model safety.
+
+[Pytorch uses pickle](https://pytorch.org/docs/master/generated/torch.load.html) by default, meaning that for quite a long while
+*every* model using that format has been potentially executing unintended code while purely loading the model.
+
+There is a big red warning on Python's page for pickle [link](https://docs.python.org/3/library/pickle.html) but for quite a while
+this was ignored by the community. Now that AI/ML is getting used much more ubiquitously we need to switch away from this format.
+
+HuggingFace is leading the effort here by creating a new format which contains pure data ([safetensors](https://github.com/huggingface/safetensors))
+and moving slowly but surely all the libs to make use of it by default.
+The move is intentionally slow so that breaking changes have as little impact as possible on users throughout.
+
+
+# TGI 2.0
+
+With the release of TGI 2.0, we took the opportunity of this major version increase to break backward compatibility for these pytorch
+models (since they are a huge security risk for anyone deploying them).
+
+
+From now on, TGI will not automatically convert pickle files without having `--trust-remote-code` flag or `TRUST_REMOTE_CODE=true` in the environment variables.
+This flag is already used for community defined inference code, and is therefore quite representative of the level of confidence you are giving the model providers.
+
+
+If you want to use a model that uses pickle, but you still do not want to trust the authors entirely, we recommend converting it using the space we made for that purpose.
+
+https://huggingface.co/spaces/safetensors/convert
+
+This space will create a PR on the original model, which you can use directly regardless of merge status from the original authors. Just use
+```
+docker run .... --revision refs/pr/#ID # Or use REVISION=refs/pr/#ID in the environment
+```
diff --git a/docs/source/basic_tutorials/train_medusa.md b/docs/source/basic_tutorials/train_medusa.md
new file mode 100644
index 00000000..ba2e43b7
--- /dev/null
+++ b/docs/source/basic_tutorials/train_medusa.md
@@ -0,0 +1,208 @@
+# Train Medusa
+
+This tutorial will show you how to train a Medusa model on a dataset of your choice. Please check out the [speculation documentation](../conceptual/speculation) for more information on how Medusa works and speculation in general.
+
+## What are the benefits of training a Medusa model?
+
+Training Medusa heads can greatly improve the speed of generation. Medusa adds extra "heads" to LLMs to predict multiple future tokens simultaneously. When augmenting a model with Medusa, the original model stays untouched, and only the new heads are fine-tuned during training.
+
+One of the most important things is to have a good dataset (with similar data to what will be used in production) because Medusa has a much higher hit-rate when the generation is in-domain.
+
+If you train Medusa on a dataset that is very different from the one you will use in production then the model will not be able to predict the future tokens accurately and consequently the speedup will be minimal or non-existent.
+
+## Self-distillation (Generating data for training)
+
+There are many methods for preparing data for training, but one of the easiest and most effective ways is to "self-distill" the data. This means that you can use the same model to generate the data that you will use to train the model.
+
+Essentially, you prompt the model with a similar input to what you will use in production and the model will generate the output.
+
+We'll use this output to help train the medusa heads to predict the `n+1`, `n+2`, `n+3`, etc tokens in the sequence.
+
+## Training
+
+The original implementation of Medusa is available at [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa) and we'll follow a very similar process to train the model as described on the original repository.
+
+### Getting Started
+
+There are two methods for training the model:
+
+- `torchrun` that is a wrapper around `torch.distributed.launch`
+- a forked version of `axolotl` that supports Medusa
+
+In this tutorial we'll use `torchrun` to train the model as it is the most straightforward way to train the model but similar steps can be followed to train the model using `axolotl` if you prefer.
+
+### Training with `torchrun`
+
+```bash
+mkdir medusa-training
+cd medusa-training
+
+pyenv install 3.10
+pyenv local 3.10
+
+uv venv -p 3.10
+source .venv/bin/activate
+```
+
+Now let's clone the original `Medusa` repository and install the library.
+
+```bash
+git clone https://github.com/FasterDecoding/Medusa.git
+cd Medusa
+pip install -e .
+```
+
+Next we'll need some data to train on, we can use the `ShareGPT_Vicuna_unfiltered` dataset that is available on the Hugging Face Hub.
+
+```bash
+apt install git-lfs
+git lfs install
+git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered
+```
+
+Currently our directory structure looks like this:
+
+```bash
+.
+├── assets
+├── CITATION.cff
+├── create_data.py
+├── data_generation
+├── deepspeed.json
+├── last_run_prepared
+├── LICENSE
+├── llm_judge
+├── medusa
+├── medusa_llm.egg-info
+├── mistral.json
+├── notebooks
+├── pyproject.toml
+├── README.md
+├── ROADMAP.md
+├── scripts
+├── ShareGPT_Vicuna_unfiltered
+│   ├── README.md
+│   ├── ShareGPT_2023.05.04v0_Wasteland_Edition.json
+│   └── ShareGPT_V4.3_unfiltered_cleaned_split.json
+├── simple_gradio_interface.py
+├── tiny-llama.json
+└── vicuna_7b_qlora_stage1
+```
+
+## Start Training
+
+Now let's generate the data and start training the model. This process will take a while since we are generating data from the model.
+
+First make sure you have an instance of TGI running with the model you want to use for self-distillation.
+
+```bash
+model=HuggingFaceH4/zephyr-7b-beta
+volume=/home/ubuntu/.cache/huggingface/hub/
+
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model
+```
+
+Now we can generate the data using the `create_data.py` script.
+
+```bash
+python create_data.py \
+    --input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \
+    --output-filename zephyr_self_distill.json
+```
+
+At this point our terminal should look like this:
+
+<div class="flex justify-center">
+    <img
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-large.gif"
+        width="550"
+    />
+</div>
+
+> Note: In the screenshot above we are only using the first 500 examples from the dataset to speed up the process; you should have a much larger dataset for training.
+
+Now we can finally get to the fun part and start training the model!
+
+Using `torchrun` we can easily launch the `medusa` training script with the `zephyr_self_distill.json` data file we just generated.
+
+> NOTE: If you just self-distilled you may still have the model running, make sure to stop it before starting the training in order to allow all of the resources to be used for training.
+
+```bash
+WANDB_MODE=offline torchrun --nproc_per_node=4 medusa/train/train_legacy.py \
+    --model_name_or_path HuggingFaceH4/zephyr-7b-beta \
+    --data_path zephyr_self_distill.json \
+    --bf16 True \
+    --output_dir zephyr_out \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 4 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 4 \
+    --evaluation_strategy "no" \
+    --save_strategy "no" \
+    --learning_rate 1e-3 \
+    --weight_decay 0.0 \
+    --warmup_ratio 0.1 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 2048 \
+    --lazy_preprocess True \
+    --medusa_num_heads 3 \
+    --medusa_num_layers 1 \
+    --deepspeed deepspeed.json
+```
+
+<div class="flex justify-center">
+    <img
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/medusa-train-heads-large.gif"
+        width="550"
+    />
+</div>
+
+If successful, you should see output similar to the one below:
+
+```bash
+wandb: Run history:
+wandb:                    train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
+wandb:              train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
+wandb:            train/learning_rate ▅███▇▇▆▅▅▄▃▂▂▁▁▁
+wandb:                     train/loss ██▆▄▄▃▃▂▂▃▁▁▂▁▁▁
+wandb:             train/medusa0_loss ▆▆▇▆▆▅▄▅▃▃▃▃▂▂▂▂▂▃▂▂▂▁▁▁▂▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
+wandb:             train/medusa0_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▄▄▄▃▄▃▄▄▅▅▆▅▆▆▇▅▇▇▄▇█▇▅▇█▆▇▇
+wandb:             train/medusa1_loss ▇▇█▇▇▆▅▅▃▄▃▃▃▃▃▃▃▃▃▃▂▁▂▂▂▁▁▂▁▁▇▁▁▁▂▁▁▁▁▁
+wandb:             train/medusa1_top1 ▁▁▁▁▁▁▁▁▃▂▃▃▃▄▄▃▃▂▃▃▅▅▆▄█▆▇▅▇▇▅█▇▇▅▇█▆▆▇
+wandb:             train/medusa2_loss ▃▃▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▁▁▁▂▁▁▁▁▁
+wandb:             train/medusa2_top1 ▁▁▁▂▁▁▁▁▂▂▃▃▃▄▄▃▃▂▃▃▅▆▅▄█▆▆▅▆▆▄█▇▇▄▇█▆▆▇
+wandb:               train/total_flos ▁
+wandb:               train/train_loss ▁
+wandb:            train/train_runtime ▁
+wandb: train/train_samples_per_second ▁
+wandb:   train/train_steps_per_second ▁
+wandb:
+wandb: Run summary:
+wandb:                    train/epoch 2.0
+wandb:              train/global_step 16
+wandb:            train/learning_rate 0.0
+wandb:                     train/loss 14.8906
+wandb:             train/medusa0_loss 4.25
+wandb:             train/medusa0_top1 0.28809
+wandb:             train/medusa1_loss 4.8125
+wandb:             train/medusa1_top1 0.22727
+wandb:             train/medusa2_loss 5.5
+wandb:             train/medusa2_top1 0.17293
+wandb:               train/total_flos 0.0
+wandb:               train/train_loss 23.98242
+wandb:            train/train_runtime 396.9266
+wandb: train/train_samples_per_second 2.519
+wandb:   train/train_steps_per_second 0.04
+```
+
+Last but most importantly, don't forget to push this model to the Hugging Face Hub so you can use it in your projects.
+
+```bash
+python -m medusa.hf_utils \
+    --folder zephyr_out_medusa_mlp_zephyr-7b-beta_medusa_3_lr_0.001_layers_1 \
+    --repo drbh/zephyr_medusa_demo
+```
+
+Woo, we've successfully trained a Medusa model and pushed it to the Hugging Face Hub! 🎉
diff --git a/docs/source/basic_tutorials/using_cli.md b/docs/source/basic_tutorials/using_cli.md
index 82c10e6b..64554069 100644
--- a/docs/source/basic_tutorials/using_cli.md
+++ b/docs/source/basic_tutorials/using_cli.md
@@ -1,30 +1,30 @@
 # Using TGI CLI
 
-You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](./installation#install-cli).
+You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](../installation#install-cli).
 
-`text-generation-server` lets you download the model with `download-weights` command like below 👇 
+`text-generation-server` lets you download the model with `download-weights` command like below 👇
 
 ```bash
 text-generation-server download-weights MODEL_HUB_ID
 ```
 
-You can also use it to quantize models like below 👇 
+You can also use it to quantize models like below 👇
 
 ```bash
-text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR 
+text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR
 ```
 
-You can use `text-generation-launcher` to serve models. 
+You can use `text-generation-launcher` to serve models.
 
 ```bash
 text-generation-launcher --model-id MODEL_HUB_ID --port 8080
 ```
 
-There are many options and parameters you can pass to `text-generation-launcher`. The documentation for CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running 
+There are many options and parameters you can pass to `text-generation-launcher`. The documentation for CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running
 
 ```bash
 text-generation-launcher --help
-``` 
+```
 
 You can also find it hosted in this [Swagger UI](https://huggingface.github.io/text-generation-inference/).
 
diff --git a/docs/source/basic_tutorials/using_guidance.md b/docs/source/basic_tutorials/using_guidance.md
new file mode 100644
index 00000000..d0008fdb
--- /dev/null
+++ b/docs/source/basic_tutorials/using_guidance.md
@@ -0,0 +1,359 @@
+# Guidance
+
+Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developers guide LLM responses to fit their needs.
+
+These features are available starting from version `1.4.3`. They are accessible via the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The tool support is compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
+
+_note: guidance is supported as grammar in the `/generate` endpoint and as tools in the `/chat/completions` endpoint._
+
+## How it works
+
+TGI leverages the [outlines](https://github.com/outlines-dev/outlines) library to efficiently parse and compile the grammatical structures and tools specified by users. This integration transforms the defined grammars into an intermediate representation that acts as a framework to guide and constrain content generation, ensuring that outputs adhere to the specified grammatical rules.
+
+If you are interested in the technical details on how outlines is used in TGI, you can check out the [conceptual guidance documentation](../conceptual/guidance).
+
+## Table of Contents 📚
+
+### Grammar and Constraints
+
+- [The Grammar Parameter](#the-grammar-parameter): Shape your AI's responses with precision.
+- [Constrain with Pydantic](#constrain-with-pydantic): Define a grammar using Pydantic models.
+- [JSON Schema Integration](#json-schema-integration): Fine-grained control over your requests via JSON schema.
+- [Using the client](#hugging-face-hub-python-library): Use TGI's client libraries to shape the AI's responses.
+
+### Tools and Functions
+
+- [The Tools Parameter](#the-tools-parameter): Enhance the AI's capabilities with predefined functions.
+- [Via the client](#chat-completion-with-tools): Use TGI's client libraries to interact with the Messages API and Tool functions.
+- [OpenAI integration](#openai-integration): Use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
+
+## Grammar and Constraints 🛣️
+
+### The Grammar Parameter
+
+In TGI `1.4.3`, we've introduced the grammar parameter, which allows you to specify the format of the response you want from the LLM.
+
+Using curl, you can make a request to TGI's Messages API with the grammar parameter. This is the most primitive way to interact with the API and using [Pydantic](#constrain-with-pydantic) is recommended for ease of use and readability.
+
+```json
+curl localhost:3000/generate \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+    "inputs": "I saw a puppy a cat and a raccoon during my bike ride in the park",
+    "parameters": {
+        "repetition_penalty": 1.3,
+        "grammar": {
+            "type": "json",
+            "value": {
+                "properties": {
+                    "location": {
+                        "type": "string"
+                    },
+                    "activity": {
+                        "type": "string"
+                    },
+                    "animals_seen": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 5
+                    },
+                    "animals": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "required": ["location", "activity", "animals_seen", "animals"]
+            }
+        }
+    }
+}'
+// {"generated_text":"{ \n\n\"activity\": \"biking\",\n\"animals\": [\"puppy\",\"cat\",\"raccoon\"],\n\"animals_seen\": 3,\n\"location\": \"park\"\n}"}
+
+```
+
+### Hugging Face Hub Python Library
+
+The Hugging Face Hub Python library provides a client that makes it easy to interact with the Messages API. Here's an example of how to use the client to send a request with a grammar parameter.
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://localhost:3000")
+
+schema = {
+    "properties": {
+        "location": {"title": "Location", "type": "string"},
+        "activity": {"title": "Activity", "type": "string"},
+        "animals_seen": {
+            "maximum": 5,
+            "minimum": 1,
+            "title": "Animals Seen",
+            "type": "integer",
+        },
+        "animals": {"items": {"type": "string"}, "title": "Animals", "type": "array"},
+    },
+    "required": ["location", "activity", "animals_seen", "animals"],
+    "title": "Animals",
+    "type": "object",
+}
+
+user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
+resp = client.text_generation(
+    f"convert to JSON: 'f{user_input}'. please use the following schema: {schema}",
+    max_new_tokens=100,
+    seed=42,
+    grammar={"type": "json", "value": schema},
+)
+
+print(resp)
+# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
+
+```
+
+A grammar can be defined using Pydantic models, JSON schemas, or regular expressions. The LLM will then generate a response that conforms to the specified grammar.
+
+> Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster.
+
+### Constrain with Pydantic
+
+Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way.
+
+```python
+from huggingface_hub import InferenceClient
+from pydantic import BaseModel, conint
+from typing import List
+
+
+class Animals(BaseModel):
+    location: str
+    activity: str
+    animals_seen: conint(ge=1, le=5)  # Constrained integer type
+    animals: List[str]
+
+
+client = InferenceClient("http://localhost:3000")
+
+user_input = "I saw a puppy a cat and a raccoon during my bike ride in the park"
+resp = client.text_generation(
+    f"convert to JSON: 'f{user_input}'. please use the following schema: {Animals.schema()}",
+    max_new_tokens=100,
+    seed=42,
+    grammar={"type": "json", "value": Animals.schema()},
+)
+
+print(resp)
+# { "activity": "bike ride", "animals": ["puppy", "cat", "raccoon"], "animals_seen": 3, "location": "park" }
+
+
+```
+
+A grammar can also be defined as a regular expression:
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://localhost:3000")
+
+regexp = "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)"
+
+resp = client.text_generation(
+    f"Whats Googles DNS? Please use the following regex: {regexp}",
+    seed=42,
+    grammar={
+        "type": "regex",
+        "value": regexp,
+    },
+)
+
+
+print(resp)
+# 7.1.1.1
+
+```
+
+## Tools and Functions 🛠️
+
+### The Tools Parameter
+
+In addition to the grammar parameter, we've also introduced a set of tools and functions to help you get the most out of the Messages API.
+
+Tools are a set of user-defined functions that can be used in tandem with the chat functionality to enhance the LLM's capabilities.
+
+Functions, similar to grammar, are defined as JSON schema and can be passed as part of the parameters to the Messages API.
+
+```json
+curl localhost:3000/v1/chat/completions \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+    "model": "tgi",
+    "messages": [
+        {
+            "role": "user",
+            "content": "What is the weather like in New York?"
+        }
+    ],
+    "tools": [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA"
+                        },
+                        "format": {
+                            "type": "string",
+                            "enum": ["celsius", "fahrenheit"],
+                            "description": "The temperature unit to use. Infer this from the users location."
+                        }
+                    },
+                    "required": ["location", "format"]
+                }
+            }
+        }
+    ],
+    "tool_choice": "get_current_weather"
+}'
+// {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}}
+```
+
+### Chat Completion with Tools
+
+Grammars are supported in the `/generate` endpoint, while tools are supported in the `/chat/completions` endpoint. Here's an example of how to use the client to send a request with a tool parameter.
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://localhost:3000")
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "format": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                        "description": "The temperature unit to use. Infer this from the users location.",
+                    },
+                },
+                "required": ["location", "format"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_n_day_weather_forecast",
+            "description": "Get an N-day weather forecast",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "format": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                        "description": "The temperature unit to use. Infer this from the users location.",
+                    },
+                    "num_days": {
+                        "type": "integer",
+                        "description": "The number of days to forecast",
+                    },
+                },
+                "required": ["location", "format", "num_days"],
+            },
+        },
+    },
+]
+
+chat = client.chat_completion(
+    messages=[
+        {
+            "role": "system",
+            "content": "You're a helpful assistant! Answer the users question best you can.",
+        },
+        {
+            "role": "user",
+            "content": "What is the weather like in Brooklyn, New York?",
+        },
+    ],
+    tools=tools,
+    seed=42,
+    max_tokens=100,
+)
+
+print(chat.choices[0].message.tool_calls)
+# [ChatCompletionOutputToolCall(function=ChatCompletionOutputFunctionDefinition(arguments={'format': 'fahrenheit', 'location': 'Brooklyn, New York', 'num_days': 7}, name='get_n_day_weather_forecast', description=None), id=0, type='function')]
+
+```
+
+### OpenAI integration
+
+TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
+
+However there are some minor differences in the API, for example `tool_choice="auto"` will ALWAYS choose the tool for you. This is different from OpenAI's API where `tool_choice="auto"` will choose a tool if the model thinks it's necessary.
+
+```python
+from openai import OpenAI
+
+# Initialize the client, pointing it to one of the available models
+client = OpenAI(
+    base_url="http://localhost:3000/v1",
+    api_key="_",
+)
+
+# NOTE: tools defined above and removed for brevity
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {
+            "role": "system",
+            "content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
+        },
+        {
+            "role": "user",
+            "content": "What's the weather like the next 3 days in San Francisco, CA?",
+        },
+    ],
+    tools=tools,
+    tool_choice="auto",  # tool selected by model
+    max_tokens=500,
+)
+
+
+called = chat_completion.choices[0].message.tool_calls
+print(called)
+# {
+#     "id": 0,
+#     "type": "function",
+#     "function": {
+#         "description": None,
+#         "name": "tools",
+#         "parameters": {
+#             "format": "celsius",
+#             "location": "San Francisco, CA",
+#             "num_days": 3,
+#         },
+#     },
+# }
+```
diff --git a/docs/source/basic_tutorials/visual_language_models.md b/docs/source/basic_tutorials/visual_language_models.md
new file mode 100644
index 00000000..3770db0b
--- /dev/null
+++ b/docs/source/basic_tutorials/visual_language_models.md
@@ -0,0 +1,230 @@
+# Vision Language Model Inference in TGI
+
+Visual Language Models (VLMs) are models that consume both image and text inputs to generate text.
+
+VLMs are trained on a combination of image and text data and can handle a wide range of tasks, such as image captioning, visual question answering, and visual dialog.
+
+> What distinguishes VLMs from other text and image models is their ability to handle long context and generate text that is coherent and relevant to the image even after multiple turns or in some cases, multiple images.
+
+Below are a couple of common use cases for vision language models:
+
+- **Image Captioning**: Given an image, generate a caption that describes the image.
+- **Visual Question Answering (VQA)**: Given an image and a question about the image, generate an answer to the question.
+- **Multimodal Dialog**: Generate responses to multiple turns of images and conversations.
+- **Image Information Retrieval**: Given an image, retrieve information from the image.
+
+## How to Use a Vision Language Model?
+
+### Hugging Face Hub Python Library
+
+To infer with vision language models through Python, you can use the [`huggingface_hub`](https://pypi.org/project/huggingface-hub/) library. The `InferenceClient` class provides a simple way to interact with the [Inference API](https://huggingface.co/docs/api-inference/index). Images can be passed as URLs or base64-encoded strings. The `InferenceClient` will automatically detect the image format.
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://127.0.0.1:3000")
+image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
+prompt = f"![]({image})What is this a picture of?\n\n"
+for token in client.text_generation(prompt, max_new_tokens=16, stream=True):
+    print(token)
+
+# This is a picture of an anthropomorphic rabbit in a space suit.
+```
+
+```python
+from huggingface_hub import InferenceClient
+import base64
+import requests
+import io
+
+client = InferenceClient("http://127.0.0.1:3000")
+
+# read image from local file
+image_path = "rabbit.png"
+with open(image_path, "rb") as f:
+    image = base64.b64encode(f.read()).decode("utf-8")
+
+image = f"data:image/png;base64,{image}"
+prompt = f"![]({image})What is this a picture of?\n\n"
+
+for token in client.text_generation(prompt, max_new_tokens=10, stream=True):
+    print(token)
+
+# This is a picture of an anthropomorphic rabbit in a space suit.
+```
+
+or via the `chat_completion` endpoint:
+
+```python
+from huggingface_hub import InferenceClient
+
+client = InferenceClient("http://127.0.0.1:3000")
+
+chat = client.chat_completion(
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Whats in this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
+                    },
+                },
+            ],
+        },
+    ],
+    seed=42,
+    max_tokens=100,
+)
+
+print(chat)
+# ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='length', index=0, message=ChatCompletionOutputMessage(role='assistant', content=" The image you've provided features an anthropomorphic rabbit in spacesuit attire. This rabbit is depicted with human-like posture and movement, standing on a rocky terrain with a vast, reddish-brown landscape in the background. The spacesuit is detailed with mission patches, circuitry, and a helmet that covers the rabbit's face and ear, with an illuminated red light on the chest area.\n\nThe artwork style is that of a", name=None, tool_calls=None), logprobs=None)], created=1714589614, id='', model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=2943, total_tokens=3043))
+
+```
+
+or with OpenAI's library:
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(base_url="http://localhost:3000/v1", api_key="-")
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Whats in this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
+                    },
+                },
+            ],
+        },
+    ],
+    stream=False,
+)
+
+print(chat_completion)
+# ChatCompletion(id='', choices=[Choice(finish_reason='eos_token', index=0, logprobs=None, message=ChatCompletionMessage(content=' The image depicts an anthropomorphic rabbit dressed in a space suit with gear that resembles NASA attire. The setting appears to be a solar eclipse with dramatic mountain peaks and a partial celestial body in the sky. The artwork is detailed and vivid, with a warm color palette and a sense of an adventurous bunny exploring or preparing for a journey beyond Earth. ', role='assistant', function_call=None, tool_calls=None))], created=1714589732, model='llava-hf/llava-v1.6-mistral-7b-hf', object='text_completion', system_fingerprint='2.0.2-native', usage=CompletionUsage(completion_tokens=84, prompt_tokens=2943, total_tokens=3027))
+```
+
+### Inference Through Sending `cURL` Requests
+
+To use the `generate_stream` endpoint with curl, you can add the `-N` flag. This flag disables curl's default buffering and shows data as it arrives from the server.
+
+```bash
+curl -N 127.0.0.1:3000/generate_stream \
+    -X POST \
+    -d '{"inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n","parameters":{"max_new_tokens":16, "seed": 42}}' \
+    -H 'Content-Type: application/json'
+
+# ...
+# data:{"index":16,"token":{"id":28723,"text":".","logprob":-0.6196289,"special":false},"generated_text":"This is a picture of an anthropomorphic rabbit in a space suit.","details":null}
+```
+
+### Inference Through JavaScript
+
+First, we need to install the `@huggingface/inference` library.
+
+```bash
+npm install @huggingface/inference
+```
+
+If you're using the free Inference API, you can use [Huggingface.js](https://huggingface.co/docs/huggingface.js/inference/README)'s `HfInference`. If you're using inference endpoints, you can use `HfInferenceEndpoint` class to easily interact with the Inference API.
+
+We can create a `HfInferenceEndpoint` providing our endpoint URL and [Hugging Face access token](https://huggingface.co/settings/tokens).
+
+```js
+import { HfInferenceEndpoint } from "@huggingface/inference";
+
+const hf = new HfInferenceEndpoint("http://127.0.0.1:3000", "HF_TOKEN");
+
+const prompt =
+  "![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n";
+
+const stream = hf.textGenerationStream({
+  inputs: prompt,
+  parameters: { max_new_tokens: 16, seed: 42 },
+});
+for await (const r of stream) {
+  // yield the generated token
+  process.stdout.write(r.token.text);
+}
+
+// This is a picture of an anthropomorphic rabbit in a space suit.
+```
+
+## Combining Vision Language Models with Other Features
+
+VLMs in TGI have several advantages. For example, these models can be used in tandem with other features for more complex tasks: you can use VLMs with [Guided Generation](../conceptual/guidance) to generate specific JSON data from an image.
+
+<div class="flex justify-center">
+    <img
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"
+        width="400"
+    />
+</div>
+
+For example we can extract information from the rabbit image and generate a JSON object with the location, activity, number of animals seen, and the animals seen. That would look like this:
+
+```json
+{
+  "activity": "Standing",
+  "animals": ["Rabbit"],
+  "animals_seen": 1,
+  "location": "Rocky surface with mountains in the background and a red light on the rabbit's chest"
+}
+```
+
+All we need to do is provide a JSON schema to the VLM model and it will generate the JSON object for us.
+
+```bash
+curl localhost:3000/generate \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+    "inputs":"![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)What is this a picture of?\n\n",
+    "parameters": {
+        "max_new_tokens": 100,
+        "seed": 42,
+        "grammar": {
+            "type": "json",
+            "value": {
+                "properties": {
+                    "location": {
+                        "type": "string"
+                    },
+                    "activity": {
+                        "type": "string"
+                    },
+                    "animals_seen": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 5
+                    },
+                    "animals": {
+                        "type": "array",
+                        "items": {
+                            "type": "string"
+                        }
+                    }
+                },
+                "required": ["location", "activity", "animals_seen", "animals"]
+            }
+        }
+    }
+}'
+
+# {
+#   "generated_text": "{ \"activity\": \"Standing\", \"animals\": [ \"Rabbit\" ], \"animals_seen\": 1, \"location\": \"Rocky surface with mountains in the background and a red light on the rabbit's chest\" }"
+# }
+```
+
+Want to learn more about how Vision Language Models work? Check out the [awesome blog post on the topic](https://huggingface.co/blog/vlms).
diff --git a/docs/source/conceptual/flash_attention.md b/docs/source/conceptual/flash_attention.md
new file mode 100644
index 00000000..6b13cd13
--- /dev/null
+++ b/docs/source/conceptual/flash_attention.md
@@ -0,0 +1,11 @@
+# Flash Attention
+
+Scaling the transformer architecture is heavily bottlenecked by the self-attention mechanism, which has quadratic time and memory complexity. Recent developments in accelerator hardware mainly focus on enhancing compute capacities and not memory and transferring data between hardware. This results in attention operation having a memory bottleneck. **Flash Attention** is an attention algorithm used to reduce this problem and scale transformer-based models more efficiently, enabling faster training and inference.
+
+Standard attention mechanism uses High Bandwidth Memory (HBM) to store, read and write keys, queries and values. HBM is large in memory, but slow in processing, meanwhile SRAM is smaller in memory, but faster in operations. In the standard attention implementation, the cost of loading and writing keys, queries, and values from HBM is high. It loads keys, queries, and values from HBM to GPU on-chip SRAM, performs a single step of the attention mechanism, writes it back to HBM, and repeats this for every single attention step. Instead, Flash Attention loads keys, queries, and values once, fuses the operations of the attention mechanism, and writes them back.
+
+![Flash Attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png)
+
+It is implemented for supported models. You can check out the complete list of models that support Flash Attention [here](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models), for models with flash prefix.
+
+You can learn more about Flash Attention by reading the paper in this [link](https://arxiv.org/abs/2205.14135).
diff --git a/docs/source/conceptual/guidance.md b/docs/source/conceptual/guidance.md
new file mode 100644
index 00000000..3059e3de
--- /dev/null
+++ b/docs/source/conceptual/guidance.md
@@ -0,0 +1,86 @@
+# Guidance
+
+## What is Guidance?
+
+Guidance is a feature that allows users to constrain the generation of a large language model with a specified grammar. This feature is particularly useful when you want to generate text that follows a specific structure or uses a specific set of words or produce output in a specific format. A prominent example is JSON grammar, where the model is forced to output valid JSON.
+
+## How is it used?
+
+Guidance can be implemented in many ways and the community is always finding new ways to use it. Here are some examples of how you can use guidance:
+
+Technically, guidance can be used to generate:
+
+- a specific JSON object
+- a function signature
+- typed output like a list of integers
+
+However these use cases can span a wide range of applications, such as:
+
+- extracting structured data from unstructured text
+- summarizing text into a specific format
+- limit output to specific classes of words (act as a LLM powered classifier)
+- generate the input to specific APIs or services
+- provide reliable and consistent output for downstream tasks
+- extract data from multimodal inputs
+
+## How it works?
+
+Diving into the details, guidance is enabled by including a grammar with a generation request that is compiled, and used to modify the chosen tokens.
+
+This process can be broken down into the following steps:
+
+1. A request is sent to the backend, where it is processed and placed in a batch. Processing includes compiling the grammar into a finite state machine and a grammar state.
+
+<div class="flex justify-center">
+    <img
+        class="block dark:hidden"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/request-to-batch.gif"
+    />
+    <img
+        class="hidden dark:block"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/request-to-batch-dark.gif"
+    />
+</div>
+
+2. The model does a forward pass over the batch. This returns probabilities for each token in the vocabulary for each request in the batch.
+
+3. The process of choosing one of those tokens is called `sampling`. The model samples from the distribution of probabilities to choose the next token. In TGI all of the steps before sampling are called `processor`. Grammars are applied as a processor that masks out tokens that are not allowed by the grammar.
+
+<div class="flex justify-center">
+    <img
+        class="block dark:hidden"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/logit-grammar-mask.gif"
+    />
+    <img
+        class="hidden dark:block"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/logit-grammar-mask-dark.gif"
+    />
+</div>
+
+4. The grammar mask is applied and the model samples from the remaining tokens. Once a token is chosen, we update the grammar state with the new token, to prepare it for the next pass.
+
+<div class="flex justify-center">
+    <img
+        class="block dark:hidden"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/sample-logits.gif"
+    />
+    <img
+        class="hidden dark:block"
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/sample-logits-dark.gif"
+    />
+</div>
+
+## How to use Guidance?
+
+There are two main ways to use guidance; you can either use the `/generate` endpoint with a grammar or use the `/v1/chat/completions` endpoint with tools.
+
+Under the hood tools are a special case of grammars that allows the model to choose one or none of the provided tools.
+
+Please refer to [using guidance](../basic_tutorials/using_guidance) for more examples and details on how to use guidance in Python, JavaScript, and cURL.
+
+### Getting the most out of guidance
+
+Depending on how you are using guidance, you may want to make use of different features. Here are some tips to get the most out of guidance:
+
+- If you are using the `/generate` endpoint with a `grammar`, it is recommended to include the grammar in the prompt prefixed by something like `Please use the following JSON schema to generate the output:`. This will help the model understand the context of the grammar and generate the output accordingly.
+- If you are getting a response with many repeated tokens, please use the `frequency_penalty` or `repetition_penalty` to reduce the number of repeated tokens in the output.
diff --git a/docs/source/conceptual/lora.md b/docs/source/conceptual/lora.md
new file mode 100644
index 00000000..08df767c
--- /dev/null
+++ b/docs/source/conceptual/lora.md
@@ -0,0 +1,65 @@
+# LoRA (Low-Rank Adaptation)
+
+## What is LoRA?
+
+LoRA is a technique that allows for efficient fine-tuning of a model while only updating a small portion of the model's weights. This is useful when you have a large model that has been pre-trained on a large dataset, but you want to fine-tune it on a smaller dataset or for a specific task.
+
+LoRA works by adding a small number of additional weights to the model, which are used to adapt the model to the new dataset or task. These additional weights are learned during the fine-tuning process, while the rest of the model's weights are kept fixed.
+
+## How is it used?
+
+LoRA can be used in many ways and the community is always finding new ways to use it. Here are some examples of how you can use LoRA:
+
+Technically, LoRA can be used to fine-tune a large language model on a small dataset. However, these use cases can span a wide range of applications, such as:
+
+- fine-tuning a language model on a small dataset
+- fine-tuning a language model on a domain-specific dataset
+- fine-tuning a language model on a dataset with limited labels
+
+## Optimizing Inference with LoRA
+
+LoRA adapters can be used during inference by multiplying the adapter weights with the model weights at each specified layer. This process can be computationally expensive, but due to awesome work by [punica-ai](https://github.com/punica-ai/punica) and the [lorax](https://github.com/predibase/lorax) team, optimized kernels and frameworks have been developed to make this process more efficient. TGI leverages these optimizations in order to provide fast and efficient inference with multiple LoRA models.
+
+## Serving multiple LoRA adapters with TGI
+
+Once a LoRA model has been trained, it can be used to generate text or perform other tasks just like a regular language model. However, because the model has been fine-tuned on a specific dataset, it may perform better on that dataset than a model that has not been fine-tuned.
+
+In practice, it's often useful to have multiple LoRA models, each fine-tuned on a different dataset or for a different task. This allows you to use the model that is best suited for a particular task or dataset.
+
+Text Generation Inference (TGI) now supports loading multiple LoRA models at startup that can be used in generation requests. This feature is available starting from version `~2.0.6` and is compatible with LoRA models trained using the `peft` library.
+
+### Specifying LoRA models
+
+To use LoRA in TGI, when starting the server, you can specify the list of LoRA models to load using the `LORA_ADAPTERS` environment variable. For example:
+
+```bash
+LORA_ADAPTERS=predibase/customer_support,predibase/dbpedia
+```
+
+In the server logs, you will see the following message:
+
+```txt
+Loading adapter weights into model: predibase/customer_support
+Loading adapter weights into model: predibase/dbpedia
+```
+
+## Generate text
+
+You can then use these models in generation requests by specifying the `adapter_id` parameter in the request payload. For example:
+
+```bash
+curl 127.0.0.1:3000/generate \
+    -X POST \
+    -H 'Content-Type: application/json' \
+    -d '{
+  "inputs": "Hello who are you?",
+  "parameters": {
+    "max_new_tokens": 40,
+    "adapter_id": "predibase/customer_support"
+  }
+}'
+```
+
+> **Note:** The LoRA feature is new and still being improved. If you encounter any issues or have any feedback, please let us know by opening an issue on the [GitHub repository](https://github.com/huggingface/text-generation-inference/issues/new/choose). Additionally, documentation and an improved client library will be published soon.
+
+An updated tutorial with detailed examples will be published soon. Stay tuned!
diff --git a/docs/source/conceptual/paged_attention.md b/docs/source/conceptual/paged_attention.md
new file mode 100644
index 00000000..3fb2dcd8
--- /dev/null
+++ b/docs/source/conceptual/paged_attention.md
@@ -0,0 +1,9 @@
+# PagedAttention
+
+LLMs struggle with memory limitations during generation. In the decoding part of generation, all the attention keys and values generated for previous tokens are stored in GPU memory for reuse. This is called _KV cache_, and it may take up a large amount of memory for large models and long sequences.
+
+PagedAttention attempts to optimize memory use by partitioning the KV cache into blocks that are accessed through a lookup table. Thus, the KV cache does not need to be stored in contiguous memory, and blocks are allocated as needed. The memory efficiency can increase GPU utilization on memory-bound workloads, so more inference batches can be supported.
+
+The use of a lookup table to access the memory blocks can also help with KV sharing across multiple generations. This is helpful for techniques such as _parallel sampling_, where multiple outputs are generated simultaneously for the same prompt. In this case, the cached KV blocks can be shared among the generations.
+
+TGI's PagedAttention implementation leverages the custom cuda kernels developed by the [vLLM Project](https://github.com/vllm-project/vllm). You can learn more about this technique in the [project's page](https://vllm.ai/).
diff --git a/docs/source/conceptual/quantization.md b/docs/source/conceptual/quantization.md
new file mode 100644
index 00000000..8f26fdba
--- /dev/null
+++ b/docs/source/conceptual/quantization.md
@@ -0,0 +1,59 @@
+# Quantization
+
+TGI offers GPTQ and bits-and-bytes quantization to quantize large language models.
+
+## Quantization with GPTQ
+
+GPTQ is a post-training quantization method to make the model smaller. It quantizes the layers by finding a compressed version of that weight, that will yield a minimum mean squared error like below 👇
+
+Given a layer \\(l\\) with weight matrix \\(W_{l}\\) and layer input \\(X_{l}\\), find quantized weight \\(\hat{W}_{l}\\):
+
+$${\hat{W}_{l}}^{*} = \arg\min_{\hat{W}_{l}} ||W_{l}X_{l}-\hat{W}_{l}X_{l}||^{2}_{2}$$
+
+
+TGI allows you to either run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using the quantization script. You can run a quantized model by simply passing `--quantize` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize gptq
+```
+
+Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
+
+To quantize a given model using GPTQ with a calibration dataset, simply run
+
+```bash
+text-generation-server quantize tiiuae/falcon-40b /data/falcon-40b-gptq
+# Add --upload-to-model-id MYUSERNAME/falcon-40b to push the created model to the hub directly
+```
+
+This will create a new directory with the quantized files which you can use with,
+
+```bash
+text-generation-launcher --model-id /data/falcon-40b-gptq/ --sharded true --num-shard 2 --quantize gptq
+```
+
+You can learn more about the quantization options by running `text-generation-server quantize --help`.
+
+If you wish to do more with GPTQ models (e.g. train an adapter on top), you can read about transformers GPTQ integration [here](https://huggingface.co/blog/gptq-integration).
+You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.pdf).
+
+## Quantization with bitsandbytes
+
+bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing – weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
+
+8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
+In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes
+```
+
+4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
+
+In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
+
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes-nf4
+```
+
+You can get more information about 8-bit quantization by reading this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), and 4-bit quantization by reading [this blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
diff --git a/docs/source/conceptual/safetensors.md b/docs/source/conceptual/safetensors.md
new file mode 100644
index 00000000..8ede20fe
--- /dev/null
+++ b/docs/source/conceptual/safetensors.md
@@ -0,0 +1,7 @@
+# Safetensors
+
+Safetensors is a model serialization format for deep learning models. It is [faster](https://huggingface.co/docs/safetensors/speed) and safer compared to other serialization formats like pickle (which is used under the hood in many deep learning libraries).
+
+TGI depends on safetensors format mainly to enable [tensor parallelism sharding](./tensor_parallelism). For a given model repository during serving, TGI looks for safetensors weights. If there are no safetensors weights, TGI converts the PyTorch weights to safetensors format.
+
+You can learn more about safetensors by reading the [safetensors documentation](https://huggingface.co/docs/safetensors/index).
diff --git a/docs/source/conceptual/speculation.md b/docs/source/conceptual/speculation.md
new file mode 100644
index 00000000..45618ae3
--- /dev/null
+++ b/docs/source/conceptual/speculation.md
@@ -0,0 +1,49 @@
+## Speculation
+
+
+Speculative decoding, assisted generation, Medusa, and others are a few different names for the same idea.
+The idea is to generate tokens *before* the large model actually runs, and only *check* if those tokens were valid.
+
+So you are making *more* computations on your LLM, but if you are correct you produce 1, 2, 3, etc. tokens on a single LLM pass. Since LLMs are usually memory bound (and not compute bound), provided your guesses are correct enough, this is a 2-3x faster inference (it can be much more for code-oriented tasks, for instance).
+
+You can check a more [detailed explanation](https://huggingface.co/blog/assisted-generation).
+
+Text Generation Inference supports 2 main speculative methods:
+
+- Medusa
+- N-gram
+
+
+### Medusa
+
+
+Medusa is a [simple method](https://arxiv.org/abs/2401.10774) to create many tokens in a single pass using fine-tuned LM heads in addition to your existing models.
+
+
+You can check a few existing  fine-tunes for popular models:
+
+- [text-generation-inference/gemma-7b-it-medusa](https://huggingface.co/text-generation-inference/gemma-7b-it-medusa)
+- [text-generation-inference/Mixtral-8x7B-Instruct-v0.1-medusa](https://huggingface.co/text-generation-inference/Mixtral-8x7B-Instruct-v0.1-medusa)
+- [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa)
+
+
+In order to create your own medusa heads for your own finetune, you should check out the original medusa repo. [../basic_tutorials/train_medusa.md](../basic_tutorials/train_medusa.md)
+
+
+In order to use medusa models in TGI, simply point to a medusa enabled model, and everything will load automatically.
+
+
+### N-gram
+
+
+If you don't have a medusa model, or don't have the resources to fine-tune, you can try to use `n-gram`.
+N-gram works by trying to find matching tokens in the previous sequence, and use those as speculation for generating new tokens. For example, if the tokens "np.mean" appear multiple times in the sequence, the model can speculate that the next continuation of the tokens "np." is probably also "mean".
+
+This is an extremely simple method, which works best for code, or highly repetitive text. This might not be beneficial, if the speculation misses too much.
+
+
+In order to enable n-gram speculation simply use
+
+`--speculate 2` in your flags.
+
+[Details about the flag](https://huggingface.co/docs/text-generation-inference/basic_tutorials/launcher#speculate)
diff --git a/docs/source/conceptual/streaming.md b/docs/source/conceptual/streaming.md
index c20d76e0..71ec9b25 100644
--- a/docs/source/conceptual/streaming.md
+++ b/docs/source/conceptual/streaming.md
@@ -5,17 +5,17 @@
 Token streaming is the mode in which the server returns the tokens one by one as the model generates them. This enables showing progressive generations to the user rather than waiting for the whole generation. Streaming is an essential aspect of the end-user experience as it reduces latency, one of the most critical aspects of a smooth experience.
 
 <div class="flex justify-center">
-    <img 
-        class="block dark:hidden" 
+    <img
+        class="block dark:hidden"
         src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual_360.gif"
     />
-    <img 
-        class="hidden dark:block" 
+    <img
+        class="hidden dark:block"
         src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual-dark_360.gif"
     />
 </div>
 
-With token streaming, the server can start returning the tokens one by one before having to generate the whole response. Users can have a sense of the generation's quality earlier than the end of the generation. This has different positive effects:
+With token streaming, the server can start returning the tokens one by one before having to generate the whole response. Users can have a sense of the generation's quality before the end of the generation. This has different positive effects:
 
 * Users can get results orders of magnitude earlier for extremely long queries.
 * Seeing something in progress allows users to stop the generation if it's not going in the direction they expect.
@@ -25,14 +25,14 @@ With token streaming, the server can start returning the tokens one by one befor
 For example, a system can generate 100 tokens per second. If the system generates 1000 tokens, with the non-streaming setup, users need to wait 10 seconds to get results. On the other hand, with the streaming setup, users get initial results immediately, and although end-to-end latency will be the same, they can see half of the generation after five seconds. Below you can see an interactive demo that shows non-streaming vs streaming side-by-side. Click **generate** below.
 
 <div class="block dark:hidden">
-	<iframe 
+	<iframe
         src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
         width="850"
         height="350"
     ></iframe>
 </div>
 <div class="hidden dark:block">
-    <iframe 
+    <iframe
         src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
         width="850"
         height="350"
@@ -43,7 +43,7 @@ For example, a system can generate 100 tokens per second. If the system generate
 
 ### Streaming with Python
 
-To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate over the response. 
+To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate over the response.
 
 ```python
 from huggingface_hub import InferenceClient
@@ -116,20 +116,20 @@ curl -N 127.0.0.1:8080/generate_stream \
 First, we need to install the `@huggingface/inference` library.
 `npm install @huggingface/inference`
 
-If you're using the free Inference API, you can use `HfInference`. If you're using inference endpoints, you can use `HfInferenceEndpoint`. Let's 
+If you're using the free Inference API, you can use `HfInference`. If you're using inference endpoints, you can use `HfInferenceEndpoint`.
 
 We can create a `HfInferenceEndpoint` providing our endpoint URL and credential.
 
 ```js
-import { HfInference } from '@huggingface/inference'
+import { HfInferenceEndpoint } from '@huggingface/inference'
 
-const hf = new HfInference('https://YOUR_ENDPOINT.endpoints.huggingface.cloud', 'hf_YOUR_TOKEN')
+const hf = new HfInferenceEndpoint('https://YOUR_ENDPOINT.endpoints.huggingface.cloud', 'hf_YOUR_TOKEN')
 
 // prompt
 const prompt = 'What can you do in Nuremberg, Germany? Give me 3 Tips'
 
 const stream = hf.textGenerationStream({ inputs: prompt })
-for await (const r of stream) { 
+for await (const r of stream) {
   // yield the generated token
   process.stdout.write(r.token.text)
 }
@@ -143,6 +143,4 @@ SSEs are different than:
 * Polling: where the client keeps calling the server to get data. This means that the server might return empty responses and cause overhead.
 * Webhooks: where there is a bi-directional connection. The server can send information to the client, but the client can also send data to the server after the first request. Webhooks are more complex to operate as they don’t only use HTTP.
 
-One of the limitations of Server-Sent Events is that they limit how many concurrent requests can handle by the server. Instead of timing out when there are too many SSE connections, TGI returns a HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g. it could display a busy error to the user or it could retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing to handle backpressure.
-
-One of the limitations of Server-Sent Events is that they limit how many concurrent requests can handle by the server. Instead of timing out when there are too many SSE connections, TGI returns an HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g., it could display a busy error to the user or retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing clients to handle backpressure.
+If there are too many requests at the same time, TGI returns an HTTP Error with an `overloaded` error type (`huggingface_hub` returns `OverloadedError`). This allows the client to manage the overloaded server (e.g., it could display a busy error to the user or retry with a new request). To configure the maximum number of concurrent requests, you can specify `--max_concurrent_requests`, allowing clients to handle backpressure.
diff --git a/docs/source/conceptual/tensor_parallelism.md b/docs/source/conceptual/tensor_parallelism.md
new file mode 100644
index 00000000..2c241c41
--- /dev/null
+++ b/docs/source/conceptual/tensor_parallelism.md
@@ -0,0 +1,14 @@
+# Tensor Parallelism
+
+Tensor parallelism is a technique used to fit a large model in multiple GPUs. For example, when multiplying the input tensors with the first weight tensor, the matrix multiplication is equivalent to splitting the weight tensor column-wise, multiplying each column with the input separately, and then concatenating the separate outputs. These outputs are then transferred from the GPUs and concatenated together to get the final result, like below 👇
+
+![Image courtesy of Anton Lozkhov](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/TP.png)
+
+
+<Tip warning={true}>
+
+Tensor Parallelism only works for [models officially supported](../supported_models), it will not work when falling back to `transformers`. You can get more information about unsupported models [here](../basic_tutorials/non_core_models).
+
+</Tip>
+
+You can learn a lot more details about tensor-parallelism from [the `transformers` docs](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism).
diff --git a/docs/source/index.md b/docs/source/index.md
index 097217ad..309442b1 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -18,7 +18,8 @@ Text Generation Inference implements many optimizations and features, such as:
 - Logits warper (temperature scaling, top-p, top-k, repetition penalty)
 - Stop sequences
 - Log probabilities
-
+- Fine-tuning Support: Utilize fine-tuned models for specific tasks to achieve higher accuracy and performance.
+- [Guidance](./conceptual/guidance): Enable function calling and tool-use by forcing the model to generate structured outputs based on your own predefined output schemas.
 
 Text Generation Inference is used in production by multiple projects, such as:
 
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 1301b930..b6c24d55 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,10 +1,14 @@
-# Installation
+# Installation from source
 
-This section explains how to install the CLI tool as well as installing TGI from source. **The strongly recommended approach is to use Docker, as it does not require much setup. Check [the Quick Tour](./quicktour) to learn how to run TGI with Docker.**
+<Tip warning={true}>
+
+Installing TGI from source is not the recommended usage. We strongly recommend using TGI through Docker; check the [Quick Tour](./quicktour), [Installation for Nvidia GPUs](./installation_nvidia) and [Installation for AMD GPUs](./installation_amd) to learn how to use TGI with Docker.
+
+</Tip>
 
 ## Install CLI
 
-You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. 
+You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters.
 
 To install the CLI, you need to first clone the TGI repository and then run `make`.
 
@@ -23,7 +27,7 @@ BUILD_EXTENSIONS=True make install
 
 Before you start, you will need to setup your environment, and install Text Generation Inference. Text Generation Inference is tested on **Python 3.9+**.
 
-Text Generation Inference is available on pypi, conda and GitHub. 
+Text Generation Inference is available on pypi, conda and GitHub.
 
 To install and launch locally, first [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
 Python 3.9, e.g. using conda:
diff --git a/docs/source/installation_amd.md b/docs/source/installation_amd.md
new file mode 100644
index 00000000..33d85732
--- /dev/null
+++ b/docs/source/installation_amd.md
@@ -0,0 +1,38 @@
+# Using TGI with AMD GPUs
+
+TGI is supported and tested on [AMD Instinct MI210](https://www.amd.com/en/products/accelerators/instinct/mi200/mi210.html), [MI250](https://www.amd.com/en/products/accelerators/instinct/mi200/mi250.html) and [MI300](https://www.amd.com/en/products/accelerators/instinct/mi300.html) GPUs. The support may be extended in the future. The recommended usage is through Docker. Make sure to check the [AMD documentation](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html) on how to use Docker with AMD GPUs.
+
+On a server powered by AMD GPUs, TGI can be launched with the following command:
+
+```bash
+model=teknium/OpenHermes-2.5-Mistral-7B
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --rm -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+    --device=/dev/kfd --device=/dev/dri --group-add video \
+    --ipc=host --shm-size 256g --net host -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.1.1-rocm \
+    --model-id $model
+```
+
+The launched TGI server can then be queried from clients, make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
+
+## TunableOp
+
+TGI's docker image for AMD GPUs integrates [PyTorch's TunableOp](https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable), which allows an additional warmup to select the best performing matrix multiplication (GEMM) kernel from rocBLAS or hipBLASLt.
+
+Experimentally, on MI300X, we noticed a 6-8% latency improvement when using TunableOp on top of ROCm 6.1 and PyTorch 2.3.
+
+TunableOp is enabled by default; the warmup may take 1-2 minutes. In case you would like to disable TunableOp, please pass `--env PYTORCH_TUNABLEOP_ENABLED="0"` when launching TGI's docker container.
+
+## Flash attention implementation
+
+Two implementations of Flash Attention are available for ROCm, the first is [ROCm/flash-attention](https://github.com/ROCm/flash-attention) based on a [Composable Kernel](https://github.com/ROCm/composable_kernel) (CK) implementation, and the second is a [Triton implementation](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/layers/attention/flash_attn_triton.py).
+
+By default, the Composable Kernel implementation is used. However, the Triton implementation has slightly lower latency on MI250 and MI300, but requires a warmup which can be prohibitive as it needs to be done again for each new prompt length. If needed, the FA Triton implementation can be enabled with `--env ROCM_USE_FLASH_ATTN_V2_TRITON="0"` when launching TGI's docker container.
+
+## Unsupported features
+
+The following features are currently not supported in the ROCm version of TGI, and support may be extended in the future:
+* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
+* Kernel for sliding window attention (Mistral)
diff --git a/docs/source/installation_gaudi.md b/docs/source/installation_gaudi.md
new file mode 100644
index 00000000..1ddf2b47
--- /dev/null
+++ b/docs/source/installation_gaudi.md
@@ -0,0 +1,3 @@
+# Using TGI with Intel Gaudi
+
+Check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index).
diff --git a/docs/source/installation_inferentia.md b/docs/source/installation_inferentia.md
new file mode 100644
index 00000000..0394e6de
--- /dev/null
+++ b/docs/source/installation_inferentia.md
@@ -0,0 +1,3 @@
+# Using TGI with Inferentia
+
+Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
diff --git a/docs/source/installation_nvidia.md b/docs/source/installation_nvidia.md
new file mode 100644
index 00000000..4de6cb19
--- /dev/null
+++ b/docs/source/installation_nvidia.md
@@ -0,0 +1,18 @@
+# Using TGI with Nvidia GPUs
+
+TGI optimized models are supported on NVIDIA [H100](https://www.nvidia.com/en-us/data-center/h100/), [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it.
+
+For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
+
+TGI can be used on NVIDIA GPUs through its official docker image:
+
+```bash
+model=teknium/OpenHermes-2.5-Mistral-7B
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --gpus all --shm-size 64g -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.1.1 \
+    --model-id $model
+```
+
+The launched TGI server can then be queried from clients. Make sure to check out the [Consuming TGI](./basic_tutorials/consuming_tgi) guide.
diff --git a/docs/source/messages_api.md b/docs/source/messages_api.md
new file mode 100644
index 00000000..250aaae2
--- /dev/null
+++ b/docs/source/messages_api.md
@@ -0,0 +1,175 @@
+# Messages API
+
+Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
+
+> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.
+
+#### Table of Contents
+
+- [Making a Request](#making-a-request)
+- [Streaming](#streaming)
+- [Synchronous](#synchronous)
+- [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
+- [Cloud Providers](#cloud-providers)
+  - [Amazon SageMaker](#amazon-sagemaker)
+
+## Making a Request
+
+You can make a request to TGI's Messages API using `curl`. Here's an example:
+
+```bash
+curl localhost:3000/v1/chat/completions \
+    -X POST \
+    -d '{
+  "model": "tgi",
+  "messages": [
+    {
+      "role": "system",
+      "content": "You are a helpful assistant."
+    },
+    {
+      "role": "user",
+      "content": "What is deep learning?"
+    }
+  ],
+  "stream": true,
+  "max_tokens": 20
+}' \
+    -H 'Content-Type: application/json'
+```
+
+## Streaming
+
+You can also use OpenAI's Python client library to make a streaming request. Here's how:
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    base_url="http://localhost:3000/v1",
+    api_key="-"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=True
+)
+
+# iterate and print stream
+for message in chat_completion:
+    print(message)
+```
+
+## Synchronous
+
+If you prefer to make a synchronous request, you can do so like this:
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    base_url="http://localhost:3000/v1",
+    api_key="-"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=False
+)
+
+print(chat_completion)
+```
+
+## Hugging Face Inference Endpoints
+
+The Messages API is integrated with [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated).
+Every endpoint that uses "Text Generation Inference" with an LLM that has a chat template can now be used. Below is an example of how to use Inference Endpoints (IE) with TGI using OpenAI's Python client library:
+
+> **Note:** Make sure to replace `base_url` with your endpoint URL and to include `v1/` at the end of the URL. The `api_key` should be replaced with your Hugging Face API key.
+
+```python
+from openai import OpenAI
+
+# init the client but point it to TGI
+client = OpenAI(
+    # replace with your endpoint url, make sure to include "v1/" at the end
+    base_url="https://vlzz10eq3fol3429.us-east-1.aws.endpoints.huggingface.cloud/v1/",
+    # replace with your API key
+    api_key="hf_XXX"
+)
+
+chat_completion = client.chat.completions.create(
+    model="tgi",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ],
+    stream=True
+)
+
+# iterate and print stream
+for message in chat_completion:
+    print(message.choices[0].delta.content, end="")
+```
+
+## Cloud Providers
+
+TGI can be deployed on various cloud providers for scalable and robust text generation. One such provider is Amazon SageMaker, which has recently added support for TGI. Here's how you can deploy TGI on Amazon SageMaker:
+
+### Amazon SageMaker
+
+To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`.
+
+This will modify the `/invocations` route to accept Messages dictionaries consisting of role and content. See the example below on how to deploy Llama with the new Messages API.
+
+```python
+import json
+import sagemaker
+import boto3
+from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
+
+try:
+ role = sagemaker.get_execution_role()
+except ValueError:
+ iam = boto3.client('iam')
+ role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
+
+# Hub Model configuration. https://huggingface.co/models
+hub = {
+ 'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
+ 'SM_NUM_GPUS': json.dumps(1),
+ 'MESSAGES_API_ENABLED': True
+}
+
+# create Hugging Face Model Class
+huggingface_model = HuggingFaceModel(
+ image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"),
+ env=hub,
+ role=role,
+)
+
+# deploy model to SageMaker Inference
+predictor = huggingface_model.deploy(
+ initial_instance_count=1,
+ instance_type="ml.g5.2xlarge",
+ container_startup_health_check_timeout=300,
+  )
+
+# send request
+predictor.predict({
+"messages": [
+        {"role": "system", "content": "You are a helpful assistant." },
+        {"role": "user", "content": "What is deep learning?"}
+    ]
+})
+```
diff --git a/docs/source/quicktour.md b/docs/source/quicktour.md
index c085943c..c546bc03 100644
--- a/docs/source/quicktour.md
+++ b/docs/source/quicktour.md
@@ -2,24 +2,27 @@
 
 The easiest way of getting started is using the official Docker container. Install Docker following [their installation instructions](https://docs.docker.com/get-docker/).
 
-Let's say you want to deploy [Falcon-7B Instruct](https://huggingface.co/tiiuae/falcon-7b-instruct) model with TGI. Here is an example on how to do that:
+## Launching TGI
+
+Let's say you want to deploy the [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI on an Nvidia GPU. Here is an example of how to do that:
 
 ```bash
-model=tiiuae/falcon-7b-instruct
+model=teknium/OpenHermes-2.5-Mistral-7B
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.0.2 --model-id $model
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
+    ghcr.io/huggingface/text-generation-inference:2.1.1 \
+    --model-id $model
 ```
 
-<Tip warning={true}>
+### Supported hardware
 
-To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)  . We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
+TGI supports various hardware. Make sure to check the [Using TGI with Nvidia GPUs](./installation_nvidia), [Using TGI with AMD GPUs](./installation_amd), [Using TGI with Gaudi](./installation_gaudi), [Using TGI with Inferentia](./installation_inferentia) guides depending on which hardware you would like to deploy TGI on.
 
-</Tip>
+## Consuming TGI
 
 Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.
 
-
 <inferencesnippet>
 <python>
 
@@ -47,7 +50,7 @@ print(response.json())
 ```js
 async function query() {
     const response = await fetch(
-        'http://127.0.0.1:8080/generate', 
+        'http://127.0.0.1:8080/generate',
         {
             method: 'POST',
             headers: { 'Content-Type': 'application/json'},
@@ -85,7 +88,7 @@ curl 127.0.0.1:8080/generate \
 To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
 
 ```bash
-docker run ghcr.io/huggingface/text-generation-inference:1.0.2 --help
+docker run ghcr.io/huggingface/text-generation-inference:2.1.1 --help
 ```
 
 </Tip>
diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md
index d997c8d8..2bdd00de 100644
--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@@ -1,23 +1,38 @@
+
 # Supported Models and Hardware
 
 Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models are hardware are supported.
 
 ## Supported Models
 
-The following models are optimized and can be served with TGI, which uses custom CUDA kernels for better inference. You can add the flag `--disable-custom-kernels` at the end of the `docker run` command if you wish to disable them.
-
-- [BLOOM](https://huggingface.co/bigscience/bloom)
-- [FLAN-T5](https://huggingface.co/google/flan-t5-xxl)
+- [Idefics 2](https://huggingface.co/HuggingFaceM4/idefics2-8b) (Multimodal)
+- [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal)
+- [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
+- [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
+- [Gemma](https://huggingface.co/google/gemma-7b)
+- [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224)
+- [Gemma2](https://huggingface.co/google/gemma2-9b)
+- [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
+- [Dbrx](https://huggingface.co/databricks/dbrx-instruct)
+- [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj)
+- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)
+- [Gpt Bigcode](https://huggingface.co/bigcode/gpt_bigcode-santacoder)
+- [Phi](https://huggingface.co/microsoft/phi-1_5)
+- [Baichuan](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat)
+- [Falcon](https://huggingface.co/tiiuae/falcon-7b-instruct)
+- [StarCoder 2](https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1)
+- [Qwen 2](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f)
+- [Opt](https://huggingface.co/facebook/opt-6.7b)
+- [T5](https://huggingface.co/google/flan-t5-xxl)
 - [Galactica](https://huggingface.co/facebook/galactica-120b)
-- [GPT-Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
-- [Llama](https://github.com/facebookresearch/llama)
-- [OPT](https://huggingface.co/facebook/opt-66b)
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
-- [Starcoder](https://huggingface.co/bigcode/starcoder)
-- [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b)
-- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)
-- [MPT](https://huggingface.co/mosaicml/mpt-30b)
-- [Llama V2](https://huggingface.co/meta-llama)
+- [Bloom](https://huggingface.co/bigscience/bloom-560m)
+- [Mpt](https://huggingface.co/mosaicml/mpt-7b-instruct)
+- [Gpt2](https://huggingface.co/openai-community/gpt2)
+- [Gpt Neox](https://huggingface.co/EleutherAI/gpt-neox-20b)
+- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b) (Multimodal)
+
 
 If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:
 
@@ -28,13 +43,8 @@ AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
 AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")
 ```
 
+If you wish to serve a supported model that already exists in a local folder, just point to the local folder.
 
-## Supported Hardware
-
-TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 11.8+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other hardware, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed. 
-
-TGI is also supported on the following AI hardware accelerators:
-- *Habana first-gen Gaudi and Gaudi2:* check out this [example](https://github.com/huggingface/optimum-habana/tree/main/text-generation-inference) how to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
-
-
-
+```bash
+text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
+```
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 3f7a24dd..f5f38ac6 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -1,29 +1,66 @@
-import sys
-import subprocess
-import contextlib
-import pytest
 import asyncio
-import os
-import docker
+import contextlib
 import json
 import math
-import time
+import os
 import random
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import time
+from typing import Dict, List, Optional
 
-from docker.errors import NotFound
-from typing import Optional, List, Dict
-from syrupy.extensions.json import JSONSnapshotExtension
+import docker
+import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
-
+from docker.errors import NotFound
+from syrupy.extensions.json import JSONSnapshotExtension
 from text_generation import AsyncClient
-from text_generation.types import Response, Details, InputToken, Token, BestOfSequence
+from text_generation.types import (
+    BestOfSequence,
+    ChatComplete,
+    ChatCompletionChunk,
+    ChatCompletionComplete,
+    Completion,
+    Details,
+    Grammar,
+    InputToken,
+    Response,
+    Token,
+)
 
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
-HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
+HF_TOKEN = os.getenv("HF_TOKEN", None)
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
+DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--release", action="store_true", default=False, help="run release tests"
+    )
+
+
+def pytest_configure(config):
+    config.addinivalue_line("markers", "release: mark test as a release-only test")
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--release"):
+        # --release given in cli: do not skip release tests
+        return
+    skip_release = pytest.mark.skip(reason="need --release option to run")
+    for item in items:
+        if "release" in item.keywords:
+            item.add_marker(skip_release)
 
 
 class ResponseComparator(JSONSnapshotExtension):
+    rtol = 0.2
+    ignore_logprob = False
+
     def serialize(
         self,
         data,
@@ -31,8 +68,16 @@ class ResponseComparator(JSONSnapshotExtension):
         exclude=None,
         matcher=None,
     ):
+        if (
+            isinstance(data, Response)
+            or isinstance(data, ChatComplete)
+            or isinstance(data, ChatCompletionChunk)
+            or isinstance(data, ChatCompletionComplete)
+        ):
+            data = data.model_dump()
+
         if isinstance(data, List):
-            data = [d.dict() for d in data]
+            data = [d.model_dump() for d in data]
 
         data = self._filter(
             data=data, depth=0, path=(), exclude=exclude, matcher=matcher
@@ -47,10 +92,24 @@ class ResponseComparator(JSONSnapshotExtension):
     ) -> bool:
         def convert_data(data):
             data = json.loads(data)
+            if isinstance(data, Dict) and "choices" in data:
+                choices = data["choices"]
+                if isinstance(choices, List) and len(choices) >= 1:
+                    if "delta" in choices[0]:
+                        return ChatCompletionChunk(**data)
+                    if "text" in choices[0]:
+                        return Completion(**data)
+                return ChatComplete(**data)
 
             if isinstance(data, Dict):
                 return Response(**data)
             if isinstance(data, List):
+                if (
+                    len(data) > 0
+                    and "object" in data[0]
+                    and data[0]["object"] == "text_completion"
+                ):
+                    return [Completion(**d) for d in data]
                 return [Response(**d) for d in data]
             raise NotImplementedError
 
@@ -58,7 +117,10 @@ class ResponseComparator(JSONSnapshotExtension):
             return (
                 token.id == other.id
                 and token.text == other.text
-                and math.isclose(token.logprob, other.logprob, rel_tol=0.2)
+                and (
+                    self.ignore_logprob
+                    or math.isclose(token.logprob, other.logprob, rel_tol=self.rtol)
+                )
                 and token.special == other.special
             )
 
@@ -68,7 +130,12 @@ class ResponseComparator(JSONSnapshotExtension):
                     prefill_token.id == other.id
                     and prefill_token.text == other.text
                     and (
-                        math.isclose(prefill_token.logprob, other.logprob, rel_tol=0.2)
+                        self.ignore_logprob
+                        or math.isclose(
+                            prefill_token.logprob,
+                            other.logprob,
+                            rel_tol=self.rtol,
+                        )
                         if prefill_token.logprob is not None
                         else prefill_token.logprob == other.logprob
                     )
@@ -130,6 +197,19 @@ class ResponseComparator(JSONSnapshotExtension):
                 )
             )
 
+        def eq_completion(response: Completion, other: Completion) -> bool:
+            return response.choices[0].text == other.choices[0].text
+
+        def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
+            return (
+                response.choices[0].message.content == other.choices[0].message.content
+            )
+
+        def eq_chat_complete_chunk(
+            response: ChatCompletionChunk, other: ChatCompletionChunk
+        ) -> bool:
+            return response.choices[0].delta.content == other.choices[0].delta.content
+
         def eq_response(response: Response, other: Response) -> bool:
             return response.generated_text == other.generated_text and eq_details(
                 response.details, other.details
@@ -143,11 +223,38 @@ class ResponseComparator(JSONSnapshotExtension):
         if not isinstance(snapshot_data, List):
             snapshot_data = [snapshot_data]
 
+        if isinstance(serialized_data[0], Completion):
+            return len(snapshot_data) == len(serialized_data) and all(
+                [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
+            )
+
+        if isinstance(serialized_data[0], ChatComplete):
+            return len(snapshot_data) == len(serialized_data) and all(
+                [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
+            )
+
+        if isinstance(serialized_data[0], ChatCompletionChunk):
+            return len(snapshot_data) == len(serialized_data) and all(
+                [
+                    eq_chat_complete_chunk(r, o)
+                    for r, o in zip(serialized_data, snapshot_data)
+                ]
+            )
+
         return len(snapshot_data) == len(serialized_data) and all(
             [eq_response(r, o) for r, o in zip(serialized_data, snapshot_data)]
         )
 
 
+class GenerousResponseComparator(ResponseComparator):
+    # Needed for GPTQ with exllama which has serious numerical fluctuations.
+    rtol = 0.75
+
+
+class IgnoreLogProbResponseComparator(ResponseComparator):
+    ignore_logprob = True
+
+
 class LauncherHandle:
     def __init__(self, port: int):
         self.client = AsyncClient(f"http://localhost:{port}")
@@ -194,6 +301,16 @@ def response_snapshot(snapshot):
     return snapshot.use_extension(ResponseComparator)
 
 
+@pytest.fixture
+def generous_response_snapshot(snapshot):
+    return snapshot.use_extension(GenerousResponseComparator)
+
+
+@pytest.fixture
+def ignore_logprob_response_snapshot(snapshot):
+    return snapshot.use_extension(IgnoreLogProbResponseComparator)
+
+
 @pytest.fixture(scope="module")
 def event_loop():
     loop = asyncio.get_event_loop()
@@ -210,6 +327,12 @@ def launcher(event_loop):
         quantize: Optional[str] = None,
         trust_remote_code: bool = False,
         use_flash_attention: bool = True,
+        disable_grammar_support: bool = False,
+        dtype: Optional[str] = None,
+        revision: Optional[str] = None,
+        max_input_length: Optional[int] = None,
+        max_batch_prefill_tokens: Optional[int] = None,
+        max_total_tokens: Optional[int] = None,
     ):
         port = random.randint(8000, 10_000)
         master_port = random.randint(10_000, 20_000)
@@ -232,32 +355,52 @@ def launcher(event_loop):
 
         env = os.environ
 
+        if disable_grammar_support:
+            args.append("--disable-grammar-support")
         if num_shard is not None:
             args.extend(["--num-shard", str(num_shard)])
         if quantize is not None:
             args.append("--quantize")
             args.append(quantize)
+        if dtype is not None:
+            args.append("--dtype")
+            args.append(dtype)
+        if revision is not None:
+            args.append("--revision")
+            args.append(revision)
         if trust_remote_code:
             args.append("--trust-remote-code")
+        if max_input_length:
+            args.append("--max-input-length")
+            args.append(str(max_input_length))
+        if max_batch_prefill_tokens:
+            args.append("--max-batch-prefill-tokens")
+            args.append(str(max_batch_prefill_tokens))
+        if max_total_tokens:
+            args.append("--max-total-tokens")
+            args.append(str(max_total_tokens))
 
         env["LOG_LEVEL"] = "info,text_generation_router=debug"
 
         if not use_flash_attention:
             env["USE_FLASH_ATTENTION"] = "false"
 
-        with subprocess.Popen(
-            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env
-        ) as process:
-            yield ProcessLauncherHandle(process, port)
+        with tempfile.TemporaryFile("w+") as tmp:
+            # We'll output stdout/stderr to a temporary file. Using a pipe
+            # causes the process to block until stdout is read.
+            with subprocess.Popen(
+                args,
+                stdout=tmp,
+                stderr=subprocess.STDOUT,
+                env=env,
+            ) as process:
+                yield ProcessLauncherHandle(process, port)
 
-            process.terminate()
-            process.wait(60)
+                process.terminate()
+                process.wait(60)
 
-            launcher_output = process.stdout.read().decode("utf-8")
-            print(launcher_output, file=sys.stderr)
-
-            process.stdout.close()
-            process.stderr.close()
+                tmp.seek(0)
+                shutil.copyfileobj(tmp, sys.stderr)
 
         if not use_flash_attention:
             del env["USE_FLASH_ATTENTION"]
@@ -269,18 +412,41 @@ def launcher(event_loop):
         quantize: Optional[str] = None,
         trust_remote_code: bool = False,
         use_flash_attention: bool = True,
+        disable_grammar_support: bool = False,
+        dtype: Optional[str] = None,
+        revision: Optional[str] = None,
+        max_input_length: Optional[int] = None,
+        max_batch_prefill_tokens: Optional[int] = None,
+        max_total_tokens: Optional[int] = None,
     ):
         port = random.randint(8000, 10_000)
 
         args = ["--model-id", model_id, "--env"]
 
+        if disable_grammar_support:
+            args.append("--disable-grammar-support")
         if num_shard is not None:
             args.extend(["--num-shard", str(num_shard)])
         if quantize is not None:
             args.append("--quantize")
             args.append(quantize)
+        if dtype is not None:
+            args.append("--dtype")
+            args.append(dtype)
+        if revision is not None:
+            args.append("--revision")
+            args.append(revision)
         if trust_remote_code:
             args.append("--trust-remote-code")
+        if max_input_length:
+            args.append("--max-input-length")
+            args.append(str(max_input_length))
+        if max_batch_prefill_tokens:
+            args.append("--max-batch-prefill-tokens")
+            args.append(str(max_batch_prefill_tokens))
+        if max_total_tokens:
+            args.append("--max-total-tokens")
+            args.append(str(max_total_tokens))
 
         client = docker.from_env()
 
@@ -295,17 +461,31 @@ def launcher(event_loop):
 
         gpu_count = num_shard if num_shard is not None else 1
 
-        env = {"LOG_LEVEL": "info,text_generation_router=debug"}
+        env = {
+            "LOG_LEVEL": "info,text_generation_router=debug",
+        }
         if not use_flash_attention:
             env["USE_FLASH_ATTENTION"] = "false"
 
-        if HUGGING_FACE_HUB_TOKEN is not None:
-            env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
+        if HF_TOKEN is not None:
+            env["HF_TOKEN"] = HF_TOKEN
 
         volumes = []
         if DOCKER_VOLUME:
             volumes = [f"{DOCKER_VOLUME}:/data"]
 
+        if DOCKER_DEVICES:
+            devices = DOCKER_DEVICES.split(",")
+            visible = os.getenv("ROCR_VISIBLE_DEVICES")
+            if visible:
+                env["ROCR_VISIBLE_DEVICES"] = visible
+            device_requests = []
+        else:
+            devices = []
+            device_requests = [
+                docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
+            ]
+
         container = client.containers.run(
             DOCKER_IMAGE,
             command=args,
@@ -313,11 +493,11 @@ def launcher(event_loop):
             environment=env,
             auto_remove=False,
             detach=True,
-            device_requests=[
-                docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
-            ],
+            device_requests=device_requests,
+            devices=devices,
             volumes=volumes,
             ports={"80/tcp": port},
+            shm_size="1G",
         )
 
         yield ContainerLauncherHandle(client, container.name, port)
@@ -344,11 +524,22 @@ def launcher(event_loop):
 @pytest.fixture(scope="module")
 def generate_load():
     async def generate_load_inner(
-        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
+        client: AsyncClient,
+        prompt: str,
+        max_new_tokens: int,
+        n: int,
+        seed: Optional[int] = None,
+        grammar: Optional[Grammar] = None,
+        stop_sequences: Optional[List[str]] = None,
     ) -> List[Response]:
         futures = [
             client.generate(
-                prompt, max_new_tokens=max_new_tokens, decoder_input_details=True
+                prompt,
+                max_new_tokens=max_new_tokens,
+                decoder_input_details=True,
+                seed=seed,
+                grammar=grammar,
+                stop_sequences=stop_sequences,
             )
             for _ in range(n)
         ]
diff --git a/integration-tests/images/chicken_on_money.png b/integration-tests/images/chicken_on_money.png
new file mode 100644
index 00000000..1a4e0440
Binary files /dev/null and b/integration-tests/images/chicken_on_money.png differ
diff --git a/integration-tests/images/cow_beach.png b/integration-tests/images/cow_beach.png
new file mode 100644
index 00000000..d67f8a1b
Binary files /dev/null and b/integration-tests/images/cow_beach.png differ
diff --git a/integration-tests/models/__snapshots__/test_chat_llama/test_flash_llama_simple.json b/integration-tests/models/__snapshots__/test_chat_llama/test_flash_llama_simple.json
new file mode 100644
index 00000000..8631c076
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_chat_llama/test_flash_llama_simple.json
@@ -0,0 +1,26 @@
+{
+  "choices": [
+    {
+      "finish_reason": "length",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": "As of your last question, the weather in Brooklyn, New York, is typically hot and humid throughout the year. The suburbs around New York City are jealously sheltered, and at least in the Lower Bronx, there are very few outdoor environments to explore in the middle of urban confines. In fact, typical times for humidity levels in Brooklyn include:\n\n- Early morning: 80-85% humidity, with occas",
+        "name": null,
+        "role": "assistant",
+        "tool_calls": null
+      },
+      "usage": null
+    }
+  ],
+  "created": 1716553098,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.5-dev0-native",
+  "usage": {
+    "completion_tokens": 100,
+    "prompt_tokens": 62,
+    "total_tokens": 162
+  }
+}
diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
new file mode 100644
index 00000000..99c33cf7
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts.json
@@ -0,0 +1,38 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 1,
+      "logprobs": null,
+      "text": " PR for more information?"
+    },
+    {
+      "finish_reason": "length",
+      "index": 0,
+      "logprobs": null,
+      "text": "le Business Incubator is providing a workspace"
+    },
+    {
+      "finish_reason": "length",
+      "index": 2,
+      "logprobs": null,
+      "text": " severely flawed and often has a substandard"
+    },
+    {
+      "finish_reason": "length",
+      "index": 3,
+      "logprobs": null,
+      "text": "hd20220811-"
+    }
+  ],
+  "created": 1713284455,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.1-native",
+  "usage": {
+    "completion_tokens": 36,
+    "prompt_tokens": 8,
+    "total_tokens": 44
+  }
+}
diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json
new file mode 100644
index 00000000..d87071cf
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_many_prompts_stream.json
@@ -0,0 +1,602 @@
+[
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "hd"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "aho"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "2"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "2"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": "2"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "ima"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "."
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "."
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": "."
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "\n"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": " Sarah"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": " Yes"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " And"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "i"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "'"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": ","
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " what"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "'"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": "s"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": " Moh"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " is"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": "m"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": " Room"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "s"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " the"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": " tired"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": ":"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": "'"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " capital"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": " of"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 0,
+        "logprobs": null,
+        "text": " She"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 1,
+        "logprobs": null,
+        "text": " scale"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 2,
+        "logprobs": null,
+        "text": " of"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  },
+  {
+    "choices": [
+      {
+        "finish_reason": "",
+        "index": 3,
+        "logprobs": null,
+        "text": " being"
+      }
+    ],
+    "created": 1713284431,
+    "id": "",
+    "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "object": "text_completion",
+    "system_fingerprint": "2.0.1-native"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json
new file mode 100644
index 00000000..5aed4935
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_single_prompt.json
@@ -0,0 +1,20 @@
+{
+  "choices": [
+    {
+      "finish_reason": "length",
+      "index": 0,
+      "logprobs": null,
+      "text": " PR for flake8"
+    }
+  ],
+  "created": 1713284454,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.1-native",
+  "usage": {
+    "completion_tokens": 5,
+    "prompt_tokens": 6,
+    "total_tokens": 11
+  }
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
new file mode 100644
index 00000000..dcd37cb9
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
@@ -0,0 +1,104 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 1724,
+        "logprob": -7.703125,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -1.4765625,
+        "text": "is"
+      },
+      {
+        "id": 21784,
+        "logprob": -9.390625,
+        "text": "Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -1.8583984,
+        "text": "Learning"
+      },
+      {
+        "id": 29973,
+        "logprob": -0.7548828,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.9306641,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 5618,
+        "logprob": -2.4550781,
+        "special": false,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -0.5732422,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 278,
+        "logprob": -1.5761719,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 4328,
+        "logprob": -1.5888672,
+        "special": false,
+        "text": " difference"
+      },
+      {
+        "id": 1546,
+        "logprob": -0.026504517,
+        "special": false,
+        "text": " between"
+      },
+      {
+        "id": 21784,
+        "logprob": -1.4287109,
+        "special": false,
+        "text": " Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -0.15856934,
+        "special": false,
+        "text": " Learning"
+      },
+      {
+        "id": 322,
+        "logprob": -0.17456055,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 6189,
+        "logprob": -0.62646484,
+        "special": false,
+        "text": " Machine"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
new file mode 100644
index 00000000..d16d34f9
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
@@ -0,0 +1,99 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 338,
+        "logprob": -9.0859375,
+        "text": "is"
+      },
+      {
+        "id": 21784,
+        "logprob": -10.90625,
+        "text": "Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -2.65625,
+        "text": "Learning"
+      },
+      {
+        "id": 29973,
+        "logprob": -4.8085938,
+        "text": "?"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -0.19958496,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 4013,
+        "logprob": -2.203125,
+        "special": false,
+        "text": "This"
+      },
+      {
+        "id": 1139,
+        "logprob": -0.23693848,
+        "special": false,
+        "text": " question"
+      },
+      {
+        "id": 756,
+        "logprob": 0.0,
+        "special": false,
+        "text": " has"
+      },
+      {
+        "id": 1063,
+        "logprob": -0.076538086,
+        "special": false,
+        "text": " been"
+      },
+      {
+        "id": 4433,
+        "logprob": 0.0,
+        "special": false,
+        "text": " asked"
+      },
+      {
+        "id": 1784,
+        "logprob": -1.1367188,
+        "special": false,
+        "text": " many"
+      },
+      {
+        "id": 3064,
+        "logprob": 0.0,
+        "special": false,
+        "text": " times"
+      },
+      {
+        "id": 322,
+        "logprob": -1.7460938,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 306,
+        "logprob": 0.0,
+        "special": false,
+        "text": " I"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "What is Deep Learning?\nThis question has been asked many times and I"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
new file mode 100644
index 00000000..e6fb3dc0
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
@@ -0,0 +1,418 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.703125,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4765625,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8652344,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7548828,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9306641,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4550781,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.5732422,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5761719,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5888672,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.026504517,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4287109,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15856934,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62646484,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.703125,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4765625,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8583984,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7548828,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9306641,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4550781,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.5732422,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5761719,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5888672,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.026504517,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4287109,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15856934,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62646484,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.703125,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4765625,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8652344,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7548828,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9306641,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4550781,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.5732422,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5761719,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5888672,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.026504517,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4287109,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15856934,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62646484,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.703125,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4765625,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8652344,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7548828,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9306641,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4550781,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.5732422,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5761719,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5888672,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.026504517,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4287109,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15856934,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62646484,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
new file mode 100644
index 00000000..f1d9129d
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_load_sharded.json
@@ -0,0 +1,418 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.6914062,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4746094,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8623047,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7558594,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9228516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4609375,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.57177734,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5722656,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5859375,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.02633667,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4335938,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15991211,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62060547,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.6914062,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4746094,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8623047,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7558594,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9228516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4609375,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.57177734,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5722656,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5859375,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.02633667,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4335938,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15991211,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62060547,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.6914062,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4746094,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8623047,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7558594,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9228516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4609375,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.57177734,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5722656,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5859375,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.02633667,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4335938,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15991211,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62060547,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -7.6914062,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.4746094,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.390625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.8623047,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.7558594,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.9228516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 5618,
+          "logprob": -2.4609375,
+          "special": false,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -0.57177734,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 278,
+          "logprob": -1.5722656,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 4328,
+          "logprob": -1.5859375,
+          "special": false,
+          "text": " difference"
+        },
+        {
+          "id": 1546,
+          "logprob": -0.02633667,
+          "special": false,
+          "text": " between"
+        },
+        {
+          "id": 21784,
+          "logprob": -1.4335938,
+          "special": false,
+          "text": " Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -0.15991211,
+          "special": false,
+          "text": " Learning"
+        },
+        {
+          "id": 322,
+          "logprob": -0.17456055,
+          "special": false,
+          "text": " and"
+        },
+        {
+          "id": 6189,
+          "logprob": -0.62060547,
+          "special": false,
+          "text": " Machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json
new file mode 100644
index 00000000..0f91eb36
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq_sharded/test_flash_llama_awq_sharded.json
@@ -0,0 +1,104 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 1724,
+        "logprob": -7.6914062,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -1.4746094,
+        "text": "is"
+      },
+      {
+        "id": 21784,
+        "logprob": -9.390625,
+        "text": "Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -1.8623047,
+        "text": "Learning"
+      },
+      {
+        "id": 29973,
+        "logprob": -0.7558594,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.9228516,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 5618,
+        "logprob": -2.4609375,
+        "special": false,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -0.57177734,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 278,
+        "logprob": -1.5722656,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 4328,
+        "logprob": -1.5927734,
+        "special": false,
+        "text": " difference"
+      },
+      {
+        "id": 1546,
+        "logprob": -0.026428223,
+        "special": false,
+        "text": " between"
+      },
+      {
+        "id": 21784,
+        "logprob": -1.4267578,
+        "special": false,
+        "text": " Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -0.16015625,
+        "special": false,
+        "text": " Learning"
+      },
+      {
+        "id": 322,
+        "logprob": -0.17382812,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 6189,
+        "logprob": -0.62060547,
+        "special": false,
+        "text": " Machine"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\nWhat is the difference between Deep Learning and Machine"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma.json b/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma.json
new file mode 100644
index 00000000..80f0d053
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 2015,
+        "logprob": -10.0,
+        "text": "Test"
+      },
+      {
+        "id": 3853,
+        "logprob": -10.875,
+        "text": " request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 1736,
+        "logprob": -2.09375,
+        "special": false,
+        "text": " form"
+      },
+      {
+        "id": 109,
+        "logprob": -1.8671875,
+        "special": false,
+        "text": "\n\n"
+      },
+      {
+        "id": 651,
+        "logprob": -2.4375,
+        "special": false,
+        "text": "The"
+      },
+      {
+        "id": 2121,
+        "logprob": -1.8203125,
+        "special": false,
+        "text": " test"
+      },
+      {
+        "id": 3853,
+        "logprob": -0.23242188,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 1736,
+        "logprob": -0.08544922,
+        "special": false,
+        "text": " form"
+      },
+      {
+        "id": 603,
+        "logprob": -0.9375,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 1671,
+        "logprob": -1.671875,
+        "special": false,
+        "text": " used"
+      },
+      {
+        "id": 577,
+        "logprob": -0.40429688,
+        "special": false,
+        "text": " to"
+      },
+      {
+        "id": 3853,
+        "logprob": -1.1875,
+        "special": false,
+        "text": " request"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " form\n\nThe test request form is used to request"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_all_params.json b/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_all_params.json
new file mode 100644
index 00000000..8253dc96
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_all_params.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 2015,
+        "logprob": -10.0,
+        "text": "Test"
+      },
+      {
+        "id": 3853,
+        "logprob": -10.875,
+        "text": " request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 7539,
+        "logprob": -0.73046875,
+        "special": false,
+        "text": " forms"
+      },
+      {
+        "id": 708,
+        "logprob": 0.0,
+        "special": false,
+        "text": " are"
+      },
+      {
+        "id": 671,
+        "logprob": -1.703125,
+        "special": false,
+        "text": " an"
+      },
+      {
+        "id": 8727,
+        "logprob": 0.0,
+        "special": false,
+        "text": " essential"
+      },
+      {
+        "id": 1702,
+        "logprob": 0.0,
+        "special": false,
+        "text": " part"
+      },
+      {
+        "id": 576,
+        "logprob": 0.0,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 573,
+        "logprob": 0.0,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 11859,
+        "logprob": -1.6953125,
+        "special": false,
+        "text": " lab"
+      },
+      {
+        "id": 2185,
+        "logprob": -1.3125,
+        "special": false,
+        "text": " process"
+      },
+      {
+        "id": 578,
+        "logprob": -1.5,
+        "special": false,
+        "text": " and"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request forms are an essential part of the lab process and"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_load.json b/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_load.json
new file mode 100644
index 00000000..e69ee25d
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_gemma/test_flash_gemma_load.json
@@ -0,0 +1,358 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -10.0,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 1736,
+          "logprob": -2.09375,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 109,
+          "logprob": -1.9140625,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 651,
+          "logprob": -2.453125,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 2121,
+          "logprob": -1.8984375,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 3853,
+          "logprob": -0.23535156,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 1736,
+          "logprob": -0.091308594,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 603,
+          "logprob": -0.96875,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 1671,
+          "logprob": -1.6484375,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 577,
+          "logprob": -0.43164062,
+          "special": false,
+          "text": " to"
+        },
+        {
+          "id": 3853,
+          "logprob": -1.2421875,
+          "special": false,
+          "text": " request"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " form\n\nThe test request form is used to request"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -10.0,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 1736,
+          "logprob": -2.09375,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 109,
+          "logprob": -1.9140625,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 651,
+          "logprob": -2.453125,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 2121,
+          "logprob": -1.8984375,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 3853,
+          "logprob": -0.23535156,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 1736,
+          "logprob": -0.091308594,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 603,
+          "logprob": -0.96875,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 1671,
+          "logprob": -1.6484375,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 577,
+          "logprob": -0.43164062,
+          "special": false,
+          "text": " to"
+        },
+        {
+          "id": 3853,
+          "logprob": -1.2421875,
+          "special": false,
+          "text": " request"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " form\n\nThe test request form is used to request"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -10.0,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 1736,
+          "logprob": -2.09375,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 109,
+          "logprob": -1.9140625,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 651,
+          "logprob": -2.453125,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 2121,
+          "logprob": -1.8984375,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 3853,
+          "logprob": -0.23535156,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 1736,
+          "logprob": -0.091308594,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 603,
+          "logprob": -0.96875,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 1671,
+          "logprob": -1.6484375,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 577,
+          "logprob": -0.43164062,
+          "special": false,
+          "text": " to"
+        },
+        {
+          "id": 3853,
+          "logprob": -1.2421875,
+          "special": false,
+          "text": " request"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " form\n\nThe test request form is used to request"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -10.0,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 1736,
+          "logprob": -2.09375,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 109,
+          "logprob": -1.9140625,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 651,
+          "logprob": -2.453125,
+          "special": false,
+          "text": "The"
+        },
+        {
+          "id": 2121,
+          "logprob": -1.8984375,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 3853,
+          "logprob": -0.23535156,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 1736,
+          "logprob": -0.091308594,
+          "special": false,
+          "text": " form"
+        },
+        {
+          "id": 603,
+          "logprob": -0.96875,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 1671,
+          "logprob": -1.6484375,
+          "special": false,
+          "text": " used"
+        },
+        {
+          "id": 577,
+          "logprob": -0.43164062,
+          "special": false,
+          "text": " to"
+        },
+        {
+          "id": 3853,
+          "logprob": -1.2421875,
+          "special": false,
+          "text": " request"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " form\n\nThe test request form is used to request"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq.json b/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq.json
new file mode 100644
index 00000000..760ebf94
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 2015,
+        "logprob": -9.640625,
+        "text": "Test"
+      },
+      {
+        "id": 3853,
+        "logprob": -10.34375,
+        "text": " request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 604,
+        "logprob": -2.4296875,
+        "special": false,
+        "text": " for"
+      },
+      {
+        "id": 573,
+        "logprob": -2.4453125,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 2412,
+        "logprob": -2.8632812,
+        "special": false,
+        "text": " following"
+      },
+      {
+        "id": 235292,
+        "logprob": -2.1328125,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 109,
+        "logprob": -0.76660156,
+        "special": false,
+        "text": "\n\n"
+      },
+      {
+        "id": 235287,
+        "logprob": -1.3837891,
+        "special": false,
+        "text": "*"
+      },
+      {
+        "id": 235248,
+        "logprob": -1.9746094,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 199,
+        "logprob": -1.4189453,
+        "special": false,
+        "text": "<strong>"
+      },
+      {
+        "id": 1232,
+        "logprob": -4.34375,
+        "special": false,
+        "text": "Name"
+      },
+      {
+        "id": 208,
+        "logprob": -0.8852539,
+        "special": false,
+        "text": "</strong>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " for the following:\n\n* <strong>Name</strong>"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json b/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json
new file mode 100644
index 00000000..7a168b2e
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_all_params.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2,
+        "logprob": null,
+        "text": "<bos>"
+      },
+      {
+        "id": 2015,
+        "logprob": -9.65625,
+        "text": "Test"
+      },
+      {
+        "id": 3853,
+        "logprob": -10.3671875,
+        "text": " request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 604,
+        "logprob": -0.36938477,
+        "special": false,
+        "text": " for"
+      },
+      {
+        "id": 235248,
+        "logprob": -1.8046875,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 235274,
+        "logprob": -0.46240234,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 235284,
+        "logprob": -1.7460938,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 235265,
+        "logprob": -1.9443359,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 235284,
+        "logprob": -1.4550781,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 235308,
+        "logprob": -1.0205078,
+        "special": false,
+        "text": "5"
+      },
+      {
+        "id": 235290,
+        "logprob": -1.0283203,
+        "special": false,
+        "text": "-"
+      },
+      {
+        "id": 235274,
+        "logprob": -1.2783203,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 235284,
+        "logprob": 0.0,
+        "special": false,
+        "text": "2"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request for 12.25-12"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_load.json b/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_load.json
new file mode 100644
index 00000000..bcb9b378
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_gemma_gptq/test_flash_gemma_gptq_load.json
@@ -0,0 +1,358 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -9.6484375,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.359375,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 604,
+          "logprob": -2.4277344,
+          "special": false,
+          "text": " for"
+        },
+        {
+          "id": 573,
+          "logprob": -2.4394531,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 2412,
+          "logprob": -2.8613281,
+          "special": false,
+          "text": " following"
+        },
+        {
+          "id": 235292,
+          "logprob": -2.1523438,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 109,
+          "logprob": -0.76220703,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 235287,
+          "logprob": -1.3642578,
+          "special": false,
+          "text": "*"
+        },
+        {
+          "id": 235248,
+          "logprob": -2.0175781,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 199,
+          "logprob": -1.4238281,
+          "special": false,
+          "text": "<strong>"
+        },
+        {
+          "id": 1232,
+          "logprob": -4.328125,
+          "special": false,
+          "text": "Name"
+        },
+        {
+          "id": 208,
+          "logprob": -0.8881836,
+          "special": false,
+          "text": "</strong>"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " for the following:\n\n* <strong>Name</strong>"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -9.6484375,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.34375,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 604,
+          "logprob": -2.4238281,
+          "special": false,
+          "text": " for"
+        },
+        {
+          "id": 573,
+          "logprob": -2.4453125,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 2412,
+          "logprob": -2.859375,
+          "special": false,
+          "text": " following"
+        },
+        {
+          "id": 235292,
+          "logprob": -2.1445312,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 109,
+          "logprob": -0.7631836,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 235287,
+          "logprob": -1.3642578,
+          "special": false,
+          "text": "*"
+        },
+        {
+          "id": 235248,
+          "logprob": -1.9960938,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 199,
+          "logprob": -1.4179688,
+          "special": false,
+          "text": "<strong>"
+        },
+        {
+          "id": 1232,
+          "logprob": -4.3359375,
+          "special": false,
+          "text": "Name"
+        },
+        {
+          "id": 208,
+          "logprob": -0.8847656,
+          "special": false,
+          "text": "</strong>"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " for the following:\n\n* <strong>Name</strong>"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -9.640625,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.3671875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 604,
+          "logprob": -2.4257812,
+          "special": false,
+          "text": " for"
+        },
+        {
+          "id": 573,
+          "logprob": -2.4453125,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 2412,
+          "logprob": -2.8789062,
+          "special": false,
+          "text": " following"
+        },
+        {
+          "id": 235292,
+          "logprob": -2.1367188,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 109,
+          "logprob": -0.76171875,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 235287,
+          "logprob": -1.3515625,
+          "special": false,
+          "text": "*"
+        },
+        {
+          "id": 235248,
+          "logprob": -1.9873047,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 199,
+          "logprob": -1.4169922,
+          "special": false,
+          "text": "<strong>"
+        },
+        {
+          "id": 1232,
+          "logprob": -4.3320312,
+          "special": false,
+          "text": "Name"
+        },
+        {
+          "id": 208,
+          "logprob": -0.8930664,
+          "special": false,
+          "text": "</strong>"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " for the following:\n\n* <strong>Name</strong>"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2,
+          "logprob": null,
+          "text": "<bos>"
+        },
+        {
+          "id": 2015,
+          "logprob": -9.6484375,
+          "text": "Test"
+        },
+        {
+          "id": 3853,
+          "logprob": -10.359375,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 604,
+          "logprob": -2.4179688,
+          "special": false,
+          "text": " for"
+        },
+        {
+          "id": 573,
+          "logprob": -2.4492188,
+          "special": false,
+          "text": " the"
+        },
+        {
+          "id": 2412,
+          "logprob": -2.8574219,
+          "special": false,
+          "text": " following"
+        },
+        {
+          "id": 235292,
+          "logprob": -2.1445312,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 109,
+          "logprob": -0.7519531,
+          "special": false,
+          "text": "\n\n"
+        },
+        {
+          "id": 235287,
+          "logprob": -1.3623047,
+          "special": false,
+          "text": "*"
+        },
+        {
+          "id": 235248,
+          "logprob": -1.9707031,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 199,
+          "logprob": -1.4267578,
+          "special": false,
+          "text": "<strong>"
+        },
+        {
+          "id": 1232,
+          "logprob": -4.3359375,
+          "special": false,
+          "text": "Name"
+        },
+        {
+          "id": 208,
+          "logprob": -0.88427734,
+          "special": false,
+          "text": "</strong>"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " for the following:\n\n* <strong>Name</strong>"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json
new file mode 100644
index 00000000..ca7393a3
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2.json
@@ -0,0 +1,99 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2061,
+        "logprob": null,
+        "text": "What"
+      },
+      {
+        "id": 318,
+        "logprob": -3.1835938,
+        "text": " is"
+      },
+      {
+        "id": 2769,
+        "logprob": -9.171875,
+        "text": " deep"
+      },
+      {
+        "id": 4673,
+        "logprob": -1.6425781,
+        "text": " learning"
+      },
+      {
+        "id": 30,
+        "logprob": -0.7314453,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 198,
+        "logprob": -0.68603516,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 198,
+        "logprob": -0.005393982,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 29744,
+        "logprob": -0.31079102,
+        "special": false,
+        "text": "Deep"
+      },
+      {
+        "id": 4673,
+        "logprob": -0.08300781,
+        "special": false,
+        "text": " learning"
+      },
+      {
+        "id": 318,
+        "logprob": -0.58984375,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 257,
+        "logprob": -0.953125,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 649,
+        "logprob": -2.0957031,
+        "special": false,
+        "text": " new"
+      },
+      {
+        "id": 2214,
+        "logprob": -1.8095703,
+        "special": false,
+        "text": " field"
+      },
+      {
+        "id": 286,
+        "logprob": -1.0673828,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 2267,
+        "logprob": -0.9375,
+        "special": false,
+        "text": " research"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\n\nDeep learning is a new field of research"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json
new file mode 100644
index 00000000..7bd15b90
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_gpt2/test_flash_gpt2_load.json
@@ -0,0 +1,398 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2061,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 318,
+          "logprob": -3.1835938,
+          "text": " is"
+        },
+        {
+          "id": 2769,
+          "logprob": -9.171875,
+          "text": " deep"
+        },
+        {
+          "id": 4673,
+          "logprob": -1.6425781,
+          "text": " learning"
+        },
+        {
+          "id": 30,
+          "logprob": -0.7314453,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 198,
+          "logprob": -0.68603516,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 198,
+          "logprob": -0.005672455,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 29744,
+          "logprob": -0.3251953,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 4673,
+          "logprob": -0.08294678,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 318,
+          "logprob": -0.5854492,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 257,
+          "logprob": -0.9423828,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 649,
+          "logprob": -2.0800781,
+          "special": false,
+          "text": " new"
+        },
+        {
+          "id": 2214,
+          "logprob": -1.8369141,
+          "special": false,
+          "text": " field"
+        },
+        {
+          "id": 286,
+          "logprob": -1.0683594,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 2267,
+          "logprob": -0.9711914,
+          "special": false,
+          "text": " research"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a new field of research"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2061,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 318,
+          "logprob": -3.1660156,
+          "text": " is"
+        },
+        {
+          "id": 2769,
+          "logprob": -9.1796875,
+          "text": " deep"
+        },
+        {
+          "id": 4673,
+          "logprob": -1.6376953,
+          "text": " learning"
+        },
+        {
+          "id": 30,
+          "logprob": -0.72216797,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 198,
+          "logprob": -0.7089844,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 198,
+          "logprob": -0.0054779053,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 29744,
+          "logprob": -0.3190918,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 4673,
+          "logprob": -0.08319092,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 318,
+          "logprob": -0.5839844,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 257,
+          "logprob": -0.9506836,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 649,
+          "logprob": -2.0878906,
+          "special": false,
+          "text": " new"
+        },
+        {
+          "id": 2214,
+          "logprob": -1.8496094,
+          "special": false,
+          "text": " field"
+        },
+        {
+          "id": 286,
+          "logprob": -1.0673828,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 2267,
+          "logprob": -0.9370117,
+          "special": false,
+          "text": " research"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a new field of research"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2061,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 318,
+          "logprob": -3.1660156,
+          "text": " is"
+        },
+        {
+          "id": 2769,
+          "logprob": -9.1796875,
+          "text": " deep"
+        },
+        {
+          "id": 4673,
+          "logprob": -1.6376953,
+          "text": " learning"
+        },
+        {
+          "id": 30,
+          "logprob": -0.72216797,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 198,
+          "logprob": -0.7089844,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 198,
+          "logprob": -0.0054779053,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 29744,
+          "logprob": -0.3190918,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 4673,
+          "logprob": -0.08319092,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 318,
+          "logprob": -0.5839844,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 257,
+          "logprob": -0.9506836,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 649,
+          "logprob": -2.0878906,
+          "special": false,
+          "text": " new"
+        },
+        {
+          "id": 2214,
+          "logprob": -1.8496094,
+          "special": false,
+          "text": " field"
+        },
+        {
+          "id": 286,
+          "logprob": -1.0673828,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 2267,
+          "logprob": -0.9370117,
+          "special": false,
+          "text": " research"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a new field of research"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2061,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 318,
+          "logprob": -3.1660156,
+          "text": " is"
+        },
+        {
+          "id": 2769,
+          "logprob": -9.1796875,
+          "text": " deep"
+        },
+        {
+          "id": 4673,
+          "logprob": -1.6376953,
+          "text": " learning"
+        },
+        {
+          "id": 30,
+          "logprob": -0.72216797,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 198,
+          "logprob": -0.7089844,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 198,
+          "logprob": -0.0054779053,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 29744,
+          "logprob": -0.3190918,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 4673,
+          "logprob": -0.08319092,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 318,
+          "logprob": -0.5839844,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 257,
+          "logprob": -0.9506836,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 649,
+          "logprob": -2.0878906,
+          "special": false,
+          "text": " new"
+        },
+        {
+          "id": 2214,
+          "logprob": -1.8496094,
+          "special": false,
+          "text": " field"
+        },
+        {
+          "id": 286,
+          "logprob": -1.0673828,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 2267,
+          "logprob": -0.9370117,
+          "special": false,
+          "text": " research"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a new field of research"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar.json b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar.json
new file mode 100644
index 00000000..0e87f59e
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -13.90625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -12.328125,
+        "text": "request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -2.0566406,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -1.5253906,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 29902,
+        "logprob": -2.7578125,
+        "special": false,
+        "text": "I"
+      },
+      {
+        "id": 4966,
+        "logprob": -1.9033203,
+        "special": false,
+        "text": " hope"
+      },
+      {
+        "id": 445,
+        "logprob": -0.5019531,
+        "special": false,
+        "text": " this"
+      },
+      {
+        "id": 6911,
+        "logprob": -0.21264648,
+        "special": false,
+        "text": " helps"
+      },
+      {
+        "id": 29991,
+        "logprob": -0.5991211,
+        "special": false,
+        "text": "!"
+      },
+      {
+        "id": 2803,
+        "logprob": -0.37475586,
+        "special": false,
+        "text": " Let"
+      },
+      {
+        "id": 592,
+        "logprob": -0.018463135,
+        "special": false,
+        "text": " me"
+      },
+      {
+        "id": 1073,
+        "logprob": -0.0008597374,
+        "special": false,
+        "text": " know"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\n\nI hope this helps! Let me know"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_json.json b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_json.json
new file mode 100644
index 00000000..d7fb620d
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_json.json
@@ -0,0 +1,274 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "eos_token",
+    "generated_tokens": 30,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 5235,
+        "logprob": -10.0625,
+        "text": "info"
+      },
+      {
+        "id": 29901,
+        "logprob": -3.2324219,
+        "text": ":"
+      },
+      {
+        "id": 13260,
+        "logprob": -10.625,
+        "text": "dav"
+      },
+      {
+        "id": 333,
+        "logprob": -0.08276367,
+        "text": "id"
+      },
+      {
+        "id": 8753,
+        "logprob": -7.5273438,
+        "text": "hol"
+      },
+      {
+        "id": 17559,
+        "logprob": -3.8476562,
+        "text": "tz"
+      },
+      {
+        "id": 763,
+        "logprob": -10.140625,
+        "text": "like"
+      },
+      {
+        "id": 10697,
+        "logprob": -10.1953125,
+        "text": "trees"
+      },
+      {
+        "id": 322,
+        "logprob": -2.5742188,
+        "text": "and"
+      },
+      {
+        "id": 756,
+        "logprob": -7.4882812,
+        "text": "has"
+      },
+      {
+        "id": 1023,
+        "logprob": -5.0507812,
+        "text": "two"
+      },
+      {
+        "id": 274,
+        "logprob": -5.3164062,
+        "text": "c"
+      },
+      {
+        "id": 1446,
+        "logprob": -0.6694336,
+        "text": "ats"
+      },
+      {
+        "id": 29889,
+        "logprob": -0.9995117,
+        "text": "."
+      },
+      {
+        "id": 29871,
+        "logprob": -4.2421875,
+        "text": ""
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 6377,
+        "logprob": -0.14916992,
+        "special": false,
+        "text": "{\""
+      },
+      {
+        "id": 29888,
+        "logprob": -0.13598633,
+        "special": false,
+        "text": "f"
+      },
+      {
+        "id": 12935,
+        "logprob": -0.017669678,
+        "special": false,
+        "text": "irs"
+      },
+      {
+        "id": 29873,
+        "logprob": -0.00085639954,
+        "special": false,
+        "text": "t"
+      },
+      {
+        "id": 1170,
+        "logprob": -0.0054016113,
+        "special": false,
+        "text": "Name"
+      },
+      {
+        "id": 4710,
+        "logprob": -0.13549805,
+        "special": false,
+        "text": "\":\""
+      },
+      {
+        "id": 19504,
+        "logprob": -0.8852539,
+        "special": false,
+        "text": "David"
+      },
+      {
+        "id": 3284,
+        "logprob": -0.16394043,
+        "special": false,
+        "text": "\",\""
+      },
+      {
+        "id": 29882,
+        "logprob": -0.08862305,
+        "special": false,
+        "text": "h"
+      },
+      {
+        "id": 711,
+        "logprob": -0.66259766,
+        "special": false,
+        "text": "ob"
+      },
+      {
+        "id": 1609,
+        "logprob": -5.51939e-05,
+        "special": false,
+        "text": "by"
+      },
+      {
+        "id": 4710,
+        "logprob": -0.23120117,
+        "special": false,
+        "text": "\":\""
+      },
+      {
+        "id": 29911,
+        "logprob": -2.3730469,
+        "special": false,
+        "text": "T"
+      },
+      {
+        "id": 11003,
+        "logprob": -0.032104492,
+        "special": false,
+        "text": "rees"
+      },
+      {
+        "id": 3284,
+        "logprob": -0.22021484,
+        "special": false,
+        "text": "\",\""
+      },
+      {
+        "id": 4230,
+        "logprob": -0.06726074,
+        "special": false,
+        "text": "last"
+      },
+      {
+        "id": 1170,
+        "logprob": -0.003501892,
+        "special": false,
+        "text": "Name"
+      },
+      {
+        "id": 4710,
+        "logprob": -0.0045661926,
+        "special": false,
+        "text": "\":\""
+      },
+      {
+        "id": 29950,
+        "logprob": -0.12512207,
+        "special": false,
+        "text": "H"
+      },
+      {
+        "id": 14339,
+        "logprob": -0.009552002,
+        "special": false,
+        "text": "olt"
+      },
+      {
+        "id": 29920,
+        "logprob": -0.00042438507,
+        "special": false,
+        "text": "z"
+      },
+      {
+        "id": 3284,
+        "logprob": -0.11651611,
+        "special": false,
+        "text": "\",\""
+      },
+      {
+        "id": 29876,
+        "logprob": -0.29736328,
+        "special": false,
+        "text": "n"
+      },
+      {
+        "id": 398,
+        "logprob": -0.003030777,
+        "special": false,
+        "text": "um"
+      },
+      {
+        "id": 29907,
+        "logprob": -0.3774414,
+        "special": false,
+        "text": "C"
+      },
+      {
+        "id": 1446,
+        "logprob": -0.0003130436,
+        "special": false,
+        "text": "ats"
+      },
+      {
+        "id": 1115,
+        "logprob": -0.0021514893,
+        "special": false,
+        "text": "\":"
+      },
+      {
+        "id": 29906,
+        "logprob": -0.071899414,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 29913,
+        "logprob": -0.018997192,
+        "special": false,
+        "text": "}"
+      },
+      {
+        "id": 2,
+        "logprob": 0.0,
+        "special": true,
+        "text": "</s>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "{\"firstName\":\"David\",\"hobby\":\"Trees\",\"lastName\":\"Holtz\",\"numCats\":2}"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_load.json b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_load.json
new file mode 100644
index 00000000..411f3947
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_load.json
@@ -0,0 +1,478 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1024,
+          "logprob": -10.578125,
+          "text": "name"
+        },
+        {
+          "id": 29901,
+          "logprob": -3.0332031,
+          "text": ":"
+        },
+        {
+          "id": 13260,
+          "logprob": -9.171875,
+          "text": "dav"
+        },
+        {
+          "id": 333,
+          "logprob": -0.04257202,
+          "text": "id"
+        },
+        {
+          "id": 29889,
+          "logprob": -2.4785156,
+          "text": "."
+        },
+        {
+          "id": 4876,
+          "logprob": -10.7890625,
+          "text": "email"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.32495117,
+          "text": ":"
+        },
+        {
+          "id": 259,
+          "logprob": -9.4921875,
+          "text": " "
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -0.7709961,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 29906,
+          "logprob": -0.33740234,
+          "special": false,
+          "text": "2"
+        },
+        {
+          "id": 29941,
+          "logprob": -0.00995636,
+          "special": false,
+          "text": "3"
+        },
+        {
+          "id": 29946,
+          "logprob": -0.64208984,
+          "special": false,
+          "text": "4"
+        },
+        {
+          "id": 29945,
+          "logprob": -0.4970703,
+          "special": false,
+          "text": "5"
+        },
+        {
+          "id": 29953,
+          "logprob": -0.46533203,
+          "special": false,
+          "text": "6"
+        },
+        {
+          "id": 29992,
+          "logprob": -0.5336914,
+          "special": false,
+          "text": "@"
+        },
+        {
+          "id": 21980,
+          "logprob": -0.5361328,
+          "special": false,
+          "text": "gmail"
+        },
+        {
+          "id": 29889,
+          "logprob": -0.00088739395,
+          "special": false,
+          "text": "."
+        },
+        {
+          "id": 510,
+          "logprob": -0.0022735596,
+          "special": false,
+          "text": "com"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "123456@gmail.com"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1024,
+          "logprob": -10.578125,
+          "text": "name"
+        },
+        {
+          "id": 29901,
+          "logprob": -3.03125,
+          "text": ":"
+        },
+        {
+          "id": 13260,
+          "logprob": -9.171875,
+          "text": "dav"
+        },
+        {
+          "id": 333,
+          "logprob": -0.04244995,
+          "text": "id"
+        },
+        {
+          "id": 29889,
+          "logprob": -2.4863281,
+          "text": "."
+        },
+        {
+          "id": 4876,
+          "logprob": -10.7890625,
+          "text": "email"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.32714844,
+          "text": ":"
+        },
+        {
+          "id": 259,
+          "logprob": -9.4921875,
+          "text": " "
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -0.7685547,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 29906,
+          "logprob": -0.33666992,
+          "special": false,
+          "text": "2"
+        },
+        {
+          "id": 29941,
+          "logprob": -0.01008606,
+          "special": false,
+          "text": "3"
+        },
+        {
+          "id": 29946,
+          "logprob": -0.64160156,
+          "special": false,
+          "text": "4"
+        },
+        {
+          "id": 29945,
+          "logprob": -0.5,
+          "special": false,
+          "text": "5"
+        },
+        {
+          "id": 29953,
+          "logprob": -0.46557617,
+          "special": false,
+          "text": "6"
+        },
+        {
+          "id": 29992,
+          "logprob": -0.5341797,
+          "special": false,
+          "text": "@"
+        },
+        {
+          "id": 21980,
+          "logprob": -0.5361328,
+          "special": false,
+          "text": "gmail"
+        },
+        {
+          "id": 29889,
+          "logprob": -0.00088739395,
+          "special": false,
+          "text": "."
+        },
+        {
+          "id": 510,
+          "logprob": -0.0022907257,
+          "special": false,
+          "text": "com"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "123456@gmail.com"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1024,
+          "logprob": -10.578125,
+          "text": "name"
+        },
+        {
+          "id": 29901,
+          "logprob": -3.0332031,
+          "text": ":"
+        },
+        {
+          "id": 13260,
+          "logprob": -9.171875,
+          "text": "dav"
+        },
+        {
+          "id": 333,
+          "logprob": -0.04257202,
+          "text": "id"
+        },
+        {
+          "id": 29889,
+          "logprob": -2.4785156,
+          "text": "."
+        },
+        {
+          "id": 4876,
+          "logprob": -10.7890625,
+          "text": "email"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.32495117,
+          "text": ":"
+        },
+        {
+          "id": 259,
+          "logprob": -9.4921875,
+          "text": " "
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -0.7709961,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 29906,
+          "logprob": -0.33740234,
+          "special": false,
+          "text": "2"
+        },
+        {
+          "id": 29941,
+          "logprob": -0.00995636,
+          "special": false,
+          "text": "3"
+        },
+        {
+          "id": 29946,
+          "logprob": -0.64208984,
+          "special": false,
+          "text": "4"
+        },
+        {
+          "id": 29945,
+          "logprob": -0.4970703,
+          "special": false,
+          "text": "5"
+        },
+        {
+          "id": 29953,
+          "logprob": -0.46533203,
+          "special": false,
+          "text": "6"
+        },
+        {
+          "id": 29992,
+          "logprob": -0.5336914,
+          "special": false,
+          "text": "@"
+        },
+        {
+          "id": 21980,
+          "logprob": -0.5361328,
+          "special": false,
+          "text": "gmail"
+        },
+        {
+          "id": 29889,
+          "logprob": -0.00088739395,
+          "special": false,
+          "text": "."
+        },
+        {
+          "id": 510,
+          "logprob": -0.0022735596,
+          "special": false,
+          "text": "com"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "123456@gmail.com"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1024,
+          "logprob": -10.578125,
+          "text": "name"
+        },
+        {
+          "id": 29901,
+          "logprob": -3.0332031,
+          "text": ":"
+        },
+        {
+          "id": 13260,
+          "logprob": -9.171875,
+          "text": "dav"
+        },
+        {
+          "id": 333,
+          "logprob": -0.04257202,
+          "text": "id"
+        },
+        {
+          "id": 29889,
+          "logprob": -2.4785156,
+          "text": "."
+        },
+        {
+          "id": 4876,
+          "logprob": -10.7890625,
+          "text": "email"
+        },
+        {
+          "id": 29901,
+          "logprob": -0.32495117,
+          "text": ":"
+        },
+        {
+          "id": 259,
+          "logprob": -9.4921875,
+          "text": " "
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -0.7709961,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 29906,
+          "logprob": -0.33740234,
+          "special": false,
+          "text": "2"
+        },
+        {
+          "id": 29941,
+          "logprob": -0.00995636,
+          "special": false,
+          "text": "3"
+        },
+        {
+          "id": 29946,
+          "logprob": -0.64208984,
+          "special": false,
+          "text": "4"
+        },
+        {
+          "id": 29945,
+          "logprob": -0.4970703,
+          "special": false,
+          "text": "5"
+        },
+        {
+          "id": 29953,
+          "logprob": -0.46533203,
+          "special": false,
+          "text": "6"
+        },
+        {
+          "id": 29992,
+          "logprob": -0.5336914,
+          "special": false,
+          "text": "@"
+        },
+        {
+          "id": 21980,
+          "logprob": -0.5361328,
+          "special": false,
+          "text": "gmail"
+        },
+        {
+          "id": 29889,
+          "logprob": -0.00088739395,
+          "special": false,
+          "text": "."
+        },
+        {
+          "id": 510,
+          "logprob": -0.0022735596,
+          "special": false,
+          "text": "com"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "123456@gmail.com"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_regex.json b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_regex.json
new file mode 100644
index 00000000..1ba9ae1e
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_regex.json
@@ -0,0 +1,109 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 806,
+        "logprob": -11.890625,
+        "text": "Wh"
+      },
+      {
+        "id": 1446,
+        "logprob": -3.6699219,
+        "text": "ats"
+      },
+      {
+        "id": 2921,
+        "logprob": -7.8203125,
+        "text": "Go"
+      },
+      {
+        "id": 468,
+        "logprob": -8.0703125,
+        "text": "og"
+      },
+      {
+        "id": 793,
+        "logprob": -2.1875,
+        "text": "les"
+      },
+      {
+        "id": 16332,
+        "logprob": -9.7109375,
+        "text": "DNS"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 29946,
+        "logprob": -1.4765625,
+        "special": false,
+        "text": "4"
+      },
+      {
+        "id": 29906,
+        "logprob": -0.9199219,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 29889,
+        "logprob": 0.0,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 29896,
+        "logprob": -1.1367188,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 29889,
+        "logprob": -1.4648438,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 29896,
+        "logprob": -0.40722656,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 29889,
+        "logprob": -0.17419434,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 29896,
+        "logprob": -0.20251465,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 29900,
+        "logprob": -1.5527344,
+        "special": false,
+        "text": "0"
+      },
+      {
+        "id": 29896,
+        "logprob": -1.3710938,
+        "special": false,
+        "text": "1"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "42.1.1.101"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_single_load_instance.json b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_single_load_instance.json
new file mode 100644
index 00000000..7ffb17cb
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_grammar_llama/test_flash_llama_grammar_single_load_instance.json
@@ -0,0 +1,73 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 29896,
+        "logprob": -0.7685547,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 29906,
+        "logprob": -0.33666992,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 29941,
+        "logprob": -0.009979248,
+        "special": false,
+        "text": "3"
+      },
+      {
+        "id": 29946,
+        "logprob": -0.64208984,
+        "special": false,
+        "text": "4"
+      },
+      {
+        "id": 29945,
+        "logprob": -0.4970703,
+        "special": false,
+        "text": "5"
+      },
+      {
+        "id": 29953,
+        "logprob": -0.46533203,
+        "special": false,
+        "text": "6"
+      },
+      {
+        "id": 29992,
+        "logprob": -0.5336914,
+        "special": false,
+        "text": "@"
+      },
+      {
+        "id": 21980,
+        "logprob": -0.53759766,
+        "special": false,
+        "text": "gmail"
+      },
+      {
+        "id": 29889,
+        "logprob": -0.0008878708,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 510,
+        "logprob": -0.002275467,
+        "special": false,
+        "text": "com"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "123456@gmail.com"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json
index 49bc996c..a7f7d2f0 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama.json
@@ -16,7 +16,7 @@
       },
       {
         "id": 2009,
-        "logprob": -11.5546875,
+        "logprob": -11.546875,
         "text": "request"
       }
     ],
@@ -24,65 +24,66 @@
     "tokens": [
       {
         "id": 363,
-        "logprob": -1.5380859,
+        "logprob": -1.5351562,
         "special": false,
         "text": " for"
       },
       {
         "id": 847,
-        "logprob": -2.5917969,
+        "logprob": -2.5722656,
         "special": false,
         "text": " /"
       },
       {
         "id": 2754,
-        "logprob": -2.2773438,
+        "logprob": -2.2714844,
         "special": false,
         "text": "api"
       },
       {
         "id": 29914,
-        "logprob": -0.034362793,
+        "logprob": -0.03414917,
         "special": false,
         "text": "/"
       },
       {
         "id": 29894,
-        "logprob": -0.96533203,
+        "logprob": -0.95996094,
         "special": false,
         "text": "v"
       },
       {
         "id": 29896,
-        "logprob": -0.36669922,
+        "logprob": -0.3635254,
         "special": false,
         "text": "1"
       },
       {
         "id": 29914,
-        "logprob": -0.013122559,
+        "logprob": -0.013031006,
         "special": false,
         "text": "/"
       },
       {
         "id": 16418,
-        "logprob": -3.1503906,
+        "logprob": -3.1523438,
         "special": false,
         "text": "projects"
       },
       {
         "id": 29914,
-        "logprob": -0.43652344,
+        "logprob": -0.43701172,
         "special": false,
         "text": "/"
       },
       {
         "id": 29896,
-        "logprob": -1.9404297,
+        "logprob": -1.9394531,
         "special": false,
         "text": "1"
       }
-    ]
+    ],
+    "top_tokens": null
   },
-  "generated_text": "for /api/v1/projects/1"
+  "generated_text": " for /api/v1/projects/1"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json
index 5be2870d..9f145377 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_all_params.json
@@ -16,7 +16,7 @@
       },
       {
         "id": 2009,
-        "logprob": -11.5546875,
+        "logprob": -11.546875,
         "text": "request"
       }
     ],
@@ -24,19 +24,19 @@
     "tokens": [
       {
         "id": 5229,
-        "logprob": -2.5683594,
+        "logprob": -2.5839844,
         "special": false,
         "text": " failed"
       },
       {
         "id": 29901,
-        "logprob": -0.45336914,
+        "logprob": -0.44970703,
         "special": false,
         "text": ":"
       },
       {
         "id": 4829,
-        "logprob": -1.8408203,
+        "logprob": -1.8339844,
         "special": false,
         "text": " Error"
       },
@@ -52,7 +52,8 @@
         "special": false,
         "text": " test"
       }
-    ]
+    ],
+    "top_tokens": null
   },
-  "generated_text": "Test requestfailed: Error in test"
+  "generated_text": "Test request failed: Error in test"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json
index 9bbb5322..3543dad2 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama/test_flash_llama_load.json
@@ -17,7 +17,7 @@
         },
         {
           "id": 2009,
-          "logprob": -11.5546875,
+          "logprob": -11.546875,
           "text": "request"
         }
       ],
@@ -25,25 +25,25 @@
       "tokens": [
         {
           "id": 363,
-          "logprob": -1.5380859,
+          "logprob": -1.5351562,
           "special": false,
           "text": " for"
         },
         {
           "id": 847,
-          "logprob": -2.5859375,
+          "logprob": -2.5566406,
           "special": false,
           "text": " /"
         },
         {
           "id": 2754,
-          "logprob": -2.2695312,
+          "logprob": -2.2519531,
           "special": false,
           "text": "api"
         },
         {
           "id": 29914,
-          "logprob": -0.03439331,
+          "logprob": -0.03414917,
           "special": false,
           "text": "/"
         },
@@ -55,13 +55,13 @@
         },
         {
           "id": 29896,
-          "logprob": -0.36694336,
+          "logprob": -0.3647461,
           "special": false,
           "text": "1"
         },
         {
           "id": 29914,
-          "logprob": -0.013114929,
+          "logprob": -0.012901306,
           "special": false,
           "text": "/"
         },
@@ -73,19 +73,20 @@
         },
         {
           "id": 29914,
-          "logprob": -0.43847656,
+          "logprob": -0.4362793,
           "special": false,
           "text": "/"
         },
         {
           "id": 29896,
-          "logprob": -1.9433594,
+          "logprob": -1.9394531,
           "special": false,
           "text": "1"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "for /api/v1/projects/1"
+    "generated_text": " for /api/v1/projects/1"
   },
   {
     "details": {
@@ -105,7 +106,7 @@
         },
         {
           "id": 2009,
-          "logprob": -11.5546875,
+          "logprob": -11.546875,
           "text": "request"
         }
       ],
@@ -113,43 +114,43 @@
       "tokens": [
         {
           "id": 363,
-          "logprob": -1.5322266,
+          "logprob": -1.5332031,
           "special": false,
           "text": " for"
         },
         {
           "id": 847,
-          "logprob": -2.5585938,
+          "logprob": -2.5625,
           "special": false,
           "text": " /"
         },
         {
           "id": 2754,
-          "logprob": -2.265625,
+          "logprob": -2.2617188,
           "special": false,
           "text": "api"
         },
         {
           "id": 29914,
-          "logprob": -0.034088135,
+          "logprob": -0.033996582,
           "special": false,
           "text": "/"
         },
         {
           "id": 29894,
-          "logprob": -0.96240234,
+          "logprob": -0.9609375,
           "special": false,
           "text": "v"
         },
         {
           "id": 29896,
-          "logprob": -0.36816406,
+          "logprob": -0.36572266,
           "special": false,
           "text": "1"
         },
         {
           "id": 29914,
-          "logprob": -0.013191223,
+          "logprob": -0.0129776,
           "special": false,
           "text": "/"
         },
@@ -161,19 +162,20 @@
         },
         {
           "id": 29914,
-          "logprob": -0.43774414,
+          "logprob": -0.4362793,
           "special": false,
           "text": "/"
         },
         {
           "id": 29896,
-          "logprob": -1.9443359,
+          "logprob": -1.9394531,
           "special": false,
           "text": "1"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "for /api/v1/projects/1"
+    "generated_text": " for /api/v1/projects/1"
   },
   {
     "details": {
@@ -193,7 +195,7 @@
         },
         {
           "id": 2009,
-          "logprob": -11.5546875,
+          "logprob": -11.546875,
           "text": "request"
         }
       ],
@@ -201,43 +203,43 @@
       "tokens": [
         {
           "id": 363,
-          "logprob": -1.5322266,
+          "logprob": -1.5332031,
           "special": false,
           "text": " for"
         },
         {
           "id": 847,
-          "logprob": -2.5585938,
+          "logprob": -2.5625,
           "special": false,
           "text": " /"
         },
         {
           "id": 2754,
-          "logprob": -2.265625,
+          "logprob": -2.2617188,
           "special": false,
           "text": "api"
         },
         {
           "id": 29914,
-          "logprob": -0.034088135,
+          "logprob": -0.033996582,
           "special": false,
           "text": "/"
         },
         {
           "id": 29894,
-          "logprob": -0.96240234,
+          "logprob": -0.9609375,
           "special": false,
           "text": "v"
         },
         {
           "id": 29896,
-          "logprob": -0.36816406,
+          "logprob": -0.36572266,
           "special": false,
           "text": "1"
         },
         {
           "id": 29914,
-          "logprob": -0.013191223,
+          "logprob": -0.0129776,
           "special": false,
           "text": "/"
         },
@@ -249,19 +251,20 @@
         },
         {
           "id": 29914,
-          "logprob": -0.43774414,
+          "logprob": -0.4362793,
           "special": false,
           "text": "/"
         },
         {
           "id": 29896,
-          "logprob": -1.9443359,
+          "logprob": -1.9394531,
           "special": false,
           "text": "1"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "for /api/v1/projects/1"
+    "generated_text": " for /api/v1/projects/1"
   },
   {
     "details": {
@@ -281,7 +284,7 @@
         },
         {
           "id": 2009,
-          "logprob": -11.5546875,
+          "logprob": -11.546875,
           "text": "request"
         }
       ],
@@ -289,43 +292,43 @@
       "tokens": [
         {
           "id": 363,
-          "logprob": -1.5322266,
+          "logprob": -1.5332031,
           "special": false,
           "text": " for"
         },
         {
           "id": 847,
-          "logprob": -2.5585938,
+          "logprob": -2.5625,
           "special": false,
           "text": " /"
         },
         {
           "id": 2754,
-          "logprob": -2.265625,
+          "logprob": -2.2617188,
           "special": false,
           "text": "api"
         },
         {
           "id": 29914,
-          "logprob": -0.034088135,
+          "logprob": -0.033996582,
           "special": false,
           "text": "/"
         },
         {
           "id": 29894,
-          "logprob": -0.96240234,
+          "logprob": -0.9609375,
           "special": false,
           "text": "v"
         },
         {
           "id": 29896,
-          "logprob": -0.36816406,
+          "logprob": -0.36572266,
           "special": false,
           "text": "1"
         },
         {
           "id": 29914,
-          "logprob": -0.013191223,
+          "logprob": -0.0129776,
           "special": false,
           "text": "/"
         },
@@ -337,18 +340,19 @@
         },
         {
           "id": 29914,
-          "logprob": -0.43774414,
+          "logprob": -0.4362793,
           "special": false,
           "text": "/"
         },
         {
           "id": 29896,
-          "logprob": -1.9443359,
+          "logprob": -1.9394531,
           "special": false,
           "text": "1"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "for /api/v1/projects/1"
+    "generated_text": " for /api/v1/projects/1"
   }
 ]
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2.json b/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2.json
new file mode 100644
index 00000000..f6e4bb90
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2.json
@@ -0,0 +1,84 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2323,
+        "logprob": null,
+        "text": "Test"
+      },
+      {
+        "id": 1715,
+        "logprob": -11.4375,
+        "text": " request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 25,
+        "logprob": -2.9316406,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 330,
+        "logprob": -3.5136719,
+        "special": false,
+        "text": " \""
+      },
+      {
+        "id": 489,
+        "logprob": -0.7783203,
+        "special": false,
+        "text": " +"
+      },
+      {
+        "id": 1715,
+        "logprob": -1.2314453,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 489,
+        "logprob": -2.0019531,
+        "special": false,
+        "text": " +"
+      },
+      {
+        "id": 2990,
+        "logprob": -1.5009766,
+        "special": false,
+        "text": " \"\\"
+      },
+      {
+        "id": 77,
+        "logprob": -0.057434082,
+        "special": false,
+        "text": "n"
+      },
+      {
+        "id": 702,
+        "logprob": -1.4912109,
+        "special": false,
+        "text": "\"\n"
+      },
+      {
+        "id": 262,
+        "logprob": -1.2636719,
+        "special": false,
+        "text": "   "
+      },
+      {
+        "id": 557,
+        "logprob": -2.4042969,
+        "special": false,
+        "text": " }\n\n"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2_all_params.json
new file mode 100644
index 00000000..6b38e709
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2_all_params.json
@@ -0,0 +1,84 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2323,
+        "logprob": null,
+        "text": "Test"
+      },
+      {
+        "id": 1715,
+        "logprob": -11.453125,
+        "text": " request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.9980469,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 578,
+        "logprob": -0.15795898,
+        "special": false,
+        "text": " The"
+      },
+      {
+        "id": 3622,
+        "logprob": -1.0458984,
+        "special": false,
+        "text": " server"
+      },
+      {
+        "id": 31680,
+        "logprob": -1.3623047,
+        "special": false,
+        "text": " responds"
+      },
+      {
+        "id": 449,
+        "logprob": 0.0,
+        "special": false,
+        "text": " with"
+      },
+      {
+        "id": 264,
+        "logprob": 0.0,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 330,
+        "logprob": -0.5678711,
+        "special": false,
+        "text": " \""
+      },
+      {
+        "id": 1049,
+        "logprob": -0.12322998,
+        "special": false,
+        "text": "200"
+      },
+      {
+        "id": 10619,
+        "logprob": 0.0,
+        "special": false,
+        "text": " OK"
+      },
+      {
+        "id": 1,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\""
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request. The server responds with a \"200 OK\""
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2_load.json b/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2_load.json
new file mode 100644
index 00000000..ed369a87
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_llama_exl2/test_flash_llama_exl2_load.json
@@ -0,0 +1,338 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2323,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 1715,
+          "logprob": -11.453125,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25,
+          "logprob": -2.9785156,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 330,
+          "logprob": -3.4941406,
+          "special": false,
+          "text": " \""
+        },
+        {
+          "id": 489,
+          "logprob": -0.79345703,
+          "special": false,
+          "text": " +"
+        },
+        {
+          "id": 1715,
+          "logprob": -1.2324219,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 489,
+          "logprob": -1.9794922,
+          "special": false,
+          "text": " +"
+        },
+        {
+          "id": 2990,
+          "logprob": -1.4892578,
+          "special": false,
+          "text": " \"\\"
+        },
+        {
+          "id": 77,
+          "logprob": -0.058258057,
+          "special": false,
+          "text": "n"
+        },
+        {
+          "id": 702,
+          "logprob": -1.4892578,
+          "special": false,
+          "text": "\"\n"
+        },
+        {
+          "id": 262,
+          "logprob": -1.2783203,
+          "special": false,
+          "text": "   "
+        },
+        {
+          "id": 557,
+          "logprob": -2.3945312,
+          "special": false,
+          "text": " }\n\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2323,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 1715,
+          "logprob": -11.40625,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25,
+          "logprob": -2.9433594,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 330,
+          "logprob": -3.4726562,
+          "special": false,
+          "text": " \""
+        },
+        {
+          "id": 489,
+          "logprob": -0.8022461,
+          "special": false,
+          "text": " +"
+        },
+        {
+          "id": 1715,
+          "logprob": -1.2509766,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 489,
+          "logprob": -1.984375,
+          "special": false,
+          "text": " +"
+        },
+        {
+          "id": 2990,
+          "logprob": -1.4677734,
+          "special": false,
+          "text": " \"\\"
+        },
+        {
+          "id": 77,
+          "logprob": -0.059173584,
+          "special": false,
+          "text": "n"
+        },
+        {
+          "id": 702,
+          "logprob": -1.4990234,
+          "special": false,
+          "text": "\"\n"
+        },
+        {
+          "id": 262,
+          "logprob": -1.2822266,
+          "special": false,
+          "text": "   "
+        },
+        {
+          "id": 557,
+          "logprob": -2.3867188,
+          "special": false,
+          "text": " }\n\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2323,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 1715,
+          "logprob": -11.421875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25,
+          "logprob": -2.9511719,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 330,
+          "logprob": -3.46875,
+          "special": false,
+          "text": " \""
+        },
+        {
+          "id": 489,
+          "logprob": -0.77490234,
+          "special": false,
+          "text": " +"
+        },
+        {
+          "id": 1715,
+          "logprob": -1.2558594,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 489,
+          "logprob": -1.984375,
+          "special": false,
+          "text": " +"
+        },
+        {
+          "id": 2990,
+          "logprob": -1.4990234,
+          "special": false,
+          "text": " \"\\"
+        },
+        {
+          "id": 77,
+          "logprob": -0.059143066,
+          "special": false,
+          "text": "n"
+        },
+        {
+          "id": 702,
+          "logprob": -1.4941406,
+          "special": false,
+          "text": "\"\n"
+        },
+        {
+          "id": 262,
+          "logprob": -1.2578125,
+          "special": false,
+          "text": "   "
+        },
+        {
+          "id": 557,
+          "logprob": -2.3964844,
+          "special": false,
+          "text": " }\n\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2323,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 1715,
+          "logprob": -11.4140625,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25,
+          "logprob": -2.9101562,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 330,
+          "logprob": -3.5039062,
+          "special": false,
+          "text": " \""
+        },
+        {
+          "id": 489,
+          "logprob": -0.8076172,
+          "special": false,
+          "text": " +"
+        },
+        {
+          "id": 1715,
+          "logprob": -1.2236328,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 489,
+          "logprob": -1.9853516,
+          "special": false,
+          "text": " +"
+        },
+        {
+          "id": 2990,
+          "logprob": -1.4892578,
+          "special": false,
+          "text": " \"\\"
+        },
+        {
+          "id": 77,
+          "logprob": -0.056671143,
+          "special": false,
+          "text": "n"
+        },
+        {
+          "id": 702,
+          "logprob": -1.5107422,
+          "special": false,
+          "text": "\"\n"
+        },
+        {
+          "id": 262,
+          "logprob": -1.2597656,
+          "special": false,
+          "text": "   "
+        },
+        {
+          "id": 557,
+          "logprob": -2.4042969,
+          "special": false,
+          "text": " }\n\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": \" + request + \"\\n\"\n    }\n\n"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json
index e4ffb83b..0f99d259 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json
@@ -5,84 +5,80 @@
     "generated_tokens": 10,
     "prefill": [
       {
-        "id": 1,
+        "id": 2323,
         "logprob": null,
-        "text": "<s>"
-      },
-      {
-        "id": 4321,
-        "logprob": -9.59375,
         "text": "Test"
       },
       {
-        "id": 2009,
-        "logprob": -9.6640625,
-        "text": "request"
+        "id": 1715,
+        "logprob": -11.34375,
+        "text": " request"
       }
     ],
     "seed": null,
     "tokens": [
       {
-        "id": 29918,
-        "logprob": -2.3867188,
-        "special": false,
-        "text": "_"
-      },
-      {
-        "id": 5338,
-        "logprob": -2.8183594,
-        "special": false,
-        "text": "uri"
-      },
-      {
-        "id": 13,
-        "logprob": -1.6367188,
+        "id": 198,
+        "logprob": -2.5742188,
         "special": false,
         "text": "\n"
       },
       {
-        "id": 3057,
-        "logprob": -1.0527344,
+        "id": 262,
+        "logprob": -1.6230469,
         "special": false,
-        "text": "Test"
+        "text": "   "
       },
       {
-        "id": 2009,
-        "logprob": -0.6542969,
+        "id": 3270,
+        "logprob": -2.046875,
+        "special": false,
+        "text": " \"\"\"\n"
+      },
+      {
+        "id": 262,
+        "logprob": -0.015281677,
+        "special": false,
+        "text": "   "
+      },
+      {
+        "id": 422,
+        "logprob": -2.1425781,
+        "special": false,
+        "text": " if"
+      },
+      {
+        "id": 1715,
+        "logprob": -0.9238281,
         "special": false,
         "text": " request"
       },
       {
-        "id": 29918,
-        "logprob": -0.056121826,
+        "id": 13204,
+        "logprob": -0.076660156,
         "special": false,
-        "text": "_"
+        "text": ".method"
       },
       {
-        "id": 5338,
-        "logprob": -0.01600647,
+        "id": 624,
+        "logprob": -0.021987915,
         "special": false,
-        "text": "uri"
+        "text": " =="
       },
       {
-        "id": 13,
-        "logprob": -0.87939453,
+        "id": 364,
+        "logprob": -0.39208984,
         "special": false,
-        "text": "\n"
+        "text": " '"
       },
       {
-        "id": 3057,
-        "logprob": -0.7529297,
+        "id": 3019,
+        "logprob": -0.10821533,
         "special": false,
-        "text": "Test"
-      },
-      {
-        "id": 2009,
-        "logprob": -0.2980957,
-        "special": false,
-        "text": " request"
+        "text": "POST"
       }
-    ]
+    ],
+    "top_tokens": null
   },
-  "generated_text": "_uri\nTest request_uri\nTest request"
+  "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json
index 02713a00..4152b5b3 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json
@@ -5,84 +5,80 @@
     "generated_tokens": 10,
     "prefill": [
       {
-        "id": 1,
+        "id": 2323,
         "logprob": null,
-        "text": "<s>"
-      },
-      {
-        "id": 4321,
-        "logprob": -9.6015625,
         "text": "Test"
       },
       {
-        "id": 2009,
-        "logprob": -9.6640625,
-        "text": "request"
+        "id": 1715,
+        "logprob": -11.34375,
+        "text": " request"
       }
     ],
     "seed": 0,
     "tokens": [
       {
-        "id": 29899,
-        "logprob": -1.1640625,
+        "id": 13,
+        "logprob": -2.2539062,
         "special": false,
-        "text": "-"
+        "text": "."
       },
       {
-        "id": 1454,
-        "logprob": -0.07543945,
+        "id": 578,
+        "logprob": -0.15563965,
         "special": false,
-        "text": "for"
+        "text": " The"
       },
       {
-        "id": 29899,
+        "id": 3622,
+        "logprob": -0.8203125,
+        "special": false,
+        "text": " server"
+      },
+      {
+        "id": 706,
         "logprob": 0.0,
         "special": false,
-        "text": "-"
+        "text": " has"
       },
       {
-        "id": 9342,
+        "id": 539,
         "logprob": 0.0,
         "special": false,
-        "text": "comment"
+        "text": " not"
       },
       {
-        "id": 29901,
+        "id": 3686,
         "logprob": 0.0,
         "special": false,
-        "text": ":"
+        "text": " yet"
       },
       {
-        "id": 396,
-        "logprob": -0.2956543,
-        "special": false,
-        "text": " #"
-      },
-      {
-        "id": 29906,
-        "logprob": -0.52734375,
-        "special": false,
-        "text": "2"
-      },
-      {
-        "id": 29900,
-        "logprob": -0.6899414,
-        "special": false,
-        "text": "0"
-      },
-      {
-        "id": 29896,
+        "id": 3288,
         "logprob": 0.0,
         "special": false,
-        "text": "1"
+        "text": " sent"
       },
       {
-        "id": 29946,
-        "logprob": -1.5068359,
+        "id": 904,
+        "logprob": 0.0,
         "special": false,
-        "text": "4"
+        "text": " any"
+      },
+      {
+        "id": 828,
+        "logprob": 0.0,
+        "special": false,
+        "text": " data"
+      },
+      {
+        "id": 382,
+        "logprob": -1.5517578,
+        "special": false,
+        "text": ".\n\n"
       }
-    ]
+    ],
+    "top_tokens": null
   },
-  "generated_text": "Test request-for-comment: #2014"
+  "generated_text": "Test request. The server has not yet sent any data.\n\n"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json
index 88bfa4f9..75e90303 100644
--- a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json
+++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json
@@ -6,86 +6,82 @@
       "generated_tokens": 10,
       "prefill": [
         {
-          "id": 1,
+          "id": 2323,
           "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 4321,
-          "logprob": -9.6015625,
           "text": "Test"
         },
         {
-          "id": 2009,
-          "logprob": -9.671875,
-          "text": "request"
+          "id": 1715,
+          "logprob": -11.34375,
+          "text": " request"
         }
       ],
       "seed": null,
       "tokens": [
         {
-          "id": 29918,
-          "logprob": -2.3828125,
-          "special": false,
-          "text": "_"
-        },
-        {
-          "id": 5338,
-          "logprob": -2.8105469,
-          "special": false,
-          "text": "uri"
-        },
-        {
-          "id": 13,
-          "logprob": -1.6396484,
+          "id": 198,
+          "logprob": -2.5742188,
           "special": false,
           "text": "\n"
         },
         {
-          "id": 3057,
-          "logprob": -1.0546875,
+          "id": 262,
+          "logprob": -1.6220703,
           "special": false,
-          "text": "Test"
+          "text": "   "
         },
         {
-          "id": 2009,
-          "logprob": -0.6513672,
+          "id": 3270,
+          "logprob": -2.0410156,
+          "special": false,
+          "text": " \"\"\"\n"
+        },
+        {
+          "id": 262,
+          "logprob": -0.015281677,
+          "special": false,
+          "text": "   "
+        },
+        {
+          "id": 422,
+          "logprob": -2.1445312,
+          "special": false,
+          "text": " if"
+        },
+        {
+          "id": 1715,
+          "logprob": -0.92333984,
           "special": false,
           "text": " request"
         },
         {
-          "id": 29918,
-          "logprob": -0.056365967,
+          "id": 13204,
+          "logprob": -0.07672119,
           "special": false,
-          "text": "_"
+          "text": ".method"
         },
         {
-          "id": 5338,
-          "logprob": -0.016082764,
+          "id": 624,
+          "logprob": -0.021987915,
           "special": false,
-          "text": "uri"
+          "text": " =="
         },
         {
-          "id": 13,
-          "logprob": -0.87841797,
+          "id": 364,
+          "logprob": -0.39208984,
           "special": false,
-          "text": "\n"
+          "text": " '"
         },
         {
-          "id": 3057,
-          "logprob": -0.7548828,
+          "id": 3019,
+          "logprob": -0.10638428,
           "special": false,
-          "text": "Test"
-        },
-        {
-          "id": 2009,
-          "logprob": -0.29711914,
-          "special": false,
-          "text": " request"
+          "text": "POST"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "_uri\nTest request_uri\nTest request"
+    "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
   },
   {
     "details": {
@@ -94,86 +90,82 @@
       "generated_tokens": 10,
       "prefill": [
         {
-          "id": 1,
+          "id": 2323,
           "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 4321,
-          "logprob": -9.6015625,
           "text": "Test"
         },
         {
-          "id": 2009,
-          "logprob": -9.6640625,
-          "text": "request"
+          "id": 1715,
+          "logprob": -11.34375,
+          "text": " request"
         }
       ],
       "seed": null,
       "tokens": [
         {
-          "id": 29918,
-          "logprob": -2.3828125,
-          "special": false,
-          "text": "_"
-        },
-        {
-          "id": 5338,
-          "logprob": -2.828125,
-          "special": false,
-          "text": "uri"
-        },
-        {
-          "id": 13,
-          "logprob": -1.6386719,
+          "id": 198,
+          "logprob": -2.5742188,
           "special": false,
           "text": "\n"
         },
         {
-          "id": 3057,
-          "logprob": -1.0527344,
+          "id": 262,
+          "logprob": -1.6220703,
           "special": false,
-          "text": "Test"
+          "text": "   "
         },
         {
-          "id": 2009,
-          "logprob": -0.6542969,
+          "id": 3270,
+          "logprob": -2.0410156,
+          "special": false,
+          "text": " \"\"\"\n"
+        },
+        {
+          "id": 262,
+          "logprob": -0.015281677,
+          "special": false,
+          "text": "   "
+        },
+        {
+          "id": 422,
+          "logprob": -2.1445312,
+          "special": false,
+          "text": " if"
+        },
+        {
+          "id": 1715,
+          "logprob": -0.92333984,
           "special": false,
           "text": " request"
         },
         {
-          "id": 29918,
-          "logprob": -0.055877686,
+          "id": 13204,
+          "logprob": -0.07672119,
           "special": false,
-          "text": "_"
+          "text": ".method"
         },
         {
-          "id": 5338,
-          "logprob": -0.016021729,
+          "id": 624,
+          "logprob": -0.021987915,
           "special": false,
-          "text": "uri"
+          "text": " =="
         },
         {
-          "id": 13,
-          "logprob": -0.8769531,
+          "id": 364,
+          "logprob": -0.39208984,
           "special": false,
-          "text": "\n"
+          "text": " '"
         },
         {
-          "id": 3057,
-          "logprob": -0.7583008,
+          "id": 3019,
+          "logprob": -0.10638428,
           "special": false,
-          "text": "Test"
-        },
-        {
-          "id": 2009,
-          "logprob": -0.29833984,
-          "special": false,
-          "text": " request"
+          "text": "POST"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "_uri\nTest request_uri\nTest request"
+    "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
   },
   {
     "details": {
@@ -182,86 +174,82 @@
       "generated_tokens": 10,
       "prefill": [
         {
-          "id": 1,
+          "id": 2323,
           "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 4321,
-          "logprob": -9.6015625,
           "text": "Test"
         },
         {
-          "id": 2009,
-          "logprob": -9.671875,
-          "text": "request"
+          "id": 1715,
+          "logprob": -11.34375,
+          "text": " request"
         }
       ],
       "seed": null,
       "tokens": [
         {
-          "id": 29918,
-          "logprob": -2.3847656,
-          "special": false,
-          "text": "_"
-        },
-        {
-          "id": 5338,
-          "logprob": -2.8144531,
-          "special": false,
-          "text": "uri"
-        },
-        {
-          "id": 13,
-          "logprob": -1.6396484,
+          "id": 198,
+          "logprob": -2.5742188,
           "special": false,
           "text": "\n"
         },
         {
-          "id": 3057,
-          "logprob": -1.0527344,
+          "id": 262,
+          "logprob": -1.6220703,
           "special": false,
-          "text": "Test"
+          "text": "   "
         },
         {
-          "id": 2009,
-          "logprob": -0.65478516,
+          "id": 3270,
+          "logprob": -2.0410156,
+          "special": false,
+          "text": " \"\"\"\n"
+        },
+        {
+          "id": 262,
+          "logprob": -0.015281677,
+          "special": false,
+          "text": "   "
+        },
+        {
+          "id": 422,
+          "logprob": -2.1445312,
+          "special": false,
+          "text": " if"
+        },
+        {
+          "id": 1715,
+          "logprob": -0.92333984,
           "special": false,
           "text": " request"
         },
         {
-          "id": 29918,
-          "logprob": -0.056243896,
+          "id": 13204,
+          "logprob": -0.07672119,
           "special": false,
-          "text": "_"
+          "text": ".method"
         },
         {
-          "id": 5338,
-          "logprob": -0.016143799,
+          "id": 624,
+          "logprob": -0.021987915,
           "special": false,
-          "text": "uri"
+          "text": " =="
         },
         {
-          "id": 13,
-          "logprob": -0.8808594,
+          "id": 364,
+          "logprob": -0.39208984,
           "special": false,
-          "text": "\n"
+          "text": " '"
         },
         {
-          "id": 3057,
-          "logprob": -0.75341797,
+          "id": 3019,
+          "logprob": -0.10638428,
           "special": false,
-          "text": "Test"
-        },
-        {
-          "id": 2009,
-          "logprob": -0.2956543,
-          "special": false,
-          "text": " request"
+          "text": "POST"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "_uri\nTest request_uri\nTest request"
+    "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
   },
   {
     "details": {
@@ -270,85 +258,81 @@
       "generated_tokens": 10,
       "prefill": [
         {
-          "id": 1,
+          "id": 2323,
           "logprob": null,
-          "text": "<s>"
-        },
-        {
-          "id": 4321,
-          "logprob": -9.6015625,
           "text": "Test"
         },
         {
-          "id": 2009,
-          "logprob": -9.6640625,
-          "text": "request"
+          "id": 1715,
+          "logprob": -11.34375,
+          "text": " request"
         }
       ],
       "seed": null,
       "tokens": [
         {
-          "id": 29918,
-          "logprob": -2.3769531,
-          "special": false,
-          "text": "_"
-        },
-        {
-          "id": 5338,
-          "logprob": -2.8183594,
-          "special": false,
-          "text": "uri"
-        },
-        {
-          "id": 13,
-          "logprob": -1.6396484,
+          "id": 198,
+          "logprob": -2.5742188,
           "special": false,
           "text": "\n"
         },
         {
-          "id": 3057,
-          "logprob": -1.0546875,
+          "id": 262,
+          "logprob": -1.6220703,
           "special": false,
-          "text": "Test"
+          "text": "   "
         },
         {
-          "id": 2009,
-          "logprob": -0.65478516,
+          "id": 3270,
+          "logprob": -2.0410156,
+          "special": false,
+          "text": " \"\"\"\n"
+        },
+        {
+          "id": 262,
+          "logprob": -0.015281677,
+          "special": false,
+          "text": "   "
+        },
+        {
+          "id": 422,
+          "logprob": -2.1445312,
+          "special": false,
+          "text": " if"
+        },
+        {
+          "id": 1715,
+          "logprob": -0.92333984,
           "special": false,
           "text": " request"
         },
         {
-          "id": 29918,
-          "logprob": -0.05557251,
+          "id": 13204,
+          "logprob": -0.07672119,
           "special": false,
-          "text": "_"
+          "text": ".method"
         },
         {
-          "id": 5338,
-          "logprob": -0.01612854,
+          "id": 624,
+          "logprob": -0.021987915,
           "special": false,
-          "text": "uri"
+          "text": " =="
         },
         {
-          "id": 13,
-          "logprob": -0.8730469,
+          "id": 364,
+          "logprob": -0.39208984,
           "special": false,
-          "text": "\n"
+          "text": " '"
         },
         {
-          "id": 3057,
-          "logprob": -0.7519531,
+          "id": 3019,
+          "logprob": -0.10638428,
           "special": false,
-          "text": "Test"
-        },
-        {
-          "id": 2009,
-          "logprob": -0.29785156,
-          "special": false,
-          "text": " request"
+          "text": "POST"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "_uri\nTest request_uri\nTest request"
+    "generated_text": "\n    \"\"\"\n    if request.method == 'POST"
   }
 ]
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin.json b/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin.json
new file mode 100644
index 00000000..47849a3f
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -12.390625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -11.0625,
+        "text": "request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -2.0507812,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -2.3007812,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 29902,
+        "logprob": -2.0449219,
+        "special": false,
+        "text": "I"
+      },
+      {
+        "id": 505,
+        "logprob": -1.3242188,
+        "special": false,
+        "text": " have"
+      },
+      {
+        "id": 263,
+        "logprob": -0.2076416,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 1243,
+        "logprob": -2.0273438,
+        "special": false,
+        "text": " test"
+      },
+      {
+        "id": 2009,
+        "logprob": -0.6845703,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 515,
+        "logprob": -1.1748047,
+        "special": false,
+        "text": " from"
+      },
+      {
+        "id": 263,
+        "logprob": -1.0644531,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 1404,
+        "logprob": -1.5224609,
+        "special": false,
+        "text": " user"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\n\nI have a test request from a user"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin_all_params.json
new file mode 100644
index 00000000..bda2393e
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin_all_params.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -12.390625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -11.0625,
+        "text": "request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 5229,
+        "logprob": -1.2607422,
+        "special": false,
+        "text": " failed"
+      },
+      {
+        "id": 29901,
+        "logprob": 0.0,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 6527,
+        "logprob": -0.11450195,
+        "special": false,
+        "text": " Could"
+      },
+      {
+        "id": 451,
+        "logprob": 0.0,
+        "special": false,
+        "text": " not"
+      },
+      {
+        "id": 4511,
+        "logprob": -0.2286377,
+        "special": false,
+        "text": " connect"
+      },
+      {
+        "id": 304,
+        "logprob": 0.0,
+        "special": false,
+        "text": " to"
+      },
+      {
+        "id": 1923,
+        "logprob": -1.2568359,
+        "special": false,
+        "text": " server"
+      },
+      {
+        "id": 13,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.15905762,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 29902,
+        "logprob": -0.21618652,
+        "special": false,
+        "text": "I"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request failed: Could not connect to server\n\nI"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin_load.json b/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin_load.json
new file mode 100644
index 00000000..44c26efb
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_llama_marlin/test_flash_llama_marlin_load.json
@@ -0,0 +1,358 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -12.390625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -11.0625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.0507812,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -2.3007812,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 29902,
+          "logprob": -2.0449219,
+          "special": false,
+          "text": "I"
+        },
+        {
+          "id": 505,
+          "logprob": -1.3242188,
+          "special": false,
+          "text": " have"
+        },
+        {
+          "id": 263,
+          "logprob": -0.2076416,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1243,
+          "logprob": -2.0273438,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.6845703,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 515,
+          "logprob": -1.1748047,
+          "special": false,
+          "text": " from"
+        },
+        {
+          "id": 263,
+          "logprob": -1.0595703,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1404,
+          "logprob": -1.5224609,
+          "special": false,
+          "text": " user"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nI have a test request from a user"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -12.390625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -11.0625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.0507812,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -2.3007812,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 29902,
+          "logprob": -2.0449219,
+          "special": false,
+          "text": "I"
+        },
+        {
+          "id": 505,
+          "logprob": -1.3242188,
+          "special": false,
+          "text": " have"
+        },
+        {
+          "id": 263,
+          "logprob": -0.2076416,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1243,
+          "logprob": -2.0273438,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.6845703,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 515,
+          "logprob": -1.1748047,
+          "special": false,
+          "text": " from"
+        },
+        {
+          "id": 263,
+          "logprob": -1.0595703,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1404,
+          "logprob": -1.5224609,
+          "special": false,
+          "text": " user"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nI have a test request from a user"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -12.390625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -11.0625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.0507812,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -2.3007812,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 29902,
+          "logprob": -2.0449219,
+          "special": false,
+          "text": "I"
+        },
+        {
+          "id": 505,
+          "logprob": -1.3242188,
+          "special": false,
+          "text": " have"
+        },
+        {
+          "id": 263,
+          "logprob": -0.2076416,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1243,
+          "logprob": -2.0273438,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.6845703,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 515,
+          "logprob": -1.1748047,
+          "special": false,
+          "text": " from"
+        },
+        {
+          "id": 263,
+          "logprob": -1.0595703,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1404,
+          "logprob": -1.5224609,
+          "special": false,
+          "text": " user"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nI have a test request from a user"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -12.390625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -11.0625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.0507812,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -2.3007812,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 29902,
+          "logprob": -2.0449219,
+          "special": false,
+          "text": "I"
+        },
+        {
+          "id": 505,
+          "logprob": -1.3242188,
+          "special": false,
+          "text": " have"
+        },
+        {
+          "id": 263,
+          "logprob": -0.2076416,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1243,
+          "logprob": -2.0273438,
+          "special": false,
+          "text": " test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.6845703,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 515,
+          "logprob": -1.1748047,
+          "special": false,
+          "text": " from"
+        },
+        {
+          "id": 263,
+          "logprob": -1.0595703,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1404,
+          "logprob": -1.5224609,
+          "special": false,
+          "text": " user"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nI have a test request from a user"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_all_params.json b/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_all_params.json
new file mode 100644
index 00000000..d8a298eb
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_all_params.json
@@ -0,0 +1,98 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 338,
+        "logprob": -10.0078125,
+        "text": "is"
+      },
+      {
+        "id": 21784,
+        "logprob": -15.515625,
+        "text": "Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -2.8847656,
+        "text": "Learning"
+      },
+      {
+        "id": 29973,
+        "logprob": -4.140625,
+        "text": "?"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.1582031,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 2772,
+        "logprob": -0.23083496,
+        "special": false,
+        "text": "De"
+      },
+      {
+        "id": 1022,
+        "logprob": 0.0,
+        "special": false,
+        "text": "ep"
+      },
+      {
+        "id": 6509,
+        "logprob": 0.0,
+        "special": false,
+        "text": " learning"
+      },
+      {
+        "id": 29892,
+        "logprob": -0.61816406,
+        "special": false,
+        "text": ","
+      },
+      {
+        "id": 607,
+        "logprob": -0.7089844,
+        "special": false,
+        "text": " which"
+      },
+      {
+        "id": 508,
+        "logprob": -1.7724609,
+        "special": false,
+        "text": " can"
+      },
+      {
+        "id": 367,
+        "logprob": 0.0,
+        "special": false,
+        "text": " be"
+      },
+      {
+        "id": 5545,
+        "logprob": 0.0,
+        "special": false,
+        "text": " considered"
+      },
+      {
+        "id": 408,
+        "logprob": -0.3869629,
+        "special": false,
+        "text": " as"
+      }
+    ]
+  },
+  "generated_text": "What is Deep Learning?\nDeep learning, which can be considered as"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_load.json b/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_load.json
new file mode 100644
index 00000000..413af1d7
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_load.json
@@ -0,0 +1,414 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -10.734375,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.5488281,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.2890625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.2753906,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.48046875,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.1845703,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2772,
+          "logprob": -0.5727539,
+          "special": false,
+          "text": "De"
+        },
+        {
+          "id": 1022,
+          "logprob": -0.00010967255,
+          "special": false,
+          "text": "ep"
+        },
+        {
+          "id": 6509,
+          "logprob": -0.1239624,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 338,
+          "logprob": -0.04510498,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 263,
+          "logprob": -0.018295288,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 11306,
+          "logprob": -0.45922852,
+          "special": false,
+          "text": " subset"
+        },
+        {
+          "id": 310,
+          "logprob": -0.00020992756,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 4933,
+          "logprob": -0.0046539307,
+          "special": false,
+          "text": " machine"
+        },
+        {
+          "id": 6509,
+          "logprob": -0.00025844574,
+          "special": false,
+          "text": " learning"
+        }
+      ]
+    },
+    "generated_text": "\nDeep learning is a subset of machine learning"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -10.734375,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.5488281,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.2890625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.2724609,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.47729492,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.1826172,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2772,
+          "logprob": -0.56689453,
+          "special": false,
+          "text": "De"
+        },
+        {
+          "id": 1022,
+          "logprob": -0.000108003616,
+          "special": false,
+          "text": "ep"
+        },
+        {
+          "id": 6509,
+          "logprob": -0.1239624,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 338,
+          "logprob": -0.044433594,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 263,
+          "logprob": -0.018295288,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 11306,
+          "logprob": -0.45922852,
+          "special": false,
+          "text": " subset"
+        },
+        {
+          "id": 310,
+          "logprob": -0.0002104044,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 4933,
+          "logprob": -0.004711151,
+          "special": false,
+          "text": " machine"
+        },
+        {
+          "id": 6509,
+          "logprob": -0.00025892258,
+          "special": false,
+          "text": " learning"
+        }
+      ]
+    },
+    "generated_text": "\nDeep learning is a subset of machine learning"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -10.734375,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.5488281,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.2890625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.2724609,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.47729492,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.1826172,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2772,
+          "logprob": -0.56689453,
+          "special": false,
+          "text": "De"
+        },
+        {
+          "id": 1022,
+          "logprob": -0.000108003616,
+          "special": false,
+          "text": "ep"
+        },
+        {
+          "id": 6509,
+          "logprob": -0.1239624,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 338,
+          "logprob": -0.044433594,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 263,
+          "logprob": -0.018295288,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 11306,
+          "logprob": -0.45922852,
+          "special": false,
+          "text": " subset"
+        },
+        {
+          "id": 310,
+          "logprob": -0.0002104044,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 4933,
+          "logprob": -0.004711151,
+          "special": false,
+          "text": " machine"
+        },
+        {
+          "id": 6509,
+          "logprob": -0.00025892258,
+          "special": false,
+          "text": " learning"
+        }
+      ]
+    },
+    "generated_text": "\nDeep learning is a subset of machine learning"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1724,
+          "logprob": -10.734375,
+          "text": "What"
+        },
+        {
+          "id": 338,
+          "logprob": -1.5488281,
+          "text": "is"
+        },
+        {
+          "id": 21784,
+          "logprob": -9.2890625,
+          "text": "Deep"
+        },
+        {
+          "id": 29257,
+          "logprob": -1.2724609,
+          "text": "Learning"
+        },
+        {
+          "id": 29973,
+          "logprob": -0.47729492,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -1.1826172,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2772,
+          "logprob": -0.56689453,
+          "special": false,
+          "text": "De"
+        },
+        {
+          "id": 1022,
+          "logprob": -0.000108003616,
+          "special": false,
+          "text": "ep"
+        },
+        {
+          "id": 6509,
+          "logprob": -0.1239624,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 338,
+          "logprob": -0.044433594,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 263,
+          "logprob": -0.018295288,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 11306,
+          "logprob": -0.45922852,
+          "special": false,
+          "text": " subset"
+        },
+        {
+          "id": 310,
+          "logprob": -0.0002104044,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 4933,
+          "logprob": -0.004711151,
+          "special": false,
+          "text": " machine"
+        },
+        {
+          "id": 6509,
+          "logprob": -0.00025892258,
+          "special": false,
+          "text": " learning"
+        }
+      ]
+    },
+    "generated_text": "\nDeep learning is a subset of machine learning"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_simple.json b/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_simple.json
new file mode 100644
index 00000000..15754b14
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_medusa/test_flash_medusa_simple.json
@@ -0,0 +1,103 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 1724,
+        "logprob": -10.734375,
+        "text": "What"
+      },
+      {
+        "id": 338,
+        "logprob": -1.5488281,
+        "text": "is"
+      },
+      {
+        "id": 21784,
+        "logprob": -9.2890625,
+        "text": "Deep"
+      },
+      {
+        "id": 29257,
+        "logprob": -1.2753906,
+        "text": "Learning"
+      },
+      {
+        "id": 29973,
+        "logprob": -0.48046875,
+        "text": "?"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.1845703,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 2772,
+        "logprob": -0.5727539,
+        "special": false,
+        "text": "De"
+      },
+      {
+        "id": 1022,
+        "logprob": -0.000108122826,
+        "special": false,
+        "text": "ep"
+      },
+      {
+        "id": 6509,
+        "logprob": -0.1239624,
+        "special": false,
+        "text": " learning"
+      },
+      {
+        "id": 338,
+        "logprob": -0.044433594,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 263,
+        "logprob": -0.01852417,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 11306,
+        "logprob": -0.45922852,
+        "special": false,
+        "text": " subset"
+      },
+      {
+        "id": 310,
+        "logprob": -0.0002104044,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 4933,
+        "logprob": -0.004787445,
+        "special": false,
+        "text": " machine"
+      },
+      {
+        "id": 6509,
+        "logprob": -0.00026226044,
+        "special": false,
+        "text": " learning"
+      }
+    ]
+  },
+  "generated_text": "\nDeep learning is a subset of machine learning"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral.json b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral.json
new file mode 100644
index 00000000..4e7de9a6
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 3735,
+        "logprob": -12.9140625,
+        "text": "Test"
+      },
+      {
+        "id": 2159,
+        "logprob": -10.7578125,
+        "text": "request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 28747,
+        "logprob": -0.54785156,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 3169,
+        "logprob": -1.4091797,
+        "special": false,
+        "text": " Let"
+      },
+      {
+        "id": 307,
+        "logprob": -3.0273438,
+        "special": false,
+        "text": " n"
+      },
+      {
+        "id": 327,
+        "logprob": -0.94433594,
+        "special": false,
+        "text": " ="
+      },
+      {
+        "id": 28705,
+        "logprob": -0.81347656,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28740,
+        "logprob": -1.2958984,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 28734,
+        "logprob": -2.0644531,
+        "special": false,
+        "text": "0"
+      },
+      {
+        "id": 387,
+        "logprob": -1.9580078,
+        "special": false,
+        "text": " -"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.5073242,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28740,
+        "logprob": -1.1816406,
+        "special": false,
+        "text": "1"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": ": Let n = 10 - 1"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_all_params.json b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_all_params.json
new file mode 100644
index 00000000..c0dc6471
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_all_params.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 3735,
+        "logprob": -12.9140625,
+        "text": "Test"
+      },
+      {
+        "id": 2159,
+        "logprob": -10.7578125,
+        "text": "request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 28747,
+        "logprob": 0.0,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 3169,
+        "logprob": -0.1307373,
+        "special": false,
+        "text": " Let"
+      },
+      {
+        "id": 332,
+        "logprob": -2.3359375,
+        "special": false,
+        "text": " u"
+      },
+      {
+        "id": 347,
+        "logprob": 0.0,
+        "special": false,
+        "text": " be"
+      },
+      {
+        "id": 325,
+        "logprob": -1.0234375,
+        "special": false,
+        "text": " ("
+      },
+      {
+        "id": 28734,
+        "logprob": -2.0292969,
+        "special": false,
+        "text": "0"
+      },
+      {
+        "id": 648,
+        "logprob": -1.0439453,
+        "special": false,
+        "text": " +"
+      },
+      {
+        "id": 28705,
+        "logprob": -0.24499512,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 28770,
+        "logprob": -0.5073242,
+        "special": false,
+        "text": "3"
+      },
+      {
+        "id": 387,
+        "logprob": -1.5507812,
+        "special": false,
+        "text": " -"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request: Let u be (0 + 3 -"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_load.json b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_load.json
new file mode 100644
index 00000000..9d133077
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_mistral/test_flash_mistral_load.json
@@ -0,0 +1,358 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 3735,
+          "logprob": -12.9140625,
+          "text": "Test"
+        },
+        {
+          "id": 2159,
+          "logprob": -10.7578125,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 28747,
+          "logprob": -0.55078125,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 3169,
+          "logprob": -1.4140625,
+          "special": false,
+          "text": " Let"
+        },
+        {
+          "id": 307,
+          "logprob": -3.0273438,
+          "special": false,
+          "text": " n"
+        },
+        {
+          "id": 327,
+          "logprob": -0.94140625,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 28705,
+          "logprob": -0.8173828,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 28740,
+          "logprob": -1.2978516,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 28734,
+          "logprob": -2.0664062,
+          "special": false,
+          "text": "0"
+        },
+        {
+          "id": 387,
+          "logprob": -1.9560547,
+          "special": false,
+          "text": " -"
+        },
+        {
+          "id": 28705,
+          "logprob": -0.5078125,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 28740,
+          "logprob": -1.1787109,
+          "special": false,
+          "text": "1"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": Let n = 10 - 1"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 3735,
+          "logprob": -12.9140625,
+          "text": "Test"
+        },
+        {
+          "id": 2159,
+          "logprob": -10.7578125,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 28747,
+          "logprob": -0.54785156,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 3169,
+          "logprob": -1.4111328,
+          "special": false,
+          "text": " Let"
+        },
+        {
+          "id": 307,
+          "logprob": -3.0292969,
+          "special": false,
+          "text": " n"
+        },
+        {
+          "id": 327,
+          "logprob": -0.94433594,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 28705,
+          "logprob": -0.8178711,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 28740,
+          "logprob": -1.2939453,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 28734,
+          "logprob": -2.0644531,
+          "special": false,
+          "text": "0"
+        },
+        {
+          "id": 387,
+          "logprob": -1.9550781,
+          "special": false,
+          "text": " -"
+        },
+        {
+          "id": 28705,
+          "logprob": -0.5078125,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 28740,
+          "logprob": -1.1796875,
+          "special": false,
+          "text": "1"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": Let n = 10 - 1"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 3735,
+          "logprob": -12.9140625,
+          "text": "Test"
+        },
+        {
+          "id": 2159,
+          "logprob": -10.7578125,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 28747,
+          "logprob": -0.55078125,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 3169,
+          "logprob": -1.4140625,
+          "special": false,
+          "text": " Let"
+        },
+        {
+          "id": 307,
+          "logprob": -3.0273438,
+          "special": false,
+          "text": " n"
+        },
+        {
+          "id": 327,
+          "logprob": -0.94140625,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 28705,
+          "logprob": -0.8173828,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 28740,
+          "logprob": -1.2978516,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 28734,
+          "logprob": -2.0664062,
+          "special": false,
+          "text": "0"
+        },
+        {
+          "id": 387,
+          "logprob": -1.9560547,
+          "special": false,
+          "text": " -"
+        },
+        {
+          "id": 28705,
+          "logprob": -0.5078125,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 28740,
+          "logprob": -1.1787109,
+          "special": false,
+          "text": "1"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": Let n = 10 - 1"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 3735,
+          "logprob": -12.9140625,
+          "text": "Test"
+        },
+        {
+          "id": 2159,
+          "logprob": -10.7578125,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 28747,
+          "logprob": -0.55078125,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 3169,
+          "logprob": -1.4140625,
+          "special": false,
+          "text": " Let"
+        },
+        {
+          "id": 307,
+          "logprob": -3.0273438,
+          "special": false,
+          "text": " n"
+        },
+        {
+          "id": 327,
+          "logprob": -0.94140625,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 28705,
+          "logprob": -0.8173828,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 28740,
+          "logprob": -1.2978516,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 28734,
+          "logprob": -2.0664062,
+          "special": false,
+          "text": "0"
+        },
+        {
+          "id": 387,
+          "logprob": -1.9560547,
+          "special": false,
+          "text": " -"
+        },
+        {
+          "id": 28705,
+          "logprob": -0.5078125,
+          "special": false,
+          "text": " "
+        },
+        {
+          "id": 28740,
+          "logprob": -1.1787109,
+          "special": false,
+          "text": "1"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": Let n = 10 - 1"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json
new file mode 100644
index 00000000..037e0b16
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma.json
@@ -0,0 +1,25 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "eos_token",
+    "generated_tokens": 2,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 54901,
+        "logprob": -0.72753906,
+        "special": false,
+        "text": "beach"
+      },
+      {
+        "id": 1,
+        "logprob": -0.011009216,
+        "special": true,
+        "text": "<eos>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "beach"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json
new file mode 100644
index 00000000..ab4f3015
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json
@@ -0,0 +1,61 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "eos_token",
+    "generated_tokens": 8,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 2502,
+        "logprob": -1.734375,
+        "special": false,
+        "text": "image"
+      },
+      {
+        "id": 2196,
+        "logprob": -0.5756836,
+        "special": false,
+        "text": " result"
+      },
+      {
+        "id": 604,
+        "logprob": -0.007843018,
+        "special": false,
+        "text": " for"
+      },
+      {
+        "id": 12254,
+        "logprob": -1.7167969,
+        "special": false,
+        "text": " chicken"
+      },
+      {
+        "id": 611,
+        "logprob": -0.17053223,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 573,
+        "logprob": -0.7626953,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 8318,
+        "logprob": -0.02709961,
+        "special": false,
+        "text": " beach"
+      },
+      {
+        "id": 1,
+        "logprob": -0.20739746,
+        "special": true,
+        "text": "<eos>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "image result for chicken on the beach"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi.json b/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi.json
new file mode 100644
index 00000000..51d969b2
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi.json
@@ -0,0 +1,84 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 14402,
+        "logprob": null,
+        "text": "Test"
+      },
+      {
+        "id": 2581,
+        "logprob": -11.6171875,
+        "text": " request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 25,
+        "logprob": -2.3203125,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 1391,
+        "logprob": -0.98779297,
+        "special": false,
+        "text": " {"
+      },
+      {
+        "id": 25927,
+        "logprob": -0.76660156,
+        "special": false,
+        "text": "request"
+      },
+      {
+        "id": 92,
+        "logprob": -0.7246094,
+        "special": false,
+        "text": "}"
+      },
+      {
+        "id": 4943,
+        "logprob": -0.41333008,
+        "special": false,
+        "text": "\")"
+      },
+      {
+        "id": 198,
+        "logprob": -0.11785889,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 50280,
+        "logprob": -0.97265625,
+        "special": false,
+        "text": "        "
+      },
+      {
+        "id": 26209,
+        "logprob": -1.4414062,
+        "special": false,
+        "text": "response"
+      },
+      {
+        "id": 796,
+        "logprob": -0.0569458,
+        "special": false,
+        "text": " ="
+      },
+      {
+        "id": 2116,
+        "logprob": -1.1533203,
+        "special": false,
+        "text": " self"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": ": {request}\")\n        response = self"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi_all_params.json b/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi_all_params.json
new file mode 100644
index 00000000..221ff13d
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi_all_params.json
@@ -0,0 +1,60 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "stop_sequence",
+    "generated_tokens": 6,
+    "prefill": [
+      {
+        "id": 14402,
+        "logprob": null,
+        "text": "Test"
+      },
+      {
+        "id": 2581,
+        "logprob": -11.6171875,
+        "text": " request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 284,
+        "logprob": -0.19421387,
+        "special": false,
+        "text": " to"
+      },
+      {
+        "id": 3758,
+        "logprob": -0.62597656,
+        "special": false,
+        "text": " send"
+      },
+      {
+        "id": 1366,
+        "logprob": -0.87060547,
+        "special": false,
+        "text": " data"
+      },
+      {
+        "id": 625,
+        "logprob": -0.88427734,
+        "special": false,
+        "text": " over"
+      },
+      {
+        "id": 257,
+        "logprob": -1.0830078,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 3127,
+        "logprob": -1.9462891,
+        "special": false,
+        "text": " network"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request to send data over a network"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi_load.json b/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi_load.json
new file mode 100644
index 00000000..62f7fd32
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_phi/test_flash_phi_load.json
@@ -0,0 +1,338 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 14402,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 2581,
+          "logprob": -11.6171875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25,
+          "logprob": -2.3203125,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 1391,
+          "logprob": -0.98779297,
+          "special": false,
+          "text": " {"
+        },
+        {
+          "id": 25927,
+          "logprob": -0.7729492,
+          "special": false,
+          "text": "request"
+        },
+        {
+          "id": 92,
+          "logprob": -0.7241211,
+          "special": false,
+          "text": "}"
+        },
+        {
+          "id": 4943,
+          "logprob": -0.4091797,
+          "special": false,
+          "text": "\")"
+        },
+        {
+          "id": 198,
+          "logprob": -0.119018555,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 50280,
+          "logprob": -0.9707031,
+          "special": false,
+          "text": "        "
+        },
+        {
+          "id": 26209,
+          "logprob": -1.4414062,
+          "special": false,
+          "text": "response"
+        },
+        {
+          "id": 796,
+          "logprob": -0.056854248,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 2116,
+          "logprob": -1.1533203,
+          "special": false,
+          "text": " self"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": {request}\")\n        response = self"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 14402,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 2581,
+          "logprob": -11.6171875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25,
+          "logprob": -2.3203125,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 1391,
+          "logprob": -0.98779297,
+          "special": false,
+          "text": " {"
+        },
+        {
+          "id": 25927,
+          "logprob": -0.7729492,
+          "special": false,
+          "text": "request"
+        },
+        {
+          "id": 92,
+          "logprob": -0.7241211,
+          "special": false,
+          "text": "}"
+        },
+        {
+          "id": 4943,
+          "logprob": -0.4091797,
+          "special": false,
+          "text": "\")"
+        },
+        {
+          "id": 198,
+          "logprob": -0.119018555,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 50280,
+          "logprob": -0.9707031,
+          "special": false,
+          "text": "        "
+        },
+        {
+          "id": 26209,
+          "logprob": -1.4414062,
+          "special": false,
+          "text": "response"
+        },
+        {
+          "id": 796,
+          "logprob": -0.056854248,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 2116,
+          "logprob": -1.1533203,
+          "special": false,
+          "text": " self"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": {request}\")\n        response = self"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 14402,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 2581,
+          "logprob": -11.6171875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25,
+          "logprob": -2.3203125,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 1391,
+          "logprob": -0.98779297,
+          "special": false,
+          "text": " {"
+        },
+        {
+          "id": 25927,
+          "logprob": -0.7729492,
+          "special": false,
+          "text": "request"
+        },
+        {
+          "id": 92,
+          "logprob": -0.7241211,
+          "special": false,
+          "text": "}"
+        },
+        {
+          "id": 4943,
+          "logprob": -0.4091797,
+          "special": false,
+          "text": "\")"
+        },
+        {
+          "id": 198,
+          "logprob": -0.119018555,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 50280,
+          "logprob": -0.9707031,
+          "special": false,
+          "text": "        "
+        },
+        {
+          "id": 26209,
+          "logprob": -1.4414062,
+          "special": false,
+          "text": "response"
+        },
+        {
+          "id": 796,
+          "logprob": -0.056854248,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 2116,
+          "logprob": -1.1533203,
+          "special": false,
+          "text": " self"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": {request}\")\n        response = self"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 14402,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 2581,
+          "logprob": -11.6171875,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 25,
+          "logprob": -2.3203125,
+          "special": false,
+          "text": ":"
+        },
+        {
+          "id": 1391,
+          "logprob": -0.98779297,
+          "special": false,
+          "text": " {"
+        },
+        {
+          "id": 25927,
+          "logprob": -0.7729492,
+          "special": false,
+          "text": "request"
+        },
+        {
+          "id": 92,
+          "logprob": -0.7241211,
+          "special": false,
+          "text": "}"
+        },
+        {
+          "id": 4943,
+          "logprob": -0.4091797,
+          "special": false,
+          "text": "\")"
+        },
+        {
+          "id": 198,
+          "logprob": -0.119018555,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 50280,
+          "logprob": -0.9707031,
+          "special": false,
+          "text": "        "
+        },
+        {
+          "id": 26209,
+          "logprob": -1.4414062,
+          "special": false,
+          "text": "response"
+        },
+        {
+          "id": 796,
+          "logprob": -0.056854248,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 2116,
+          "logprob": -1.1533203,
+          "special": false,
+          "text": " self"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": ": {request}\")\n        response = self"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2.json b/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2.json
new file mode 100644
index 00000000..7219f9e6
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2.json
@@ -0,0 +1,84 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2271,
+        "logprob": null,
+        "text": "Test"
+      },
+      {
+        "id": 1681,
+        "logprob": -8.8515625,
+        "text": " request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 198,
+        "logprob": -2.9023438,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 2,
+        "logprob": -2.9160156,
+        "special": false,
+        "text": "#"
+      },
+      {
+        "id": 4230,
+        "logprob": -3.1035156,
+        "special": false,
+        "text": " Create"
+      },
+      {
+        "id": 264,
+        "logprob": -1.1025391,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 1681,
+        "logprob": -1.6914062,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 198,
+        "logprob": -1.1953125,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 2035,
+        "logprob": -1.3203125,
+        "special": false,
+        "text": "request"
+      },
+      {
+        "id": 284,
+        "logprob": -0.13537598,
+        "special": false,
+        "text": " ="
+      },
+      {
+        "id": 7388,
+        "logprob": -1.2402344,
+        "special": false,
+        "text": " requests"
+      },
+      {
+        "id": 670,
+        "logprob": -0.2775879,
+        "special": false,
+        "text": ".get"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\n# Create a request\nrequest = requests.get"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2_all_params.json b/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2_all_params.json
new file mode 100644
index 00000000..4a2936af
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2_all_params.json
@@ -0,0 +1,84 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2271,
+        "logprob": null,
+        "text": "Test"
+      },
+      {
+        "id": 1681,
+        "logprob": -8.8515625,
+        "text": " request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 311,
+        "logprob": -1.4277344,
+        "special": false,
+        "text": " to"
+      },
+      {
+        "id": 279,
+        "logprob": -0.65478516,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 2473,
+        "logprob": -1.8300781,
+        "special": false,
+        "text": " service"
+      },
+      {
+        "id": 382,
+        "logprob": -0.75,
+        "special": false,
+        "text": ".\n\n"
+      },
+      {
+        "id": 286,
+        "logprob": -0.11621094,
+        "special": false,
+        "text": "       "
+      },
+      {
+        "id": 549,
+        "logprob": 0.0,
+        "special": false,
+        "text": " :"
+      },
+      {
+        "id": 689,
+        "logprob": -0.48608398,
+        "special": false,
+        "text": "return"
+      },
+      {
+        "id": 25,
+        "logprob": 0.0,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 5949,
+        "logprob": -0.5756836,
+        "special": false,
+        "text": " Response"
+      },
+      {
+        "id": 504,
+        "logprob": -0.24499512,
+        "special": false,
+        "text": " from"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request to the service.\n\n        :return: Response from"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2_load.json b/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2_load.json
new file mode 100644
index 00000000..4786ff24
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_qwen2/test_flash_qwen2_load.json
@@ -0,0 +1,338 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2271,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 1681,
+          "logprob": -8.8515625,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 198,
+          "logprob": -2.9023438,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2,
+          "logprob": -2.9140625,
+          "special": false,
+          "text": "#"
+        },
+        {
+          "id": 4230,
+          "logprob": -3.1054688,
+          "special": false,
+          "text": " Create"
+        },
+        {
+          "id": 264,
+          "logprob": -1.0966797,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1681,
+          "logprob": -1.6914062,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 198,
+          "logprob": -1.1923828,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2035,
+          "logprob": -1.3193359,
+          "special": false,
+          "text": "request"
+        },
+        {
+          "id": 284,
+          "logprob": -0.13586426,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 7388,
+          "logprob": -1.2412109,
+          "special": false,
+          "text": " requests"
+        },
+        {
+          "id": 670,
+          "logprob": -0.2775879,
+          "special": false,
+          "text": ".get"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n# Create a request\nrequest = requests.get"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2271,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 1681,
+          "logprob": -8.8515625,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 198,
+          "logprob": -2.9023438,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2,
+          "logprob": -2.9140625,
+          "special": false,
+          "text": "#"
+        },
+        {
+          "id": 4230,
+          "logprob": -3.1054688,
+          "special": false,
+          "text": " Create"
+        },
+        {
+          "id": 264,
+          "logprob": -1.0966797,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1681,
+          "logprob": -1.6914062,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 198,
+          "logprob": -1.1923828,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2035,
+          "logprob": -1.3193359,
+          "special": false,
+          "text": "request"
+        },
+        {
+          "id": 284,
+          "logprob": -0.13586426,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 7388,
+          "logprob": -1.2412109,
+          "special": false,
+          "text": " requests"
+        },
+        {
+          "id": 670,
+          "logprob": -0.2775879,
+          "special": false,
+          "text": ".get"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n# Create a request\nrequest = requests.get"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2271,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 1681,
+          "logprob": -8.8515625,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 198,
+          "logprob": -2.9023438,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2,
+          "logprob": -2.9140625,
+          "special": false,
+          "text": "#"
+        },
+        {
+          "id": 4230,
+          "logprob": -3.1054688,
+          "special": false,
+          "text": " Create"
+        },
+        {
+          "id": 264,
+          "logprob": -1.0966797,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1681,
+          "logprob": -1.6914062,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 198,
+          "logprob": -1.1923828,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2035,
+          "logprob": -1.3193359,
+          "special": false,
+          "text": "request"
+        },
+        {
+          "id": 284,
+          "logprob": -0.13586426,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 7388,
+          "logprob": -1.2412109,
+          "special": false,
+          "text": " requests"
+        },
+        {
+          "id": 670,
+          "logprob": -0.2775879,
+          "special": false,
+          "text": ".get"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n# Create a request\nrequest = requests.get"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 2271,
+          "logprob": null,
+          "text": "Test"
+        },
+        {
+          "id": 1681,
+          "logprob": -8.8515625,
+          "text": " request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 198,
+          "logprob": -2.9023438,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2,
+          "logprob": -2.9140625,
+          "special": false,
+          "text": "#"
+        },
+        {
+          "id": 4230,
+          "logprob": -3.1054688,
+          "special": false,
+          "text": " Create"
+        },
+        {
+          "id": 264,
+          "logprob": -1.0966797,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 1681,
+          "logprob": -1.6914062,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 198,
+          "logprob": -1.1923828,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 2035,
+          "logprob": -1.3193359,
+          "special": false,
+          "text": "request"
+        },
+        {
+          "id": 284,
+          "logprob": -0.13586426,
+          "special": false,
+          "text": " ="
+        },
+        {
+          "id": 7388,
+          "logprob": -1.2412109,
+          "special": false,
+          "text": " requests"
+        },
+        {
+          "id": 670,
+          "logprob": -0.2775879,
+          "special": false,
+          "text": ".get"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n# Create a request\nrequest = requests.get"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2.json b/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2.json
new file mode 100644
index 00000000..36a2ff4d
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2.json
@@ -0,0 +1,94 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 610,
+        "logprob": null,
+        "text": "def"
+      },
+      {
+        "id": 1489,
+        "logprob": -5.2617188,
+        "text": " print"
+      },
+      {
+        "id": 100,
+        "logprob": -0.38476562,
+        "text": "_"
+      },
+      {
+        "id": 7670,
+        "logprob": -7.640625,
+        "text": "hello"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 2284,
+        "logprob": -0.92626953,
+        "special": false,
+        "text": "():"
+      },
+      {
+        "id": 303,
+        "logprob": -0.40844727,
+        "special": false,
+        "text": "\n   "
+      },
+      {
+        "id": 1489,
+        "logprob": -0.27905273,
+        "special": false,
+        "text": " print"
+      },
+      {
+        "id": 459,
+        "logprob": -0.6118164,
+        "special": false,
+        "text": "(\""
+      },
+      {
+        "id": 8302,
+        "logprob": -0.68652344,
+        "special": false,
+        "text": "Hello"
+      },
+      {
+        "id": 10914,
+        "logprob": -1.4619141,
+        "special": false,
+        "text": " World"
+      },
+      {
+        "id": 16013,
+        "logprob": -0.7993164,
+        "special": false,
+        "text": "!\")"
+      },
+      {
+        "id": 222,
+        "logprob": -0.63134766,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 222,
+        "logprob": -0.23278809,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 610,
+        "logprob": -1.2294922,
+        "special": false,
+        "text": "def"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_default_params.json b/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_default_params.json
new file mode 100644
index 00000000..38117272
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_default_params.json
@@ -0,0 +1,394 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 60,
+    "prefill": [
+      {
+        "id": 610,
+        "logprob": null,
+        "text": "def"
+      },
+      {
+        "id": 1489,
+        "logprob": -5.2617188,
+        "text": " print"
+      },
+      {
+        "id": 100,
+        "logprob": -0.38476562,
+        "text": "_"
+      },
+      {
+        "id": 7670,
+        "logprob": -7.640625,
+        "text": "hello"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 2284,
+        "logprob": -0.296875,
+        "special": false,
+        "text": "():"
+      },
+      {
+        "id": 303,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n   "
+      },
+      {
+        "id": 1489,
+        "logprob": 0.0,
+        "special": false,
+        "text": " print"
+      },
+      {
+        "id": 459,
+        "logprob": 0.0,
+        "special": false,
+        "text": "(\""
+      },
+      {
+        "id": 8302,
+        "logprob": -0.28125,
+        "special": false,
+        "text": "Hello"
+      },
+      {
+        "id": 10914,
+        "logprob": -0.79248047,
+        "special": false,
+        "text": " World"
+      },
+      {
+        "id": 16013,
+        "logprob": -0.61816406,
+        "special": false,
+        "text": "!\")"
+      },
+      {
+        "id": 222,
+        "logprob": -0.0619812,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 222,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 610,
+        "logprob": -0.4091797,
+        "special": false,
+        "text": "def"
+      },
+      {
+        "id": 1489,
+        "logprob": 0.0,
+        "special": false,
+        "text": " print"
+      },
+      {
+        "id": 100,
+        "logprob": 0.0,
+        "special": false,
+        "text": "_"
+      },
+      {
+        "id": 7670,
+        "logprob": 0.0,
+        "special": false,
+        "text": "hello"
+      },
+      {
+        "id": 100,
+        "logprob": 0.0,
+        "special": false,
+        "text": "_"
+      },
+      {
+        "id": 444,
+        "logprob": -0.21655273,
+        "special": false,
+        "text": "name"
+      },
+      {
+        "id": 45,
+        "logprob": 0.0,
+        "special": false,
+        "text": "("
+      },
+      {
+        "id": 444,
+        "logprob": 0.0,
+        "special": false,
+        "text": "name"
+      },
+      {
+        "id": 731,
+        "logprob": 0.0,
+        "special": false,
+        "text": "):"
+      },
+      {
+        "id": 303,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n   "
+      },
+      {
+        "id": 1489,
+        "logprob": 0.0,
+        "special": false,
+        "text": " print"
+      },
+      {
+        "id": 459,
+        "logprob": 0.0,
+        "special": false,
+        "text": "(\""
+      },
+      {
+        "id": 8302,
+        "logprob": 0.0,
+        "special": false,
+        "text": "Hello"
+      },
+      {
+        "id": 332,
+        "logprob": -0.034698486,
+        "special": false,
+        "text": " \""
+      },
+      {
+        "id": 494,
+        "logprob": 0.0,
+        "special": false,
+        "text": " +"
+      },
+      {
+        "id": 655,
+        "logprob": 0.0,
+        "special": false,
+        "text": " name"
+      },
+      {
+        "id": 494,
+        "logprob": -0.20141602,
+        "special": false,
+        "text": " +"
+      },
+      {
+        "id": 332,
+        "logprob": 0.0,
+        "special": false,
+        "text": " \""
+      },
+      {
+        "id": 16013,
+        "logprob": 0.0,
+        "special": false,
+        "text": "!\")"
+      },
+      {
+        "id": 222,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 222,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 610,
+        "logprob": 0.0,
+        "special": false,
+        "text": "def"
+      },
+      {
+        "id": 1489,
+        "logprob": 0.0,
+        "special": false,
+        "text": " print"
+      },
+      {
+        "id": 100,
+        "logprob": 0.0,
+        "special": false,
+        "text": "_"
+      },
+      {
+        "id": 7670,
+        "logprob": 0.0,
+        "special": false,
+        "text": "hello"
+      },
+      {
+        "id": 100,
+        "logprob": 0.0,
+        "special": false,
+        "text": "_"
+      },
+      {
+        "id": 444,
+        "logprob": 0.0,
+        "special": false,
+        "text": "name"
+      },
+      {
+        "id": 100,
+        "logprob": 0.0,
+        "special": false,
+        "text": "_"
+      },
+      {
+        "id": 400,
+        "logprob": 0.0,
+        "special": false,
+        "text": "age"
+      },
+      {
+        "id": 45,
+        "logprob": 0.0,
+        "special": false,
+        "text": "("
+      },
+      {
+        "id": 444,
+        "logprob": 0.0,
+        "special": false,
+        "text": "name"
+      },
+      {
+        "id": 49,
+        "logprob": 0.0,
+        "special": false,
+        "text": ","
+      },
+      {
+        "id": 11505,
+        "logprob": 0.0,
+        "special": false,
+        "text": " age"
+      },
+      {
+        "id": 731,
+        "logprob": 0.0,
+        "special": false,
+        "text": "):"
+      },
+      {
+        "id": 303,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n   "
+      },
+      {
+        "id": 1489,
+        "logprob": 0.0,
+        "special": false,
+        "text": " print"
+      },
+      {
+        "id": 459,
+        "logprob": 0.0,
+        "special": false,
+        "text": "(\""
+      },
+      {
+        "id": 8302,
+        "logprob": 0.0,
+        "special": false,
+        "text": "Hello"
+      },
+      {
+        "id": 332,
+        "logprob": 0.0,
+        "special": false,
+        "text": " \""
+      },
+      {
+        "id": 494,
+        "logprob": 0.0,
+        "special": false,
+        "text": " +"
+      },
+      {
+        "id": 655,
+        "logprob": 0.0,
+        "special": false,
+        "text": " name"
+      },
+      {
+        "id": 494,
+        "logprob": 0.0,
+        "special": false,
+        "text": " +"
+      },
+      {
+        "id": 3021,
+        "logprob": -0.5761719,
+        "special": false,
+        "text": " \","
+      },
+      {
+        "id": 863,
+        "logprob": 0.0,
+        "special": false,
+        "text": " you"
+      },
+      {
+        "id": 904,
+        "logprob": 0.0,
+        "special": false,
+        "text": " are"
+      },
+      {
+        "id": 332,
+        "logprob": 0.0,
+        "special": false,
+        "text": " \""
+      },
+      {
+        "id": 494,
+        "logprob": 0.0,
+        "special": false,
+        "text": " +"
+      },
+      {
+        "id": 615,
+        "logprob": 0.0,
+        "special": false,
+        "text": " str"
+      },
+      {
+        "id": 45,
+        "logprob": 0.0,
+        "special": false,
+        "text": "("
+      },
+      {
+        "id": 400,
+        "logprob": 0.0,
+        "special": false,
+        "text": "age"
+      },
+      {
+        "id": 46,
+        "logprob": 0.0,
+        "special": false,
+        "text": ")"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "():\n    print(\"Hello World!\")\n\ndef print_hello_name(name):\n    print(\"Hello \" + name + \"!\")\n\ndef print_hello_name_age(name, age):\n    print(\"Hello \" + name + \", you are \" + str(age)"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_load.json b/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_load.json
new file mode 100644
index 00000000..9e82d4be
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_starcoder2/test_flash_starcoder2_load.json
@@ -0,0 +1,378 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 610,
+          "logprob": null,
+          "text": "def"
+        },
+        {
+          "id": 1489,
+          "logprob": -5.2617188,
+          "text": " print"
+        },
+        {
+          "id": 100,
+          "logprob": -0.38476562,
+          "text": "_"
+        },
+        {
+          "id": 7670,
+          "logprob": -7.640625,
+          "text": "hello"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 2284,
+          "logprob": -0.92626953,
+          "special": false,
+          "text": "():"
+        },
+        {
+          "id": 303,
+          "logprob": -0.40722656,
+          "special": false,
+          "text": "\n   "
+        },
+        {
+          "id": 1489,
+          "logprob": -0.27954102,
+          "special": false,
+          "text": " print"
+        },
+        {
+          "id": 459,
+          "logprob": -0.6142578,
+          "special": false,
+          "text": "(\""
+        },
+        {
+          "id": 8302,
+          "logprob": -0.68310547,
+          "special": false,
+          "text": "Hello"
+        },
+        {
+          "id": 10914,
+          "logprob": -1.4570312,
+          "special": false,
+          "text": " World"
+        },
+        {
+          "id": 16013,
+          "logprob": -0.80126953,
+          "special": false,
+          "text": "!\")"
+        },
+        {
+          "id": 222,
+          "logprob": -0.6303711,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 222,
+          "logprob": -0.23327637,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 610,
+          "logprob": -1.2304688,
+          "special": false,
+          "text": "def"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 610,
+          "logprob": null,
+          "text": "def"
+        },
+        {
+          "id": 1489,
+          "logprob": -5.2617188,
+          "text": " print"
+        },
+        {
+          "id": 100,
+          "logprob": -0.38476562,
+          "text": "_"
+        },
+        {
+          "id": 7670,
+          "logprob": -7.640625,
+          "text": "hello"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 2284,
+          "logprob": -0.92626953,
+          "special": false,
+          "text": "():"
+        },
+        {
+          "id": 303,
+          "logprob": -0.40722656,
+          "special": false,
+          "text": "\n   "
+        },
+        {
+          "id": 1489,
+          "logprob": -0.27954102,
+          "special": false,
+          "text": " print"
+        },
+        {
+          "id": 459,
+          "logprob": -0.6142578,
+          "special": false,
+          "text": "(\""
+        },
+        {
+          "id": 8302,
+          "logprob": -0.68310547,
+          "special": false,
+          "text": "Hello"
+        },
+        {
+          "id": 10914,
+          "logprob": -1.4570312,
+          "special": false,
+          "text": " World"
+        },
+        {
+          "id": 16013,
+          "logprob": -0.80126953,
+          "special": false,
+          "text": "!\")"
+        },
+        {
+          "id": 222,
+          "logprob": -0.6303711,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 222,
+          "logprob": -0.23327637,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 610,
+          "logprob": -1.2304688,
+          "special": false,
+          "text": "def"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 610,
+          "logprob": null,
+          "text": "def"
+        },
+        {
+          "id": 1489,
+          "logprob": -5.2617188,
+          "text": " print"
+        },
+        {
+          "id": 100,
+          "logprob": -0.38476562,
+          "text": "_"
+        },
+        {
+          "id": 7670,
+          "logprob": -7.640625,
+          "text": "hello"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 2284,
+          "logprob": -0.92626953,
+          "special": false,
+          "text": "():"
+        },
+        {
+          "id": 303,
+          "logprob": -0.40722656,
+          "special": false,
+          "text": "\n   "
+        },
+        {
+          "id": 1489,
+          "logprob": -0.27954102,
+          "special": false,
+          "text": " print"
+        },
+        {
+          "id": 459,
+          "logprob": -0.6142578,
+          "special": false,
+          "text": "(\""
+        },
+        {
+          "id": 8302,
+          "logprob": -0.68310547,
+          "special": false,
+          "text": "Hello"
+        },
+        {
+          "id": 10914,
+          "logprob": -1.4570312,
+          "special": false,
+          "text": " World"
+        },
+        {
+          "id": 16013,
+          "logprob": -0.80126953,
+          "special": false,
+          "text": "!\")"
+        },
+        {
+          "id": 222,
+          "logprob": -0.6303711,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 222,
+          "logprob": -0.23327637,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 610,
+          "logprob": -1.2304688,
+          "special": false,
+          "text": "def"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 610,
+          "logprob": null,
+          "text": "def"
+        },
+        {
+          "id": 1489,
+          "logprob": -5.2617188,
+          "text": " print"
+        },
+        {
+          "id": 100,
+          "logprob": -0.38476562,
+          "text": "_"
+        },
+        {
+          "id": 7670,
+          "logprob": -7.640625,
+          "text": "hello"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 2284,
+          "logprob": -0.92626953,
+          "special": false,
+          "text": "():"
+        },
+        {
+          "id": 303,
+          "logprob": -0.40722656,
+          "special": false,
+          "text": "\n   "
+        },
+        {
+          "id": 1489,
+          "logprob": -0.27954102,
+          "special": false,
+          "text": " print"
+        },
+        {
+          "id": 459,
+          "logprob": -0.6142578,
+          "special": false,
+          "text": "(\""
+        },
+        {
+          "id": 8302,
+          "logprob": -0.68310547,
+          "special": false,
+          "text": "Hello"
+        },
+        {
+          "id": 10914,
+          "logprob": -1.4570312,
+          "special": false,
+          "text": " World"
+        },
+        {
+          "id": 16013,
+          "logprob": -0.80126953,
+          "special": false,
+          "text": "!\")"
+        },
+        {
+          "id": 222,
+          "logprob": -0.6303711,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 222,
+          "logprob": -0.23327637,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 610,
+          "logprob": -1.2304688,
+          "special": false,
+          "text": "def"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "():\n    print(\"Hello World!\")\n\ndef"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json
index 53055e42..5e537bb7 100644
--- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json
+++ b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json
@@ -1,193 +1,194 @@
 {
-  "generated_text": "\n    return sum(L) / len(L)\n\n\ndef geometric_mean(L",
   "details": {
     "best_of_sequences": null,
     "finish_reason": "length",
     "generated_tokens": 20,
-    "seed": null,
     "prefill": [
       {
         "id": 589,
-        "text": "def",
-        "logprob": null
+        "logprob": null,
+        "text": "def"
       },
       {
         "id": 3226,
-        "text": " ge",
-        "logprob": -9.0234375
+        "logprob": -8.5859375,
+        "text": " ge"
       },
       {
         "id": 21017,
-        "text": "ometric",
-        "logprob": -9.0859375
+        "logprob": -7.5859375,
+        "text": "ometric"
       },
       {
         "id": 81,
-        "text": "_",
-        "logprob": -0.25878906
+        "logprob": -0.2668457,
+        "text": "_"
       },
       {
         "id": 6009,
-        "text": "mean",
-        "logprob": -2.2109375
+        "logprob": -1.6416016,
+        "text": "mean"
       },
       {
         "id": 26,
-        "text": "(",
-        "logprob": -0.30371094
+        "logprob": -0.22705078,
+        "text": "("
       },
       {
         "id": 62,
-        "text": "L",
-        "logprob": -5.6054688
+        "logprob": -5.2304688,
+        "text": "L"
       },
       {
         "id": 44,
-        "text": ":",
-        "logprob": -3.0722656
+        "logprob": -3.0976562,
+        "text": ":"
       },
       {
         "id": 1682,
-        "text": " List",
-        "logprob": -0.6879883
+        "logprob": -1.1044922,
+        "text": " List"
       },
       {
         "id": 77,
-        "text": "[",
-        "logprob": -0.38500977
+        "logprob": -0.14294434,
+        "text": "["
       },
       {
         "id": 1808,
-        "text": "float",
-        "logprob": -0.984375
+        "logprob": -0.32299805,
+        "text": "float"
       },
       {
         "id": 10794,
-        "text": "]):",
-        "logprob": -2.5351562
+        "logprob": -2.8164062,
+        "text": "]):"
       }
     ],
+    "seed": null,
     "tokens": [
       {
         "id": 284,
-        "text": "\n   ",
-        "logprob": -1.1738281,
-        "special": false
+        "logprob": -0.1282959,
+        "special": false,
+        "text": "\n   "
       },
       {
-        "id": 442,
-        "text": " return",
-        "logprob": -0.95947266,
-        "special": false
+        "id": 1524,
+        "logprob": -0.97998047,
+        "special": false,
+        "text": " \"\"\""
       },
       {
-        "id": 3632,
-        "text": " sum",
-        "logprob": -1.4199219,
-        "special": false
+        "id": 284,
+        "logprob": -0.7006836,
+        "special": false,
+        "text": "\n   "
       },
       {
-        "id": 26,
-        "text": "(",
-        "logprob": -0.085876465,
-        "special": false
+        "id": 14883,
+        "logprob": -2.1933594,
+        "special": false,
+        "text": " Calculate"
       },
       {
-        "id": 62,
-        "text": "L",
-        "logprob": -0.09875488,
-        "special": false
-      },
-      {
-        "id": 27,
-        "text": ")",
-        "logprob": -0.30517578,
-        "special": false
-      },
-      {
-        "id": 517,
-        "text": " /",
-        "logprob": -0.42089844,
-        "special": false
-      },
-      {
-        "id": 2069,
-        "text": " len",
-        "logprob": -0.042053223,
-        "special": false
-      },
-      {
-        "id": 26,
-        "text": "(",
-        "logprob": -0.0011806488,
-        "special": false
-      },
-      {
-        "id": 62,
-        "text": "L",
-        "logprob": -0.0005259514,
-        "special": false
-      },
-      {
-        "id": 27,
-        "text": ")",
-        "logprob": -0.0017633438,
-        "special": false
-      },
-      {
-        "id": 478,
-        "text": "\n\n",
-        "logprob": -0.69189453,
-        "special": false
-      },
-      {
-        "id": 203,
-        "text": "\n",
-        "logprob": -0.041870117,
-        "special": false
-      },
-      {
-        "id": 589,
-        "text": "def",
-        "logprob": -0.27856445,
-        "special": false
+        "id": 322,
+        "logprob": -0.2697754,
+        "special": false,
+        "text": " the"
       },
       {
         "id": 3226,
-        "text": " ge",
-        "logprob": -1.7255859,
-        "special": false
+        "logprob": -0.0836792,
+        "special": false,
+        "text": " ge"
       },
       {
         "id": 21017,
-        "text": "ometric",
-        "logprob": -0.011291504,
-        "special": false
+        "logprob": -0.018737793,
+        "special": false,
+        "text": "ometric"
       },
       {
-        "id": 81,
-        "text": "_",
-        "logprob": -0.008430481,
-        "special": false
+        "id": 5651,
+        "logprob": -0.028640747,
+        "special": false,
+        "text": " mean"
       },
       {
-        "id": 6009,
-        "text": "mean",
-        "logprob": -0.025787354,
-        "special": false
+        "id": 432,
+        "logprob": -0.29467773,
+        "special": false,
+        "text": " of"
       },
       {
-        "id": 26,
-        "text": "(",
-        "logprob": -0.073913574,
-        "special": false
+        "id": 312,
+        "logprob": -0.31518555,
+        "special": false,
+        "text": " a"
       },
       {
-        "id": 62,
-        "text": "L",
-        "logprob": -0.09967041,
-        "special": false
+        "id": 1149,
+        "logprob": -0.20605469,
+        "special": false,
+        "text": " list"
+      },
+      {
+        "id": 432,
+        "logprob": -0.23254395,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 7515,
+        "logprob": -0.4489746,
+        "special": false,
+        "text": " numbers"
+      },
+      {
+        "id": 32,
+        "logprob": -0.6044922,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 446,
+        "logprob": -0.63964844,
+        "special": false,
+        "text": "\n\n   "
+      },
+      {
+        "id": 499,
+        "logprob": -1.1953125,
+        "special": false,
+        "text": " :"
+      },
+      {
+        "id": 753,
+        "logprob": -0.03515625,
+        "special": false,
+        "text": "param"
+      },
+      {
+        "id": 498,
+        "logprob": -0.06311035,
+        "special": false,
+        "text": " L"
+      },
+      {
+        "id": 44,
+        "logprob": -0.003414154,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 1682,
+        "logprob": -1.3310547,
+        "special": false,
+        "text": " List"
       }
-    ]
-  }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\n    \"\"\"\n    Calculate the geometric mean of a list of numbers.\n\n    :param L: List"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json
index 5598a2ad..bf0f5146 100644
--- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json
+++ b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json
@@ -11,57 +11,57 @@
       },
       {
         "id": 3226,
-        "logprob": -9.0234375,
+        "logprob": -8.5859375,
         "text": " ge"
       },
       {
         "id": 21017,
-        "logprob": -9.09375,
+        "logprob": -7.5898438,
         "text": "ometric"
       },
       {
         "id": 81,
-        "logprob": -0.25976562,
+        "logprob": -0.26586914,
         "text": "_"
       },
       {
         "id": 6009,
-        "logprob": -2.2148438,
+        "logprob": -1.6347656,
         "text": "mean"
       },
       {
         "id": 26,
-        "logprob": -0.3010254,
+        "logprob": -0.22705078,
         "text": "("
       },
       {
         "id": 62,
-        "logprob": -5.6757812,
+        "logprob": -5.2382812,
         "text": "L"
       },
       {
         "id": 44,
-        "logprob": -3.0898438,
+        "logprob": -3.0996094,
         "text": ":"
       },
       {
         "id": 1682,
-        "logprob": -0.6791992,
+        "logprob": -1.1025391,
         "text": " List"
       },
       {
         "id": 77,
-        "logprob": -0.38891602,
+        "logprob": -0.14294434,
         "text": "["
       },
       {
         "id": 1808,
-        "logprob": -0.92041016,
+        "logprob": -0.32226562,
         "text": "float"
       },
       {
         "id": 10794,
-        "logprob": -2.5390625,
+        "logprob": -2.8164062,
         "text": "]):"
       }
     ],
@@ -75,13 +75,13 @@
       },
       {
         "id": 442,
-        "logprob": 0.0,
+        "logprob": -1.3134766,
         "special": false,
         "text": " return"
       },
       {
         "id": 11665,
-        "logprob": -1.6005859,
+        "logprob": -0.10021973,
         "special": false,
         "text": " reduce"
       },
@@ -129,7 +129,7 @@
       },
       {
         "id": 319,
-        "logprob": 0.0,
+        "logprob": -0.42871094,
         "special": false,
         "text": " *"
       },
@@ -158,36 +158,37 @@
         "text": ")"
       },
       {
-        "id": 203,
-        "logprob": -0.11968994,
-        "special": false,
-        "text": "\n"
-      },
-      {
-        "id": 203,
+        "id": 1115,
         "logprob": 0.0,
         "special": false,
-        "text": "\n"
+        "text": " **"
       },
       {
-        "id": 589,
+        "id": 308,
         "logprob": 0.0,
         "special": false,
-        "text": "def"
+        "text": " ("
       },
       {
-        "id": 3226,
+        "id": 35,
         "logprob": 0.0,
         "special": false,
-        "text": " ge"
+        "text": "1"
       },
       {
-        "id": 21017,
+        "id": 32,
+        "logprob": -0.31323242,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 34,
         "logprob": 0.0,
         "special": false,
-        "text": "ometric"
+        "text": "0"
       }
-    ]
+    ],
+    "top_tokens": null
   },
-  "generated_text": "\n    return reduce(lambda x, y: x * y, L)\n\ndef geometric"
+  "generated_text": "\n    return reduce(lambda x, y: x * y, L) ** (1.0"
 }
diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_load.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_load.json
index 5381ce5a..46a21ed8 100644
--- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_load.json
+++ b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_load.json
@@ -12,57 +12,57 @@
         },
         {
           "id": 3226,
-          "logprob": -9.0234375,
+          "logprob": -8.5859375,
           "text": " ge"
         },
         {
           "id": 21017,
-          "logprob": -9.0859375,
+          "logprob": -7.5820312,
           "text": "ometric"
         },
         {
           "id": 81,
-          "logprob": -0.25927734,
+          "logprob": -0.26708984,
           "text": "_"
         },
         {
           "id": 6009,
-          "logprob": -2.25,
+          "logprob": -1.6386719,
           "text": "mean"
         },
         {
           "id": 26,
-          "logprob": -0.30126953,
+          "logprob": -0.22717285,
           "text": "("
         },
         {
           "id": 62,
-          "logprob": -5.7539062,
+          "logprob": -5.234375,
           "text": "L"
         },
         {
           "id": 44,
-          "logprob": -3.0878906,
+          "logprob": -3.1015625,
           "text": ":"
         },
         {
           "id": 1682,
-          "logprob": -0.6845703,
+          "logprob": -1.1083984,
           "text": " List"
         },
         {
           "id": 77,
-          "logprob": -0.3918457,
+          "logprob": -0.14294434,
           "text": "["
         },
         {
           "id": 1808,
-          "logprob": -0.8798828,
+          "logprob": -0.32592773,
           "text": "float"
         },
         {
           "id": 10794,
-          "logprob": -2.4980469,
+          "logprob": -2.8164062,
           "text": "]):"
         }
       ],
@@ -70,67 +70,68 @@
       "tokens": [
         {
           "id": 284,
-          "logprob": -1.1533203,
+          "logprob": -0.12817383,
           "special": false,
           "text": "\n   "
         },
         {
-          "id": 442,
-          "logprob": -0.91796875,
+          "id": 1524,
+          "logprob": -0.9863281,
           "special": false,
-          "text": " return"
+          "text": " \"\"\""
         },
         {
-          "id": 3632,
-          "logprob": -1.3291016,
+          "id": 284,
+          "logprob": -0.7011719,
           "special": false,
-          "text": " sum"
+          "text": "\n   "
         },
         {
-          "id": 26,
-          "logprob": -0.08062744,
+          "id": 14883,
+          "logprob": -2.2050781,
           "special": false,
-          "text": "("
+          "text": " Calculate"
         },
         {
-          "id": 62,
-          "logprob": -0.097717285,
+          "id": 322,
+          "logprob": -0.2668457,
           "special": false,
-          "text": "L"
+          "text": " the"
         },
         {
-          "id": 27,
-          "logprob": -0.29003906,
+          "id": 3226,
+          "logprob": -0.08465576,
           "special": false,
-          "text": ")"
+          "text": " ge"
         },
         {
-          "id": 517,
-          "logprob": -0.34958984,
+          "id": 21017,
+          "logprob": -0.019012451,
           "special": false,
-          "text": " /"
+          "text": "ometric"
         },
         {
-          "id": 2069,
-          "logprob": -0.03829956,
+          "id": 5651,
+          "logprob": -0.028625488,
           "special": false,
-          "text": " len"
+          "text": " mean"
         },
         {
-          "id": 26,
-          "logprob": -0.0011987686,
+          "id": 432,
+          "logprob": -0.29418945,
           "special": false,
-          "text": "("
+          "text": " of"
         },
         {
-          "id": 62,
-          "logprob": -0.00050878525,
+          "id": 312,
+          "logprob": -0.3161621,
           "special": false,
-          "text": "L"
+          "text": " a"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "\n    return sum(L) / len(L"
+    "generated_text": "\n    \"\"\"\n    Calculate the geometric mean of a"
   },
   {
     "details": {
@@ -145,57 +146,57 @@
         },
         {
           "id": 3226,
-          "logprob": -9.0234375,
+          "logprob": -8.5859375,
           "text": " ge"
         },
         {
           "id": 21017,
-          "logprob": -9.0859375,
+          "logprob": -7.59375,
           "text": "ometric"
         },
         {
           "id": 81,
-          "logprob": -0.25878906,
+          "logprob": -0.26953125,
           "text": "_"
         },
         {
           "id": 6009,
-          "logprob": -2.2109375,
+          "logprob": -1.640625,
           "text": "mean"
         },
         {
           "id": 26,
-          "logprob": -0.30371094,
+          "logprob": -0.22705078,
           "text": "("
         },
         {
           "id": 62,
-          "logprob": -5.6054688,
+          "logprob": -5.234375,
           "text": "L"
         },
         {
           "id": 44,
-          "logprob": -3.0722656,
+          "logprob": -3.1132812,
           "text": ":"
         },
         {
           "id": 1682,
-          "logprob": -0.6879883,
+          "logprob": -1.1123047,
           "text": " List"
         },
         {
           "id": 77,
-          "logprob": -0.38500977,
+          "logprob": -0.14294434,
           "text": "["
         },
         {
           "id": 1808,
-          "logprob": -0.984375,
+          "logprob": -0.32299805,
           "text": "float"
         },
         {
           "id": 10794,
-          "logprob": -2.5351562,
+          "logprob": -2.8164062,
           "text": "]):"
         }
       ],
@@ -203,67 +204,68 @@
       "tokens": [
         {
           "id": 284,
-          "logprob": -1.1738281,
+          "logprob": -0.12854004,
           "special": false,
           "text": "\n   "
         },
         {
-          "id": 442,
-          "logprob": -0.9584961,
+          "id": 1524,
+          "logprob": -0.9897461,
           "special": false,
-          "text": " return"
+          "text": " \"\"\""
         },
         {
-          "id": 3632,
-          "logprob": -1.4169922,
+          "id": 284,
+          "logprob": -0.69970703,
           "special": false,
-          "text": " sum"
+          "text": "\n   "
         },
         {
-          "id": 26,
-          "logprob": -0.085876465,
+          "id": 14883,
+          "logprob": -2.2050781,
           "special": false,
-          "text": "("
+          "text": " Calculate"
         },
         {
-          "id": 62,
-          "logprob": -0.0982666,
+          "id": 322,
+          "logprob": -0.2668457,
           "special": false,
-          "text": "L"
+          "text": " the"
         },
         {
-          "id": 27,
-          "logprob": -0.3022461,
+          "id": 3226,
+          "logprob": -0.08496094,
           "special": false,
-          "text": ")"
+          "text": " ge"
         },
         {
-          "id": 517,
-          "logprob": -0.40504883,
+          "id": 21017,
+          "logprob": -0.019012451,
           "special": false,
-          "text": " /"
+          "text": "ometric"
         },
         {
-          "id": 2069,
-          "logprob": -0.041656494,
+          "id": 5651,
+          "logprob": -0.029037476,
           "special": false,
-          "text": " len"
+          "text": " mean"
         },
         {
-          "id": 26,
-          "logprob": -0.0011844635,
+          "id": 432,
+          "logprob": -0.2939453,
           "special": false,
-          "text": "("
+          "text": " of"
         },
         {
-          "id": 62,
-          "logprob": -0.0005264282,
+          "id": 312,
+          "logprob": -0.31591797,
           "special": false,
-          "text": "L"
+          "text": " a"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "\n    return sum(L) / len(L"
+    "generated_text": "\n    \"\"\"\n    Calculate the geometric mean of a"
   },
   {
     "details": {
@@ -278,57 +280,57 @@
         },
         {
           "id": 3226,
-          "logprob": -9.0234375,
+          "logprob": -8.5859375,
           "text": " ge"
         },
         {
           "id": 21017,
-          "logprob": -9.0859375,
+          "logprob": -7.5859375,
           "text": "ometric"
         },
         {
           "id": 81,
-          "logprob": -0.25927734,
+          "logprob": -0.26586914,
           "text": "_"
         },
         {
           "id": 6009,
-          "logprob": -2.25,
+          "logprob": -1.6347656,
           "text": "mean"
         },
         {
           "id": 26,
-          "logprob": -0.30126953,
+          "logprob": -0.22766113,
           "text": "("
         },
         {
           "id": 62,
-          "logprob": -5.7539062,
+          "logprob": -5.2265625,
           "text": "L"
         },
         {
           "id": 44,
-          "logprob": -3.0878906,
+          "logprob": -3.0976562,
           "text": ":"
         },
         {
           "id": 1682,
-          "logprob": -0.6845703,
+          "logprob": -1.1025391,
           "text": " List"
         },
         {
           "id": 77,
-          "logprob": -0.3918457,
+          "logprob": -0.1427002,
           "text": "["
         },
         {
           "id": 1808,
-          "logprob": -0.8798828,
+          "logprob": -0.32592773,
           "text": "float"
         },
         {
           "id": 10794,
-          "logprob": -2.4980469,
+          "logprob": -2.8164062,
           "text": "]):"
         }
       ],
@@ -336,67 +338,68 @@
       "tokens": [
         {
           "id": 284,
-          "logprob": -1.1533203,
+          "logprob": -0.13012695,
           "special": false,
           "text": "\n   "
         },
         {
-          "id": 442,
-          "logprob": -0.9165039,
+          "id": 1524,
+          "logprob": -0.98046875,
           "special": false,
-          "text": " return"
+          "text": " \"\"\""
         },
         {
-          "id": 3632,
-          "logprob": -1.328125,
+          "id": 284,
+          "logprob": -0.69921875,
           "special": false,
-          "text": " sum"
+          "text": "\n   "
         },
         {
-          "id": 26,
-          "logprob": -0.07946777,
+          "id": 14883,
+          "logprob": -2.1992188,
           "special": false,
-          "text": "("
+          "text": " Calculate"
         },
         {
-          "id": 62,
-          "logprob": -0.09820557,
+          "id": 322,
+          "logprob": -0.2668457,
           "special": false,
-          "text": "L"
+          "text": " the"
         },
         {
-          "id": 27,
-          "logprob": -0.28930664,
+          "id": 3226,
+          "logprob": -0.083496094,
           "special": false,
-          "text": ")"
+          "text": " ge"
         },
         {
-          "id": 517,
-          "logprob": -0.34592773,
+          "id": 21017,
+          "logprob": -0.01902771,
           "special": false,
-          "text": " /"
+          "text": "ometric"
         },
         {
-          "id": 2069,
-          "logprob": -0.038330078,
+          "id": 5651,
+          "logprob": -0.029006958,
           "special": false,
-          "text": " len"
+          "text": " mean"
         },
         {
-          "id": 26,
-          "logprob": -0.0011940002,
+          "id": 432,
+          "logprob": -0.29248047,
           "special": false,
-          "text": "("
+          "text": " of"
         },
         {
-          "id": 62,
-          "logprob": -0.00050878525,
+          "id": 312,
+          "logprob": -0.3161621,
           "special": false,
-          "text": "L"
+          "text": " a"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "\n    return sum(L) / len(L"
+    "generated_text": "\n    \"\"\"\n    Calculate the geometric mean of a"
   },
   {
     "details": {
@@ -411,57 +414,57 @@
         },
         {
           "id": 3226,
-          "logprob": -9.0234375,
+          "logprob": -8.5859375,
           "text": " ge"
         },
         {
           "id": 21017,
-          "logprob": -9.0859375,
+          "logprob": -7.5859375,
           "text": "ometric"
         },
         {
           "id": 81,
-          "logprob": -0.25927734,
+          "logprob": -0.26904297,
           "text": "_"
         },
         {
           "id": 6009,
-          "logprob": -2.25,
+          "logprob": -1.6386719,
           "text": "mean"
         },
         {
           "id": 26,
-          "logprob": -0.30126953,
+          "logprob": -0.22705078,
           "text": "("
         },
         {
           "id": 62,
-          "logprob": -5.7539062,
+          "logprob": -5.234375,
           "text": "L"
         },
         {
           "id": 44,
-          "logprob": -3.0878906,
+          "logprob": -3.1132812,
           "text": ":"
         },
         {
           "id": 1682,
-          "logprob": -0.6845703,
+          "logprob": -1.1074219,
           "text": " List"
         },
         {
           "id": 77,
-          "logprob": -0.3918457,
+          "logprob": -0.14477539,
           "text": "["
         },
         {
           "id": 1808,
-          "logprob": -0.8798828,
+          "logprob": -0.3256836,
           "text": "float"
         },
         {
           "id": 10794,
-          "logprob": -2.4980469,
+          "logprob": -2.8027344,
           "text": "]):"
         }
       ],
@@ -469,66 +472,67 @@
       "tokens": [
         {
           "id": 284,
-          "logprob": -1.1533203,
+          "logprob": -0.12915039,
           "special": false,
           "text": "\n   "
         },
         {
-          "id": 442,
-          "logprob": -0.91259766,
+          "id": 1524,
+          "logprob": -0.98535156,
           "special": false,
-          "text": " return"
+          "text": " \"\"\""
         },
         {
-          "id": 3632,
-          "logprob": -1.3251953,
+          "id": 284,
+          "logprob": -0.69921875,
           "special": false,
-          "text": " sum"
+          "text": "\n   "
         },
         {
-          "id": 26,
-          "logprob": -0.08062744,
+          "id": 14883,
+          "logprob": -2.2011719,
           "special": false,
-          "text": "("
+          "text": " Calculate"
         },
         {
-          "id": 62,
-          "logprob": -0.09906006,
+          "id": 322,
+          "logprob": -0.26708984,
           "special": false,
-          "text": "L"
+          "text": " the"
         },
         {
-          "id": 27,
-          "logprob": -0.28979492,
+          "id": 3226,
+          "logprob": -0.08502197,
           "special": false,
-          "text": ")"
+          "text": " ge"
         },
         {
-          "id": 517,
-          "logprob": -0.35958984,
+          "id": 21017,
+          "logprob": -0.019012451,
           "special": false,
-          "text": " /"
+          "text": "ometric"
         },
         {
-          "id": 2069,
-          "logprob": -0.038604736,
+          "id": 5651,
+          "logprob": -0.028625488,
           "special": false,
-          "text": " len"
+          "text": " mean"
         },
         {
-          "id": 26,
-          "logprob": -0.0011901855,
+          "id": 432,
+          "logprob": -0.29589844,
           "special": false,
-          "text": "("
+          "text": " of"
         },
         {
-          "id": 62,
-          "logprob": -0.0005078316,
+          "id": 312,
+          "logprob": -0.31591797,
           "special": false,
-          "text": "L"
+          "text": " a"
         }
-      ]
+      ],
+      "top_tokens": null
     },
-    "generated_text": "\n    return sum(L) / len(L"
+    "generated_text": "\n    \"\"\"\n    Calculate the geometric mean of a"
   }
 ]
diff --git a/integration-tests/models/__snapshots__/test_grammar_llama/test_non_flash_llama_grammar_json.json b/integration-tests/models/__snapshots__/test_grammar_llama/test_non_flash_llama_grammar_json.json
new file mode 100644
index 00000000..d7fb620d
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_grammar_llama/test_non_flash_llama_grammar_json.json
@@ -0,0 +1,274 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "eos_token",
+    "generated_tokens": 30,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 5235,
+        "logprob": -10.0625,
+        "text": "info"
+      },
+      {
+        "id": 29901,
+        "logprob": -3.2324219,
+        "text": ":"
+      },
+      {
+        "id": 13260,
+        "logprob": -10.625,
+        "text": "dav"
+      },
+      {
+        "id": 333,
+        "logprob": -0.08276367,
+        "text": "id"
+      },
+      {
+        "id": 8753,
+        "logprob": -7.5273438,
+        "text": "hol"
+      },
+      {
+        "id": 17559,
+        "logprob": -3.8476562,
+        "text": "tz"
+      },
+      {
+        "id": 763,
+        "logprob": -10.140625,
+        "text": "like"
+      },
+      {
+        "id": 10697,
+        "logprob": -10.1953125,
+        "text": "trees"
+      },
+      {
+        "id": 322,
+        "logprob": -2.5742188,
+        "text": "and"
+      },
+      {
+        "id": 756,
+        "logprob": -7.4882812,
+        "text": "has"
+      },
+      {
+        "id": 1023,
+        "logprob": -5.0507812,
+        "text": "two"
+      },
+      {
+        "id": 274,
+        "logprob": -5.3164062,
+        "text": "c"
+      },
+      {
+        "id": 1446,
+        "logprob": -0.6694336,
+        "text": "ats"
+      },
+      {
+        "id": 29889,
+        "logprob": -0.9995117,
+        "text": "."
+      },
+      {
+        "id": 29871,
+        "logprob": -4.2421875,
+        "text": ""
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 6377,
+        "logprob": -0.14916992,
+        "special": false,
+        "text": "{\""
+      },
+      {
+        "id": 29888,
+        "logprob": -0.13598633,
+        "special": false,
+        "text": "f"
+      },
+      {
+        "id": 12935,
+        "logprob": -0.017669678,
+        "special": false,
+        "text": "irs"
+      },
+      {
+        "id": 29873,
+        "logprob": -0.00085639954,
+        "special": false,
+        "text": "t"
+      },
+      {
+        "id": 1170,
+        "logprob": -0.0054016113,
+        "special": false,
+        "text": "Name"
+      },
+      {
+        "id": 4710,
+        "logprob": -0.13549805,
+        "special": false,
+        "text": "\":\""
+      },
+      {
+        "id": 19504,
+        "logprob": -0.8852539,
+        "special": false,
+        "text": "David"
+      },
+      {
+        "id": 3284,
+        "logprob": -0.16394043,
+        "special": false,
+        "text": "\",\""
+      },
+      {
+        "id": 29882,
+        "logprob": -0.08862305,
+        "special": false,
+        "text": "h"
+      },
+      {
+        "id": 711,
+        "logprob": -0.66259766,
+        "special": false,
+        "text": "ob"
+      },
+      {
+        "id": 1609,
+        "logprob": -5.51939e-05,
+        "special": false,
+        "text": "by"
+      },
+      {
+        "id": 4710,
+        "logprob": -0.23120117,
+        "special": false,
+        "text": "\":\""
+      },
+      {
+        "id": 29911,
+        "logprob": -2.3730469,
+        "special": false,
+        "text": "T"
+      },
+      {
+        "id": 11003,
+        "logprob": -0.032104492,
+        "special": false,
+        "text": "rees"
+      },
+      {
+        "id": 3284,
+        "logprob": -0.22021484,
+        "special": false,
+        "text": "\",\""
+      },
+      {
+        "id": 4230,
+        "logprob": -0.06726074,
+        "special": false,
+        "text": "last"
+      },
+      {
+        "id": 1170,
+        "logprob": -0.003501892,
+        "special": false,
+        "text": "Name"
+      },
+      {
+        "id": 4710,
+        "logprob": -0.0045661926,
+        "special": false,
+        "text": "\":\""
+      },
+      {
+        "id": 29950,
+        "logprob": -0.12512207,
+        "special": false,
+        "text": "H"
+      },
+      {
+        "id": 14339,
+        "logprob": -0.009552002,
+        "special": false,
+        "text": "olt"
+      },
+      {
+        "id": 29920,
+        "logprob": -0.00042438507,
+        "special": false,
+        "text": "z"
+      },
+      {
+        "id": 3284,
+        "logprob": -0.11651611,
+        "special": false,
+        "text": "\",\""
+      },
+      {
+        "id": 29876,
+        "logprob": -0.29736328,
+        "special": false,
+        "text": "n"
+      },
+      {
+        "id": 398,
+        "logprob": -0.003030777,
+        "special": false,
+        "text": "um"
+      },
+      {
+        "id": 29907,
+        "logprob": -0.3774414,
+        "special": false,
+        "text": "C"
+      },
+      {
+        "id": 1446,
+        "logprob": -0.0003130436,
+        "special": false,
+        "text": "ats"
+      },
+      {
+        "id": 1115,
+        "logprob": -0.0021514893,
+        "special": false,
+        "text": "\":"
+      },
+      {
+        "id": 29906,
+        "logprob": -0.071899414,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 29913,
+        "logprob": -0.018997192,
+        "special": false,
+        "text": "}"
+      },
+      {
+        "id": 2,
+        "logprob": 0.0,
+        "special": true,
+        "text": "</s>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "{\"firstName\":\"David\",\"hobby\":\"Trees\",\"lastName\":\"Holtz\",\"numCats\":2}"
+}
diff --git a/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json b/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json
new file mode 100644
index 00000000..83390832
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json
@@ -0,0 +1,23 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": "{\n  \"temperature\": [\n    35,\n    34,\n    36\n  ],\n  \"unit\": \"°c\"\n}",
+        "role": "assistant"
+      }
+    }
+  ],
+  "created": 1718044128,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.5-dev0-native",
+  "usage": {
+    "completion_tokens": 39,
+    "prompt_tokens": 136,
+    "total_tokens": 175
+  }
+}
diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
index 0edd81b6..90fb6dcc 100644
--- a/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json
@@ -11,92 +11,92 @@
       },
       {
         "id": 4911,
-        "logprob": -5.7773438,
+        "logprob": -6.9765625,
         "text": "User"
       },
       {
         "id": 29901,
-        "logprob": -0.0069999695,
+        "logprob": -0.0059432983,
         "text": ":"
       },
       {
         "id": 32000,
-        "logprob": -0.8125,
+        "logprob": -0.8408203,
         "text": "<fake_token_around_image>"
       },
       {
         "id": 32001,
-        "logprob": -6.651878e-05,
+        "logprob": -9.906292e-05,
         "text": "<image>"
       },
       {
         "id": 32000,
-        "logprob": -3.5762787e-07,
+        "logprob": -2.3841858e-07,
         "text": "<fake_token_around_image>"
       },
       {
         "id": 1815,
-        "logprob": -4.2265625,
+        "logprob": -4.1679688,
         "text": "Can"
       },
       {
         "id": 366,
-        "logprob": -0.013977051,
+        "logprob": -0.014099121,
         "text": "you"
       },
       {
         "id": 2649,
-        "logprob": -4.4375,
+        "logprob": -4.4609375,
         "text": "tell"
       },
       {
         "id": 592,
-        "logprob": -0.29077148,
+        "logprob": -0.29882812,
         "text": "me"
       },
       {
         "id": 263,
-        "logprob": -4.2109375,
+        "logprob": -4.1445312,
         "text": "a"
       },
       {
         "id": 1407,
-        "logprob": -9.4296875,
+        "logprob": -9.3828125,
         "text": "very"
       },
       {
         "id": 3273,
-        "logprob": -1.8671875,
+        "logprob": -1.9736328,
         "text": "short"
       },
       {
         "id": 5828,
-        "logprob": -0.26586914,
+        "logprob": -0.2800293,
         "text": "story"
       },
       {
         "id": 2729,
-        "logprob": -3.7460938,
+        "logprob": -3.5625,
         "text": "based"
       },
       {
         "id": 373,
-        "logprob": -0.0005350113,
+        "logprob": -0.0006427765,
         "text": "on"
       },
       {
         "id": 278,
-        "logprob": -0.13867188,
+        "logprob": -0.13952637,
         "text": "the"
       },
       {
         "id": 1967,
-        "logprob": -0.06842041,
+        "logprob": -0.068115234,
         "text": "image"
       },
       {
         "id": 29973,
-        "logprob": -0.15319824,
+        "logprob": -0.16357422,
         "text": "?"
       }
     ],
@@ -104,13 +104,13 @@
     "tokens": [
       {
         "id": 32002,
-        "logprob": -0.0019445419,
+        "logprob": -0.0026474,
         "special": true,
         "text": "<end_of_utterance>"
       },
       {
         "id": 29871,
-        "logprob": -8.404255e-05,
+        "logprob": -8.547306e-05,
         "special": false,
         "text": " "
       },
@@ -140,30 +140,29 @@
       },
       {
         "id": 319,
-        "logprob": -0.9057617,
+        "logprob": -0.92529297,
         "special": false,
         "text": " A"
       },
       {
         "id": 696,
-        "logprob": -1.2314453,
+        "logprob": -1.1269531,
         "special": false,
         "text": " ro"
       },
       {
         "id": 15664,
-        "logprob": -0.00024914742,
+        "logprob": -0.00029492378,
         "special": false,
         "text": "oster"
       },
       {
         "id": 15028,
-        "logprob": -1.1621094,
+        "logprob": -1.1855469,
         "special": false,
         "text": " stands"
       }
-    ],
-    "top_tokens": null
+    ]
   },
-  "generated_text": "\nAssistant: A rooster stands"
+  "generated_text": " \nAssistant: A rooster stands"
 }
diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
index 81cc1b19..21d6161b 100644
--- a/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json
@@ -12,92 +12,92 @@
         },
         {
           "id": 4911,
-          "logprob": -5.7773438,
+          "logprob": -6.9804688,
           "text": "User"
         },
         {
           "id": 29901,
-          "logprob": -0.0069999695,
+          "logprob": -0.006122589,
           "text": ":"
         },
         {
           "id": 32000,
-          "logprob": -0.8125,
+          "logprob": -0.8417969,
           "text": "<fake_token_around_image>"
         },
         {
           "id": 32001,
-          "logprob": -6.651878e-05,
+          "logprob": -9.918213e-05,
           "text": "<image>"
         },
         {
           "id": 32000,
-          "logprob": -3.5762787e-07,
+          "logprob": -2.3841858e-07,
           "text": "<fake_token_around_image>"
         },
         {
           "id": 1815,
-          "logprob": -4.2265625,
+          "logprob": -4.1679688,
           "text": "Can"
         },
         {
           "id": 366,
-          "logprob": -0.013977051,
+          "logprob": -0.014091492,
           "text": "you"
         },
         {
           "id": 2649,
-          "logprob": -4.4375,
+          "logprob": -4.4726562,
           "text": "tell"
         },
         {
           "id": 592,
-          "logprob": -0.29077148,
+          "logprob": -0.2998047,
           "text": "me"
         },
         {
           "id": 263,
-          "logprob": -4.2109375,
+          "logprob": -4.15625,
           "text": "a"
         },
         {
           "id": 1407,
-          "logprob": -9.4296875,
+          "logprob": -9.3828125,
           "text": "very"
         },
         {
           "id": 3273,
-          "logprob": -1.8671875,
+          "logprob": -1.9716797,
           "text": "short"
         },
         {
           "id": 5828,
-          "logprob": -0.26586914,
+          "logprob": -0.27734375,
           "text": "story"
         },
         {
           "id": 2729,
-          "logprob": -3.7460938,
+          "logprob": -3.5605469,
           "text": "based"
         },
         {
           "id": 373,
-          "logprob": -0.0005350113,
+          "logprob": -0.00064468384,
           "text": "on"
         },
         {
           "id": 278,
-          "logprob": -0.13867188,
+          "logprob": -0.14160156,
           "text": "the"
         },
         {
           "id": 1967,
-          "logprob": -0.06842041,
+          "logprob": -0.06915283,
           "text": "image"
         },
         {
           "id": 29973,
-          "logprob": -0.15319824,
+          "logprob": -0.16381836,
           "text": "?"
         }
       ],
@@ -105,25 +105,25 @@
       "tokens": [
         {
           "id": 32002,
-          "logprob": -0.0019445419,
+          "logprob": -0.0026664734,
           "special": true,
           "text": "<end_of_utterance>"
         },
         {
           "id": 29871,
-          "logprob": -8.416176e-05,
+          "logprob": -8.583069e-05,
           "special": false,
           "text": " "
         },
         {
           "id": 13,
-          "logprob": -1.7881393e-05,
+          "logprob": -1.8119812e-05,
           "special": false,
           "text": "\n"
         },
         {
           "id": 7900,
-          "logprob": -3.0994415e-06,
+          "logprob": -2.9802322e-06,
           "special": false,
           "text": "Ass"
         },
@@ -141,32 +141,31 @@
         },
         {
           "id": 319,
-          "logprob": -0.89941406,
+          "logprob": -0.9301758,
           "special": false,
           "text": " A"
         },
         {
           "id": 696,
-          "logprob": -1.234375,
+          "logprob": -1.1279297,
           "special": false,
           "text": " ro"
         },
         {
           "id": 15664,
-          "logprob": -0.0002465248,
+          "logprob": -0.0002939701,
           "special": false,
           "text": "oster"
         },
         {
           "id": 15028,
-          "logprob": -1.1660156,
+          "logprob": -1.1865234,
           "special": false,
           "text": " stands"
         }
-      ],
-      "top_tokens": null
+      ]
     },
-    "generated_text": "\nAssistant: A rooster stands"
+    "generated_text": " \nAssistant: A rooster stands"
   },
   {
     "details": {
@@ -181,92 +180,92 @@
         },
         {
           "id": 4911,
-          "logprob": -5.7890625,
+          "logprob": -6.9804688,
           "text": "User"
         },
         {
           "id": 29901,
-          "logprob": -0.0070152283,
+          "logprob": -0.006122589,
           "text": ":"
         },
         {
           "id": 32000,
-          "logprob": -0.8125,
+          "logprob": -0.8417969,
           "text": "<fake_token_around_image>"
         },
         {
           "id": 32001,
-          "logprob": -6.651878e-05,
+          "logprob": -9.942055e-05,
           "text": "<image>"
         },
         {
           "id": 32000,
-          "logprob": -3.5762787e-07,
+          "logprob": -2.3841858e-07,
           "text": "<fake_token_around_image>"
         },
         {
           "id": 1815,
-          "logprob": -4.2265625,
+          "logprob": -4.1679688,
           "text": "Can"
         },
         {
           "id": 366,
-          "logprob": -0.014190674,
+          "logprob": -0.014091492,
           "text": "you"
         },
         {
           "id": 2649,
-          "logprob": -4.4140625,
+          "logprob": -4.4726562,
           "text": "tell"
         },
         {
           "id": 592,
-          "logprob": -0.2919922,
+          "logprob": -0.2998047,
           "text": "me"
         },
         {
           "id": 263,
-          "logprob": -4.2109375,
+          "logprob": -4.15625,
           "text": "a"
         },
         {
           "id": 1407,
-          "logprob": -9.4375,
+          "logprob": -9.3828125,
           "text": "very"
         },
         {
           "id": 3273,
-          "logprob": -1.8720703,
+          "logprob": -1.9716797,
           "text": "short"
         },
         {
           "id": 5828,
-          "logprob": -0.26904297,
+          "logprob": -0.27734375,
           "text": "story"
         },
         {
           "id": 2729,
-          "logprob": -3.7675781,
+          "logprob": -3.5605469,
           "text": "based"
         },
         {
           "id": 373,
-          "logprob": -0.0005402565,
+          "logprob": -0.0006451607,
           "text": "on"
         },
         {
           "id": 278,
-          "logprob": -0.13867188,
+          "logprob": -0.14160156,
           "text": "the"
         },
         {
           "id": 1967,
-          "logprob": -0.068359375,
+          "logprob": -0.06915283,
           "text": "image"
         },
         {
           "id": 29973,
-          "logprob": -0.15539551,
+          "logprob": -0.16381836,
           "text": "?"
         }
       ],
@@ -274,19 +273,19 @@
       "tokens": [
         {
           "id": 32002,
-          "logprob": -0.0019168854,
+          "logprob": -0.0026664734,
           "special": true,
           "text": "<end_of_utterance>"
         },
         {
           "id": 29871,
-          "logprob": -8.392334e-05,
+          "logprob": -8.571148e-05,
           "special": false,
           "text": " "
         },
         {
           "id": 13,
-          "logprob": -1.7642975e-05,
+          "logprob": -1.8119812e-05,
           "special": false,
           "text": "\n"
         },
@@ -310,32 +309,31 @@
         },
         {
           "id": 319,
-          "logprob": -0.90722656,
+          "logprob": -0.9301758,
           "special": false,
           "text": " A"
         },
         {
           "id": 696,
-          "logprob": -1.2373047,
+          "logprob": -1.1279297,
           "special": false,
           "text": " ro"
         },
         {
           "id": 15664,
-          "logprob": -0.00024938583,
+          "logprob": -0.0002939701,
           "special": false,
           "text": "oster"
         },
         {
           "id": 15028,
-          "logprob": -1.1708984,
+          "logprob": -1.1865234,
           "special": false,
           "text": " stands"
         }
-      ],
-      "top_tokens": null
+      ]
     },
-    "generated_text": "\nAssistant: A rooster stands"
+    "generated_text": " \nAssistant: A rooster stands"
   },
   {
     "details": {
@@ -350,92 +348,92 @@
         },
         {
           "id": 4911,
-          "logprob": -5.7890625,
+          "logprob": -6.9804688,
           "text": "User"
         },
         {
           "id": 29901,
-          "logprob": -0.0070152283,
+          "logprob": -0.006122589,
           "text": ":"
         },
         {
           "id": 32000,
-          "logprob": -0.8125,
+          "logprob": -0.8417969,
           "text": "<fake_token_around_image>"
         },
         {
           "id": 32001,
-          "logprob": -6.663799e-05,
+          "logprob": -9.918213e-05,
           "text": "<image>"
         },
         {
           "id": 32000,
-          "logprob": -3.5762787e-07,
+          "logprob": -2.3841858e-07,
           "text": "<fake_token_around_image>"
         },
         {
           "id": 1815,
-          "logprob": -4.2265625,
+          "logprob": -4.1679688,
           "text": "Can"
         },
         {
           "id": 366,
-          "logprob": -0.014190674,
+          "logprob": -0.014091492,
           "text": "you"
         },
         {
           "id": 2649,
-          "logprob": -4.4140625,
+          "logprob": -4.4726562,
           "text": "tell"
         },
         {
           "id": 592,
-          "logprob": -0.2919922,
+          "logprob": -0.2998047,
           "text": "me"
         },
         {
           "id": 263,
-          "logprob": -4.2109375,
+          "logprob": -4.15625,
           "text": "a"
         },
         {
           "id": 1407,
-          "logprob": -9.4375,
+          "logprob": -9.3828125,
           "text": "very"
         },
         {
           "id": 3273,
-          "logprob": -1.8720703,
+          "logprob": -1.9716797,
           "text": "short"
         },
         {
           "id": 5828,
-          "logprob": -0.26904297,
+          "logprob": -0.27734375,
           "text": "story"
         },
         {
           "id": 2729,
-          "logprob": -3.7675781,
+          "logprob": -3.5605469,
           "text": "based"
         },
         {
           "id": 373,
-          "logprob": -0.0005402565,
+          "logprob": -0.00064468384,
           "text": "on"
         },
         {
           "id": 278,
-          "logprob": -0.13867188,
+          "logprob": -0.14160156,
           "text": "the"
         },
         {
           "id": 1967,
-          "logprob": -0.068359375,
+          "logprob": -0.06915283,
           "text": "image"
         },
         {
           "id": 29973,
-          "logprob": -0.15539551,
+          "logprob": -0.16381836,
           "text": "?"
         }
       ],
@@ -443,19 +441,19 @@
       "tokens": [
         {
           "id": 32002,
-          "logprob": -0.0019168854,
+          "logprob": -0.0026664734,
           "special": true,
           "text": "<end_of_utterance>"
         },
         {
           "id": 29871,
-          "logprob": -8.404255e-05,
+          "logprob": -8.59499e-05,
           "special": false,
           "text": " "
         },
         {
           "id": 13,
-          "logprob": -1.7642975e-05,
+          "logprob": -1.8119812e-05,
           "special": false,
           "text": "\n"
         },
@@ -479,32 +477,31 @@
         },
         {
           "id": 319,
-          "logprob": -0.90722656,
+          "logprob": -0.9301758,
           "special": false,
           "text": " A"
         },
         {
           "id": 696,
-          "logprob": -1.2373047,
+          "logprob": -1.1279297,
           "special": false,
           "text": " ro"
         },
         {
           "id": 15664,
-          "logprob": -0.00024938583,
+          "logprob": -0.0002939701,
           "special": false,
           "text": "oster"
         },
         {
           "id": 15028,
-          "logprob": -1.1708984,
+          "logprob": -1.1865234,
           "special": false,
           "text": " stands"
         }
-      ],
-      "top_tokens": null
+      ]
     },
-    "generated_text": "\nAssistant: A rooster stands"
+    "generated_text": " \nAssistant: A rooster stands"
   },
   {
     "details": {
@@ -519,92 +516,92 @@
         },
         {
           "id": 4911,
-          "logprob": -5.7890625,
+          "logprob": -6.9804688,
           "text": "User"
         },
         {
           "id": 29901,
-          "logprob": -0.0070152283,
+          "logprob": -0.006122589,
           "text": ":"
         },
         {
           "id": 32000,
-          "logprob": -0.8125,
+          "logprob": -0.8417969,
           "text": "<fake_token_around_image>"
         },
         {
           "id": 32001,
-          "logprob": -6.663799e-05,
+          "logprob": -9.942055e-05,
           "text": "<image>"
         },
         {
           "id": 32000,
-          "logprob": -3.5762787e-07,
+          "logprob": -2.3841858e-07,
           "text": "<fake_token_around_image>"
         },
         {
           "id": 1815,
-          "logprob": -4.2265625,
+          "logprob": -4.1679688,
           "text": "Can"
         },
         {
           "id": 366,
-          "logprob": -0.014190674,
+          "logprob": -0.014091492,
           "text": "you"
         },
         {
           "id": 2649,
-          "logprob": -4.4140625,
+          "logprob": -4.4726562,
           "text": "tell"
         },
         {
           "id": 592,
-          "logprob": -0.2919922,
+          "logprob": -0.2998047,
           "text": "me"
         },
         {
           "id": 263,
-          "logprob": -4.2109375,
+          "logprob": -4.15625,
           "text": "a"
         },
         {
           "id": 1407,
-          "logprob": -9.4375,
+          "logprob": -9.3828125,
           "text": "very"
         },
         {
           "id": 3273,
-          "logprob": -1.8720703,
+          "logprob": -1.9716797,
           "text": "short"
         },
         {
           "id": 5828,
-          "logprob": -0.26904297,
+          "logprob": -0.27734375,
           "text": "story"
         },
         {
           "id": 2729,
-          "logprob": -3.7675781,
+          "logprob": -3.5605469,
           "text": "based"
         },
         {
           "id": 373,
-          "logprob": -0.0005402565,
+          "logprob": -0.0006451607,
           "text": "on"
         },
         {
           "id": 278,
-          "logprob": -0.13867188,
+          "logprob": -0.14160156,
           "text": "the"
         },
         {
           "id": 1967,
-          "logprob": -0.068359375,
+          "logprob": -0.06915283,
           "text": "image"
         },
         {
           "id": 29973,
-          "logprob": -0.15539551,
+          "logprob": -0.16381836,
           "text": "?"
         }
       ],
@@ -612,19 +609,19 @@
       "tokens": [
         {
           "id": 32002,
-          "logprob": -0.0019159317,
+          "logprob": -0.0026664734,
           "special": true,
           "text": "<end_of_utterance>"
         },
         {
           "id": 29871,
-          "logprob": -8.404255e-05,
+          "logprob": -8.571148e-05,
           "special": false,
           "text": " "
         },
         {
           "id": 13,
-          "logprob": -1.7642975e-05,
+          "logprob": -1.8119812e-05,
           "special": false,
           "text": "\n"
         },
@@ -648,31 +645,30 @@
         },
         {
           "id": 319,
-          "logprob": -0.90722656,
+          "logprob": -0.9301758,
           "special": false,
           "text": " A"
         },
         {
           "id": 696,
-          "logprob": -1.2373047,
+          "logprob": -1.1279297,
           "special": false,
           "text": " ro"
         },
         {
           "id": 15664,
-          "logprob": -0.00024938583,
+          "logprob": -0.0002939701,
           "special": false,
           "text": "oster"
         },
         {
           "id": 15028,
-          "logprob": -1.1708984,
+          "logprob": -1.1865234,
           "special": false,
           "text": " stands"
         }
-      ],
-      "top_tokens": null
+      ]
     },
-    "generated_text": "\nAssistant: A rooster stands"
+    "generated_text": " \nAssistant: A rooster stands"
   }
 ]
diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json
new file mode 100644
index 00000000..a4727707
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json
@@ -0,0 +1,85 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "eos_token",
+    "generated_tokens": 12,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 450,
+        "logprob": -0.26342773,
+        "special": false,
+        "text": " The"
+      },
+      {
+        "id": 21282,
+        "logprob": -0.01838684,
+        "special": false,
+        "text": " cow"
+      },
+      {
+        "id": 322,
+        "logprob": -0.18041992,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 521,
+        "logprob": -0.62841797,
+        "special": false,
+        "text": " ch"
+      },
+      {
+        "id": 21475,
+        "logprob": -0.0037956238,
+        "special": false,
+        "text": "icken"
+      },
+      {
+        "id": 526,
+        "logprob": -0.018737793,
+        "special": false,
+        "text": " are"
+      },
+      {
+        "id": 373,
+        "logprob": -1.0820312,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 263,
+        "logprob": -0.5083008,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 25695,
+        "logprob": -0.07128906,
+        "special": false,
+        "text": " beach"
+      },
+      {
+        "id": 29889,
+        "logprob": -0.12573242,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 32002,
+        "logprob": -0.0029792786,
+        "special": true,
+        "text": "<end_of_utterance>"
+      },
+      {
+        "id": 2,
+        "logprob": -0.00024962425,
+        "special": true,
+        "text": "</s>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " The cow and chicken are on a beach."
+}
diff --git a/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_all_params.json b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_all_params.json
new file mode 100644
index 00000000..45601505
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_all_params.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 3735,
+        "logprob": -8.5625,
+        "text": "Test"
+      },
+      {
+        "id": 2159,
+        "logprob": -10.78125,
+        "text": "request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 288,
+        "logprob": -0.2854004,
+        "special": false,
+        "text": "ing"
+      },
+      {
+        "id": 264,
+        "logprob": -0.37573242,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 633,
+        "logprob": -0.09301758,
+        "special": false,
+        "text": " new"
+      },
+      {
+        "id": 4480,
+        "logprob": -0.3322754,
+        "special": false,
+        "text": " feature"
+      },
+      {
+        "id": 297,
+        "logprob": -0.8510742,
+        "special": false,
+        "text": " in"
+      },
+      {
+        "id": 272,
+        "logprob": -0.13464355,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 2039,
+        "logprob": 0.0,
+        "special": false,
+        "text": " game"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.89990234,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 13,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test requesting a new feature in the game.\n\n"
+}
diff --git a/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_load.json b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_load.json
new file mode 100644
index 00000000..7f1875e0
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_load.json
@@ -0,0 +1,7098 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1247,
+          "logprob": -5.2421875,
+          "text": "User"
+        },
+        {
+          "id": 28747,
+          "logprob": -6.9570312,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.234375,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.015625,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.2988281,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -25.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.7207031,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.0917969,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -25.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.0332031,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 12018,
+          "logprob": -12.078125,
+          "text": "Write"
+        },
+        {
+          "id": 528,
+          "logprob": -10.09375,
+          "text": "me"
+        },
+        {
+          "id": 264,
+          "logprob": -0.103393555,
+          "text": "a"
+        },
+        {
+          "id": 2485,
+          "logprob": -4.5742188,
+          "text": "short"
+        },
+        {
+          "id": 2838,
+          "logprob": -0.23815918,
+          "text": "story"
+        },
+        {
+          "id": 32002,
+          "logprob": -10.9765625,
+          "text": "<end_of_utterance>"
+        },
+        {
+          "id": 259,
+          "logprob": -20.34375,
+          "text": " "
+        },
+        {
+          "id": 13,
+          "logprob": -8.53125,
+          "text": "\n"
+        },
+        {
+          "id": 7226,
+          "logprob": -10.4765625,
+          "text": "Ass"
+        },
+        {
+          "id": 11143,
+          "logprob": -13.6015625,
+          "text": "istant"
+        },
+        {
+          "id": 28747,
+          "logprob": -0.008514404,
+          "text": ":"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 330,
+          "logprob": -0.09289551,
+          "special": false,
+          "text": " A"
+        },
+        {
+          "id": 13088,
+          "logprob": -0.6743164,
+          "special": false,
+          "text": " chicken"
+        },
+        {
+          "id": 349,
+          "logprob": -0.31396484,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 6398,
+          "logprob": -0.051727295,
+          "special": false,
+          "text": " sitting"
+        },
+        {
+          "id": 356,
+          "logprob": -0.34448242,
+          "special": false,
+          "text": " on"
+        },
+        {
+          "id": 264,
+          "logprob": -0.1194458,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 17972,
+          "logprob": -0.03237915,
+          "special": false,
+          "text": " pile"
+        },
+        {
+          "id": 302,
+          "logprob": -0.00018751621,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 2445,
+          "logprob": -0.07043457,
+          "special": false,
+          "text": " money"
+        },
+        {
+          "id": 28723,
+          "logprob": -0.00422287,
+          "special": false,
+          "text": "."
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " A chicken is sitting on a pile of money."
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1247,
+          "logprob": -5.2382812,
+          "text": "User"
+        },
+        {
+          "id": 28747,
+          "logprob": -6.9492188,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.234375,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.015625,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.2988281,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -25.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.7207031,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.0917969,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -25.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.0332031,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 12018,
+          "logprob": -12.078125,
+          "text": "Write"
+        },
+        {
+          "id": 528,
+          "logprob": -10.109375,
+          "text": "me"
+        },
+        {
+          "id": 264,
+          "logprob": -0.103515625,
+          "text": "a"
+        },
+        {
+          "id": 2485,
+          "logprob": -4.5664062,
+          "text": "short"
+        },
+        {
+          "id": 2838,
+          "logprob": -0.23864746,
+          "text": "story"
+        },
+        {
+          "id": 32002,
+          "logprob": -10.9609375,
+          "text": "<end_of_utterance>"
+        },
+        {
+          "id": 259,
+          "logprob": -20.34375,
+          "text": " "
+        },
+        {
+          "id": 13,
+          "logprob": -8.5546875,
+          "text": "\n"
+        },
+        {
+          "id": 7226,
+          "logprob": -10.484375,
+          "text": "Ass"
+        },
+        {
+          "id": 11143,
+          "logprob": -13.6015625,
+          "text": "istant"
+        },
+        {
+          "id": 28747,
+          "logprob": -0.008308411,
+          "text": ":"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 330,
+          "logprob": -0.09448242,
+          "special": false,
+          "text": " A"
+        },
+        {
+          "id": 13088,
+          "logprob": -0.6743164,
+          "special": false,
+          "text": " chicken"
+        },
+        {
+          "id": 349,
+          "logprob": -0.31201172,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 6398,
+          "logprob": -0.051635742,
+          "special": false,
+          "text": " sitting"
+        },
+        {
+          "id": 356,
+          "logprob": -0.34033203,
+          "special": false,
+          "text": " on"
+        },
+        {
+          "id": 264,
+          "logprob": -0.1194458,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 17972,
+          "logprob": -0.032562256,
+          "special": false,
+          "text": " pile"
+        },
+        {
+          "id": 302,
+          "logprob": -0.00018763542,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 2445,
+          "logprob": -0.07122803,
+          "special": false,
+          "text": " money"
+        },
+        {
+          "id": 28723,
+          "logprob": -0.0041007996,
+          "special": false,
+          "text": "."
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " A chicken is sitting on a pile of money."
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1247,
+          "logprob": -5.2382812,
+          "text": "User"
+        },
+        {
+          "id": 28747,
+          "logprob": -6.9492188,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.234375,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.015625,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.2988281,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -25.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.7207031,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.0917969,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -25.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.0332031,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 12018,
+          "logprob": -12.078125,
+          "text": "Write"
+        },
+        {
+          "id": 528,
+          "logprob": -10.109375,
+          "text": "me"
+        },
+        {
+          "id": 264,
+          "logprob": -0.103515625,
+          "text": "a"
+        },
+        {
+          "id": 2485,
+          "logprob": -4.5664062,
+          "text": "short"
+        },
+        {
+          "id": 2838,
+          "logprob": -0.23864746,
+          "text": "story"
+        },
+        {
+          "id": 32002,
+          "logprob": -10.9609375,
+          "text": "<end_of_utterance>"
+        },
+        {
+          "id": 259,
+          "logprob": -20.34375,
+          "text": " "
+        },
+        {
+          "id": 13,
+          "logprob": -8.5546875,
+          "text": "\n"
+        },
+        {
+          "id": 7226,
+          "logprob": -10.484375,
+          "text": "Ass"
+        },
+        {
+          "id": 11143,
+          "logprob": -13.6015625,
+          "text": "istant"
+        },
+        {
+          "id": 28747,
+          "logprob": -0.008308411,
+          "text": ":"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 330,
+          "logprob": -0.09448242,
+          "special": false,
+          "text": " A"
+        },
+        {
+          "id": 13088,
+          "logprob": -0.6743164,
+          "special": false,
+          "text": " chicken"
+        },
+        {
+          "id": 349,
+          "logprob": -0.31201172,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 6398,
+          "logprob": -0.051635742,
+          "special": false,
+          "text": " sitting"
+        },
+        {
+          "id": 356,
+          "logprob": -0.34033203,
+          "special": false,
+          "text": " on"
+        },
+        {
+          "id": 264,
+          "logprob": -0.1194458,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 17972,
+          "logprob": -0.032562256,
+          "special": false,
+          "text": " pile"
+        },
+        {
+          "id": 302,
+          "logprob": -0.00018787384,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 2445,
+          "logprob": -0.07122803,
+          "special": false,
+          "text": " money"
+        },
+        {
+          "id": 28723,
+          "logprob": -0.0041007996,
+          "special": false,
+          "text": "."
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " A chicken is sitting on a pile of money."
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1247,
+          "logprob": -5.2382812,
+          "text": "User"
+        },
+        {
+          "id": 28747,
+          "logprob": -6.9492188,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.234375,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.015625,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.2988281,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -25.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.7207031,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -23.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -3.0917969,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -25.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -22.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -18.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -21.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -16.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -17.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -20.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -19.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32001,
+          "logprob": -15.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -2.0332031,
+          "text": "<fake_token_around_image>"
+        },
+        {
+          "id": 12018,
+          "logprob": -12.078125,
+          "text": "Write"
+        },
+        {
+          "id": 528,
+          "logprob": -10.109375,
+          "text": "me"
+        },
+        {
+          "id": 264,
+          "logprob": -0.103515625,
+          "text": "a"
+        },
+        {
+          "id": 2485,
+          "logprob": -4.5664062,
+          "text": "short"
+        },
+        {
+          "id": 2838,
+          "logprob": -0.23864746,
+          "text": "story"
+        },
+        {
+          "id": 32002,
+          "logprob": -10.9609375,
+          "text": "<end_of_utterance>"
+        },
+        {
+          "id": 259,
+          "logprob": -20.34375,
+          "text": " "
+        },
+        {
+          "id": 13,
+          "logprob": -8.5546875,
+          "text": "\n"
+        },
+        {
+          "id": 7226,
+          "logprob": -10.484375,
+          "text": "Ass"
+        },
+        {
+          "id": 11143,
+          "logprob": -13.6015625,
+          "text": "istant"
+        },
+        {
+          "id": 28747,
+          "logprob": -0.008308411,
+          "text": ":"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 330,
+          "logprob": -0.09448242,
+          "special": false,
+          "text": " A"
+        },
+        {
+          "id": 13088,
+          "logprob": -0.6743164,
+          "special": false,
+          "text": " chicken"
+        },
+        {
+          "id": 349,
+          "logprob": -0.31201172,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 6398,
+          "logprob": -0.051635742,
+          "special": false,
+          "text": " sitting"
+        },
+        {
+          "id": 356,
+          "logprob": -0.34033203,
+          "special": false,
+          "text": " on"
+        },
+        {
+          "id": 264,
+          "logprob": -0.1194458,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 17972,
+          "logprob": -0.032562256,
+          "special": false,
+          "text": " pile"
+        },
+        {
+          "id": 302,
+          "logprob": -0.00018763542,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 2445,
+          "logprob": -0.07122803,
+          "special": false,
+          "text": " money"
+        },
+        {
+          "id": 28723,
+          "logprob": -0.0041007996,
+          "special": false,
+          "text": "."
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": " A chicken is sitting on a pile of money."
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_simple.json b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_simple.json
new file mode 100644
index 00000000..da2ac897
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_next_simple.json
@@ -0,0 +1,73 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 330,
+        "logprob": -0.08660889,
+        "special": false,
+        "text": " A"
+      },
+      {
+        "id": 13088,
+        "logprob": -0.7089844,
+        "special": false,
+        "text": " chicken"
+      },
+      {
+        "id": 349,
+        "logprob": -0.32885742,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 6398,
+        "logprob": -0.05126953,
+        "special": false,
+        "text": " sitting"
+      },
+      {
+        "id": 356,
+        "logprob": -0.35229492,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 264,
+        "logprob": -0.12561035,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 17972,
+        "logprob": -0.038085938,
+        "special": false,
+        "text": " pile"
+      },
+      {
+        "id": 302,
+        "logprob": -0.00018656254,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 2445,
+        "logprob": -0.07293701,
+        "special": false,
+        "text": " money"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.004852295,
+        "special": false,
+        "text": "."
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " A chicken is sitting on a pile of money."
+}
diff --git a/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json
new file mode 100644
index 00000000..44ccea71
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json
@@ -0,0 +1,127 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "eos_token",
+    "generated_tokens": 19,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 415,
+        "logprob": -0.03665161,
+        "special": false,
+        "text": " The"
+      },
+      {
+        "id": 12072,
+        "logprob": -0.13549805,
+        "special": false,
+        "text": " cow"
+      },
+      {
+        "id": 349,
+        "logprob": -0.05819702,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 6328,
+        "logprob": -0.6826172,
+        "special": false,
+        "text": " standing"
+      },
+      {
+        "id": 356,
+        "logprob": -0.1607666,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 272,
+        "logprob": -0.5073242,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 10305,
+        "logprob": -0.016418457,
+        "special": false,
+        "text": " beach"
+      },
+      {
+        "id": 304,
+        "logprob": -1.3916016,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 272,
+        "logprob": -0.020217896,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 13088,
+        "logprob": -0.0028133392,
+        "special": false,
+        "text": " chicken"
+      },
+      {
+        "id": 349,
+        "logprob": -0.003145218,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 6398,
+        "logprob": -0.37060547,
+        "special": false,
+        "text": " sitting"
+      },
+      {
+        "id": 356,
+        "logprob": -0.034851074,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 264,
+        "logprob": -0.2878418,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 17972,
+        "logprob": -0.046051025,
+        "special": false,
+        "text": " pile"
+      },
+      {
+        "id": 302,
+        "logprob": -0.00028848648,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 2445,
+        "logprob": -0.025772095,
+        "special": false,
+        "text": " money"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.018127441,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 32002,
+        "logprob": -0.00019824505,
+        "special": true,
+        "text": "<end_of_utterance>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " The cow is standing on the beach and the chicken is sitting on a pile of money."
+}
diff --git a/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_all_params.json b/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_all_params.json
new file mode 100644
index 00000000..e9d3e5ef
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_all_params.json
@@ -0,0 +1,65 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "stop_sequence",
+    "generated_tokens": 6,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 3735,
+        "logprob": -10.5,
+        "text": "Test"
+      },
+      {
+        "id": 2159,
+        "logprob": -12.140625,
+        "text": "request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -1.0654297,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 1014,
+        "logprob": -2.7460938,
+        "special": false,
+        "text": "The"
+      },
+      {
+        "id": 6032,
+        "logprob": -1.359375,
+        "special": false,
+        "text": " purpose"
+      },
+      {
+        "id": 302,
+        "logprob": 0.0,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 456,
+        "logprob": 0.0,
+        "special": false,
+        "text": " this"
+      },
+      {
+        "id": 1369,
+        "logprob": -0.40063477,
+        "special": false,
+        "text": " test"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request\nThe purpose of this test"
+}
diff --git a/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_load.json b/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_load.json
new file mode 100644
index 00000000..2007c0f2
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_load.json
@@ -0,0 +1,59178 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1247,
+          "logprob": -2.3886719,
+          "text": "User"
+        },
+        {
+          "id": 28747,
+          "logprob": -12.328125,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 2418,
+          "logprob": -19.0625,
+          "text": "Can"
+        },
+        {
+          "id": 368,
+          "logprob": -0.19726562,
+          "text": "you"
+        },
+        {
+          "id": 1912,
+          "logprob": -1.4990234,
+          "text": "tell"
+        },
+        {
+          "id": 528,
+          "logprob": -0.31152344,
+          "text": "me"
+        },
+        {
+          "id": 264,
+          "logprob": -2.6367188,
+          "text": "a"
+        },
+        {
+          "id": 1215,
+          "logprob": -9.1015625,
+          "text": "very"
+        },
+        {
+          "id": 2485,
+          "logprob": -0.9941406,
+          "text": "short"
+        },
+        {
+          "id": 2838,
+          "logprob": -0.46118164,
+          "text": "story"
+        },
+        {
+          "id": 2818,
+          "logprob": -3.3183594,
+          "text": "based"
+        },
+        {
+          "id": 356,
+          "logprob": -0.029129028,
+          "text": "on"
+        },
+        {
+          "id": 272,
+          "logprob": -0.9902344,
+          "text": "the"
+        },
+        {
+          "id": 3469,
+          "logprob": -0.29052734,
+          "text": "image"
+        },
+        {
+          "id": 28804,
+          "logprob": -0.43188477,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -0.0076828003,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -0.20092773,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 16114,
+          "logprob": -1.2587891,
+          "special": false,
+          "text": "Once"
+        },
+        {
+          "id": 3714,
+          "logprob": -0.20861816,
+          "special": false,
+          "text": " upon"
+        },
+        {
+          "id": 264,
+          "logprob": -0.0017719269,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 727,
+          "logprob": -0.011909485,
+          "special": false,
+          "text": " time"
+        },
+        {
+          "id": 28725,
+          "logprob": -0.17529297,
+          "special": false,
+          "text": ","
+        },
+        {
+          "id": 736,
+          "logprob": -0.9082031,
+          "special": false,
+          "text": " there"
+        },
+        {
+          "id": 403,
+          "logprob": -0.057525635,
+          "special": false,
+          "text": " was"
+        },
+        {
+          "id": 264,
+          "logprob": -0.009651184,
+          "special": false,
+          "text": " a"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nOnce upon a time, there was a"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1247,
+          "logprob": -2.3886719,
+          "text": "User"
+        },
+        {
+          "id": 28747,
+          "logprob": -12.328125,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 2418,
+          "logprob": -19.0625,
+          "text": "Can"
+        },
+        {
+          "id": 368,
+          "logprob": -0.19726562,
+          "text": "you"
+        },
+        {
+          "id": 1912,
+          "logprob": -1.4990234,
+          "text": "tell"
+        },
+        {
+          "id": 528,
+          "logprob": -0.31152344,
+          "text": "me"
+        },
+        {
+          "id": 264,
+          "logprob": -2.6367188,
+          "text": "a"
+        },
+        {
+          "id": 1215,
+          "logprob": -9.1015625,
+          "text": "very"
+        },
+        {
+          "id": 2485,
+          "logprob": -0.9941406,
+          "text": "short"
+        },
+        {
+          "id": 2838,
+          "logprob": -0.46118164,
+          "text": "story"
+        },
+        {
+          "id": 2818,
+          "logprob": -3.3183594,
+          "text": "based"
+        },
+        {
+          "id": 356,
+          "logprob": -0.029129028,
+          "text": "on"
+        },
+        {
+          "id": 272,
+          "logprob": -0.9902344,
+          "text": "the"
+        },
+        {
+          "id": 3469,
+          "logprob": -0.29052734,
+          "text": "image"
+        },
+        {
+          "id": 28804,
+          "logprob": -0.43188477,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -0.0076828003,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -0.19958496,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 16114,
+          "logprob": -1.2587891,
+          "special": false,
+          "text": "Once"
+        },
+        {
+          "id": 3714,
+          "logprob": -0.20861816,
+          "special": false,
+          "text": " upon"
+        },
+        {
+          "id": 264,
+          "logprob": -0.0017719269,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 727,
+          "logprob": -0.011749268,
+          "special": false,
+          "text": " time"
+        },
+        {
+          "id": 28725,
+          "logprob": -0.17529297,
+          "special": false,
+          "text": ","
+        },
+        {
+          "id": 736,
+          "logprob": -0.9086914,
+          "special": false,
+          "text": " there"
+        },
+        {
+          "id": 403,
+          "logprob": -0.056732178,
+          "special": false,
+          "text": " was"
+        },
+        {
+          "id": 264,
+          "logprob": -0.00970459,
+          "special": false,
+          "text": " a"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nOnce upon a time, there was a"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1247,
+          "logprob": -2.3886719,
+          "text": "User"
+        },
+        {
+          "id": 28747,
+          "logprob": -12.328125,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 2418,
+          "logprob": -19.0625,
+          "text": "Can"
+        },
+        {
+          "id": 368,
+          "logprob": -0.19726562,
+          "text": "you"
+        },
+        {
+          "id": 1912,
+          "logprob": -1.4990234,
+          "text": "tell"
+        },
+        {
+          "id": 528,
+          "logprob": -0.31152344,
+          "text": "me"
+        },
+        {
+          "id": 264,
+          "logprob": -2.6367188,
+          "text": "a"
+        },
+        {
+          "id": 1215,
+          "logprob": -9.1015625,
+          "text": "very"
+        },
+        {
+          "id": 2485,
+          "logprob": -0.9941406,
+          "text": "short"
+        },
+        {
+          "id": 2838,
+          "logprob": -0.46118164,
+          "text": "story"
+        },
+        {
+          "id": 2818,
+          "logprob": -3.3183594,
+          "text": "based"
+        },
+        {
+          "id": 356,
+          "logprob": -0.029129028,
+          "text": "on"
+        },
+        {
+          "id": 272,
+          "logprob": -0.9902344,
+          "text": "the"
+        },
+        {
+          "id": 3469,
+          "logprob": -0.29052734,
+          "text": "image"
+        },
+        {
+          "id": 28804,
+          "logprob": -0.43188477,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -0.0076828003,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -0.20092773,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 16114,
+          "logprob": -1.2587891,
+          "special": false,
+          "text": "Once"
+        },
+        {
+          "id": 3714,
+          "logprob": -0.20861816,
+          "special": false,
+          "text": " upon"
+        },
+        {
+          "id": 264,
+          "logprob": -0.0017719269,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 727,
+          "logprob": -0.011909485,
+          "special": false,
+          "text": " time"
+        },
+        {
+          "id": 28725,
+          "logprob": -0.17529297,
+          "special": false,
+          "text": ","
+        },
+        {
+          "id": 736,
+          "logprob": -0.9082031,
+          "special": false,
+          "text": " there"
+        },
+        {
+          "id": 403,
+          "logprob": -0.057525635,
+          "special": false,
+          "text": " was"
+        },
+        {
+          "id": 264,
+          "logprob": -0.009651184,
+          "special": false,
+          "text": " a"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nOnce upon a time, there was a"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 1247,
+          "logprob": -2.3886719,
+          "text": "User"
+        },
+        {
+          "id": 28747,
+          "logprob": -12.328125,
+          "text": ":"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -9.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -19.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.21875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.46875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.8671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.25,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.2421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.53125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.6953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1640625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.84375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -18.0,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.15625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6015625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.1171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.1328125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -19.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.859375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.4921875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.28125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.96875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.1484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.1796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.2890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.6171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.75,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.5625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9453125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -10.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6484375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5703125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.1953125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.7421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.09375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.0546875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.59375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.3515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.90625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.671875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.2265625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.78125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.6875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.796875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.03125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.515625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.7734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.4609375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.171875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.4375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.2734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.3984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.578125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.3359375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.984375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.421875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.34375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.8828125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.890625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.3203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.2109375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9765625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.0078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.0390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -14.40625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.046875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.8203125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.5078125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.734375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -13.390625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.3125,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5234375,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -17.625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -11.9296875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.71875,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -15.9140625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -16.65625,
+          "text": "<image>"
+        },
+        {
+          "id": 32000,
+          "logprob": -12.5,
+          "text": "<image>"
+        },
+        {
+          "id": 2418,
+          "logprob": -19.0625,
+          "text": "Can"
+        },
+        {
+          "id": 368,
+          "logprob": -0.19726562,
+          "text": "you"
+        },
+        {
+          "id": 1912,
+          "logprob": -1.4990234,
+          "text": "tell"
+        },
+        {
+          "id": 528,
+          "logprob": -0.31152344,
+          "text": "me"
+        },
+        {
+          "id": 264,
+          "logprob": -2.6367188,
+          "text": "a"
+        },
+        {
+          "id": 1215,
+          "logprob": -9.1015625,
+          "text": "very"
+        },
+        {
+          "id": 2485,
+          "logprob": -0.9941406,
+          "text": "short"
+        },
+        {
+          "id": 2838,
+          "logprob": -0.46118164,
+          "text": "story"
+        },
+        {
+          "id": 2818,
+          "logprob": -3.3183594,
+          "text": "based"
+        },
+        {
+          "id": 356,
+          "logprob": -0.029129028,
+          "text": "on"
+        },
+        {
+          "id": 272,
+          "logprob": -0.9902344,
+          "text": "the"
+        },
+        {
+          "id": 3469,
+          "logprob": -0.29052734,
+          "text": "image"
+        },
+        {
+          "id": 28804,
+          "logprob": -0.43188477,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -0.0076828003,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 13,
+          "logprob": -0.19958496,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 16114,
+          "logprob": -1.2587891,
+          "special": false,
+          "text": "Once"
+        },
+        {
+          "id": 3714,
+          "logprob": -0.20861816,
+          "special": false,
+          "text": " upon"
+        },
+        {
+          "id": 264,
+          "logprob": -0.0017719269,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 727,
+          "logprob": -0.011749268,
+          "special": false,
+          "text": " time"
+        },
+        {
+          "id": 28725,
+          "logprob": -0.17529297,
+          "special": false,
+          "text": ","
+        },
+        {
+          "id": 736,
+          "logprob": -0.9086914,
+          "special": false,
+          "text": " there"
+        },
+        {
+          "id": 403,
+          "logprob": -0.056732178,
+          "special": false,
+          "text": " was"
+        },
+        {
+          "id": 264,
+          "logprob": -0.00970459,
+          "special": false,
+          "text": " a"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nOnce upon a time, there was a"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_simple.json b/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_simple.json
new file mode 100644
index 00000000..f0f2ee9e
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_llava_next/test_flash_llava_next_simple.json
@@ -0,0 +1,73 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -0.00756073,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 13,
+        "logprob": -0.20117188,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 16114,
+        "logprob": -1.2597656,
+        "special": false,
+        "text": "Once"
+      },
+      {
+        "id": 3714,
+        "logprob": -0.20825195,
+        "special": false,
+        "text": " upon"
+      },
+      {
+        "id": 264,
+        "logprob": -0.00178051,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 727,
+        "logprob": -0.011955261,
+        "special": false,
+        "text": " time"
+      },
+      {
+        "id": 28725,
+        "logprob": -0.17541504,
+        "special": false,
+        "text": ","
+      },
+      {
+        "id": 736,
+        "logprob": -0.91308594,
+        "special": false,
+        "text": " there"
+      },
+      {
+        "id": 403,
+        "logprob": -0.058410645,
+        "special": false,
+        "text": " was"
+      },
+      {
+        "id": 264,
+        "logprob": -0.009689331,
+        "special": false,
+        "text": " a"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\n\nOnce upon a time, there was a"
+}
diff --git a/integration-tests/models/__snapshots__/test_mamba/test_mamba.json b/integration-tests/models/__snapshots__/test_mamba/test_mamba.json
new file mode 100644
index 00000000..eaba5078
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_mamba/test_mamba.json
@@ -0,0 +1,73 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 187,
+        "logprob": -0.37890625,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 187,
+        "logprob": -0.26953125,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 30763,
+        "logprob": -1.1953125,
+        "special": false,
+        "text": "Deep"
+      },
+      {
+        "id": 4715,
+        "logprob": -0.53515625,
+        "special": false,
+        "text": " learning"
+      },
+      {
+        "id": 310,
+        "logprob": -0.625,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 247,
+        "logprob": -0.6796875,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 747,
+        "logprob": -2.0,
+        "special": false,
+        "text": " new"
+      },
+      {
+        "id": 1511,
+        "logprob": -2.3125,
+        "special": false,
+        "text": " type"
+      },
+      {
+        "id": 273,
+        "logprob": -0.0028533936,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 5145,
+        "logprob": -1.265625,
+        "special": false,
+        "text": " machine"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\n\nDeep learning is a new type of machine"
+}
diff --git a/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json b/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json
new file mode 100644
index 00000000..85e9a9e0
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_mamba/test_mamba_all_params.json
@@ -0,0 +1,99 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 2502,
+        "logprob": null,
+        "text": " red"
+      },
+      {
+        "id": 13,
+        "logprob": -2.734375,
+        "text": ","
+      },
+      {
+        "id": 8862,
+        "logprob": -3.6875,
+        "text": " yellow"
+      },
+      {
+        "id": 13,
+        "logprob": -0.40234375,
+        "text": ","
+      },
+      {
+        "id": 209,
+        "logprob": -8.25,
+        "text": " "
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 187,
+        "logprob": 0.0,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 395,
+        "logprob": -0.3125,
+        "special": false,
+        "text": "and"
+      },
+      {
+        "id": 4797,
+        "logprob": 0.0,
+        "special": false,
+        "text": " blue"
+      },
+      {
+        "id": 9830,
+        "logprob": -1.65625,
+        "special": false,
+        "text": " colors"
+      },
+      {
+        "id": 15,
+        "logprob": 0.0,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 329,
+        "logprob": -2.4375,
+        "special": false,
+        "text": " A"
+      },
+      {
+        "id": 1180,
+        "logprob": -1.953125,
+        "special": false,
+        "text": " number"
+      },
+      {
+        "id": 273,
+        "logprob": 0.0,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 1027,
+        "logprob": -1.5546875,
+        "special": false,
+        "text": " different"
+      },
+      {
+        "id": 3295,
+        "logprob": -0.97265625,
+        "special": false,
+        "text": " color"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "blue, red, yellow, \nand blue colors. A number of different color"
+}
diff --git a/integration-tests/models/__snapshots__/test_mamba/test_mamba_load.json b/integration-tests/models/__snapshots__/test_mamba/test_mamba_load.json
new file mode 100644
index 00000000..4921c14b
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_mamba/test_mamba_load.json
@@ -0,0 +1,398 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -0.83984375,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -12.8125,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -2.84375,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -1.25,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 187,
+          "logprob": -0.37890625,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 187,
+          "logprob": -0.4296875,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30763,
+          "logprob": -1.078125,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 4715,
+          "logprob": -0.515625,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 310,
+          "logprob": -0.6015625,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 247,
+          "logprob": -0.65625,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 747,
+          "logprob": -2.109375,
+          "special": false,
+          "text": " new"
+        },
+        {
+          "id": 1511,
+          "logprob": -2.328125,
+          "special": false,
+          "text": " type"
+        },
+        {
+          "id": 273,
+          "logprob": -0.0032653809,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 5145,
+          "logprob": -1.28125,
+          "special": false,
+          "text": " machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a new type of machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -0.80078125,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -13.25,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -2.828125,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -1.1953125,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 187,
+          "logprob": -0.296875,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 187,
+          "logprob": -0.3359375,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30763,
+          "logprob": -1.2578125,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 4715,
+          "logprob": -0.5546875,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 310,
+          "logprob": -0.62890625,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 247,
+          "logprob": -0.64453125,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 747,
+          "logprob": -2.078125,
+          "special": false,
+          "text": " new"
+        },
+        {
+          "id": 1511,
+          "logprob": -2.28125,
+          "special": false,
+          "text": " type"
+        },
+        {
+          "id": 273,
+          "logprob": -0.0030670166,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 5145,
+          "logprob": -1.3125,
+          "special": false,
+          "text": " machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a new type of machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -0.80078125,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -13.25,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -2.828125,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -1.1953125,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 187,
+          "logprob": -0.296875,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 187,
+          "logprob": -0.3359375,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30763,
+          "logprob": -1.2578125,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 4715,
+          "logprob": -0.5546875,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 310,
+          "logprob": -0.62890625,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 247,
+          "logprob": -0.64453125,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 747,
+          "logprob": -2.078125,
+          "special": false,
+          "text": " new"
+        },
+        {
+          "id": 1511,
+          "logprob": -2.28125,
+          "special": false,
+          "text": " type"
+        },
+        {
+          "id": 273,
+          "logprob": -0.0030670166,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 5145,
+          "logprob": -1.3125,
+          "special": false,
+          "text": " machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a new type of machine"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1276,
+          "logprob": null,
+          "text": "What"
+        },
+        {
+          "id": 310,
+          "logprob": -0.80078125,
+          "text": " is"
+        },
+        {
+          "id": 18147,
+          "logprob": -13.25,
+          "text": " Deep"
+        },
+        {
+          "id": 20727,
+          "logprob": -2.828125,
+          "text": " Learning"
+        },
+        {
+          "id": 32,
+          "logprob": -1.1953125,
+          "text": "?"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 187,
+          "logprob": -0.296875,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 187,
+          "logprob": -0.3359375,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30763,
+          "logprob": -1.2578125,
+          "special": false,
+          "text": "Deep"
+        },
+        {
+          "id": 4715,
+          "logprob": -0.5546875,
+          "special": false,
+          "text": " learning"
+        },
+        {
+          "id": 310,
+          "logprob": -0.62890625,
+          "special": false,
+          "text": " is"
+        },
+        {
+          "id": 247,
+          "logprob": -0.64453125,
+          "special": false,
+          "text": " a"
+        },
+        {
+          "id": 747,
+          "logprob": -2.078125,
+          "special": false,
+          "text": " new"
+        },
+        {
+          "id": 1511,
+          "logprob": -2.28125,
+          "special": false,
+          "text": " type"
+        },
+        {
+          "id": 273,
+          "logprob": -0.0030670166,
+          "special": false,
+          "text": " of"
+        },
+        {
+          "id": 5145,
+          "logprob": -1.3125,
+          "special": false,
+          "text": " machine"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\n\nDeep learning is a new type of machine"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
index 024823d0..5cacf3e9 100644
--- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
+++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json
@@ -1,8 +1,8 @@
 {
   "details": {
     "best_of_sequences": null,
-    "finish_reason": "eos_token",
-    "generated_tokens": 9,
+    "finish_reason": "length",
+    "generated_tokens": 10,
     "prefill": [
       {
         "id": 0,
@@ -14,7 +14,7 @@
     "tokens": [
       {
         "id": 16017,
-        "logprob": -0.30908203,
+        "logprob": 0.0,
         "special": false,
         "text": " blue"
       },
@@ -26,39 +26,45 @@
       },
       {
         "id": 259,
-        "logprob": -0.28271484,
+        "logprob": -0.4716797,
         "special": false,
         "text": " "
       },
       {
-        "id": 15484,
-        "logprob": -1.7929688,
+        "id": 261,
+        "logprob": -0.044677734,
         "special": false,
-        "text": "appear"
+        "text": ","
       },
       {
-        "id": 345,
-        "logprob": -0.8935547,
+        "id": 35622,
+        "logprob": -0.79589844,
         "special": false,
-        "text": "ed"
+        "text": " cloud"
       },
       {
-        "id": 281,
+        "id": 263,
+        "logprob": -1.2958984,
+        "special": false,
+        "text": "s"
+      },
+      {
+        "id": 305,
         "logprob": 0.0,
         "special": false,
-        "text": " in"
+        "text": " and"
       },
       {
-        "id": 287,
+        "id": 35622,
+        "logprob": -1.1630859,
+        "special": false,
+        "text": " cloud"
+      },
+      {
+        "id": 263,
         "logprob": 0.0,
         "special": false,
-        "text": " the"
-      },
-      {
-        "id": 20495,
-        "logprob": -0.32299805,
-        "special": false,
-        "text": " sky"
+        "text": "s"
       },
       {
         "id": 1,
@@ -66,7 +72,8 @@
         "special": true,
         "text": "</s>"
       }
-    ]
+    ],
+    "top_tokens": null
   },
-  "generated_text": "Why is the sky blue?blue sky appeared in the sky"
+  "generated_text": "Why is the sky blue?blue sky, clouds and clouds"
 }
diff --git a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json
new file mode 100644
index 00000000..69c1f47d
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -9.8359375,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -9.6171875,
+        "text": "request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 13,
+        "logprob": -2.3417969,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 3057,
+        "logprob": -1.8730469,
+        "special": false,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -1.2626953,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 13,
+        "logprob": -1.7060547,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 3057,
+        "logprob": -1.4482422,
+        "special": false,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -0.15246582,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 13,
+        "logprob": -0.796875,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 3057,
+        "logprob": -0.22766113,
+        "special": false,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -0.007045746,
+        "special": false,
+        "text": " request"
+      },
+      {
+        "id": 13,
+        "logprob": -0.021759033,
+        "special": false,
+        "text": "\n"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "\nTest request\nTest request\nTest request\n"
+}
diff --git a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json
new file mode 100644
index 00000000..9b5ee9ee
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -9.7890625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -9.625,
+        "text": "request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 29899,
+        "logprob": -1.4980469,
+        "special": false,
+        "text": "-"
+      },
+      {
+        "id": 1454,
+        "logprob": -0.19433594,
+        "special": false,
+        "text": "for"
+      },
+      {
+        "id": 29899,
+        "logprob": 0.0,
+        "special": false,
+        "text": "-"
+      },
+      {
+        "id": 9342,
+        "logprob": 0.0,
+        "special": false,
+        "text": "comment"
+      },
+      {
+        "id": 29901,
+        "logprob": 0.0,
+        "special": false,
+        "text": ":"
+      },
+      {
+        "id": 396,
+        "logprob": -0.27392578,
+        "special": false,
+        "text": " #"
+      },
+      {
+        "id": 29906,
+        "logprob": -0.49389648,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 29900,
+        "logprob": -0.81103516,
+        "special": false,
+        "text": "0"
+      },
+      {
+        "id": 29896,
+        "logprob": 0.0,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 29955,
+        "logprob": -1.0800781,
+        "special": false,
+        "text": "7"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request-for-comment: #2017"
+}
diff --git a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json
new file mode 100644
index 00000000..df975635
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json
@@ -0,0 +1,358 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -9.8828125,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -9.5859375,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.3359375,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.8623047,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -1.2451172,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -1.6923828,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.4492188,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.15197754,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.8022461,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -0.22583008,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.007095337,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.021652222,
+          "special": false,
+          "text": "\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nTest request\nTest request\nTest request\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -9.796875,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -9.625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.3476562,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.8789062,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -1.2734375,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -1.703125,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.4677734,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.15454102,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.7973633,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -0.23278809,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.006980896,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.022033691,
+          "special": false,
+          "text": "\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nTest request\nTest request\nTest request\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -9.9296875,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -9.5703125,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.3203125,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.8486328,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -1.2480469,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -1.7060547,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.4511719,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.1529541,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.81396484,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -0.22180176,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.007133484,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.021835327,
+          "special": false,
+          "text": "\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nTest request\nTest request\nTest request\n"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -9.84375,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -9.6171875,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 13,
+          "logprob": -2.3261719,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.8691406,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -1.2597656,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -1.7070312,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -1.4550781,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.1538086,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.79345703,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 3057,
+          "logprob": -0.22924805,
+          "special": false,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -0.0070266724,
+          "special": false,
+          "text": " request"
+        },
+        {
+          "id": 13,
+          "logprob": -0.021942139,
+          "special": false,
+          "text": "\n"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "\nTest request\nTest request\nTest request\n"
+  }
+]
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
new file mode 100644
index 00000000..a4c34a10
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
@@ -0,0 +1,39 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": null,
+        "name": null,
+        "role": "assistant",
+        "tool_calls": [
+          {
+            "function": {
+              "arguments": {
+                "format": "celsius",
+                "location": "Brooklyn"
+              },
+              "description": null,
+              "name": "get_current_weather"
+            },
+            "id": 0,
+            "type": "function"
+          }
+        ]
+      },
+      "usage": null
+    }
+  ],
+  "created": 1712782670,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.1-native",
+  "usage": {
+    "completion_tokens": 37,
+    "prompt_tokens": 524,
+    "total_tokens": 561
+  }
+}
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
new file mode 100644
index 00000000..04bcdc4e
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
@@ -0,0 +1,39 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": null,
+        "name": null,
+        "role": "assistant",
+        "tool_calls": [
+          {
+            "function": {
+              "arguments": {
+                "format": "celsius",
+                "location": "Brooklyn"
+              },
+              "description": null,
+              "name": "get_current_weather"
+            },
+            "id": 0,
+            "type": "function"
+          }
+        ]
+      },
+      "usage": null
+    }
+  ],
+  "created": 1712787937,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.1-native",
+  "usage": {
+    "completion_tokens": 37,
+    "prompt_tokens": 524,
+    "total_tokens": 561
+  }
+}
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
new file mode 100644
index 00000000..603c90af
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
@@ -0,0 +1,39 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": null,
+        "name": null,
+        "role": "assistant",
+        "tool_calls": [
+          {
+            "function": {
+              "arguments": {
+                "format": "celsius",
+                "location": "New York, NY"
+              },
+              "description": null,
+              "name": "get_current_weather"
+            },
+            "id": 0,
+            "type": "function"
+          }
+        ]
+      },
+      "usage": null
+    }
+  ],
+  "created": 1712852394,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.1-native",
+  "usage": {
+    "completion_tokens": 48,
+    "prompt_tokens": 320,
+    "total_tokens": 368
+  }
+}
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
new file mode 100644
index 00000000..0cd3c67f
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_insufficient_information.json
@@ -0,0 +1,38 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": null,
+        "name": null,
+        "role": "assistant",
+        "tool_calls": [
+          {
+            "function": {
+              "arguments": {
+                "error": "Cannot get current weather forecast from specified location and temperature unit. Please try again with different options."
+              },
+              "description": null,
+              "name": "notify_error"
+            },
+            "id": 0,
+            "type": "function"
+          }
+        ]
+      },
+      "usage": null
+    }
+  ],
+  "created": 1712852597,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "1.4.5-native",
+  "usage": {
+    "completion_tokens": 39,
+    "prompt_tokens": 496,
+    "total_tokens": 535
+  }
+}
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
new file mode 100644
index 00000000..f72a5d38
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
@@ -0,0 +1,27 @@
+{
+  "choices": [
+    {
+      "delta": {
+        "content": null,
+        "role": "assistant",
+        "tool_calls": {
+          "function": {
+            "arguments": "</s>",
+            "name": null
+          },
+          "id": "",
+          "index": 0,
+          "type": "function"
+        }
+      },
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null
+    }
+  ],
+  "created": 1712788218,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.1-native"
+}
diff --git a/integration-tests/models/test_bloom_560m.py b/integration-tests/models/test_bloom_560m.py
index bdcbdc78..d413519e 100644
--- a/integration-tests/models/test_bloom_560m.py
+++ b/integration-tests/models/test_bloom_560m.py
@@ -13,6 +13,7 @@ async def bloom_560(bloom_560_handle):
     return bloom_560_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m(bloom_560, response_snapshot):
     response = await bloom_560.generate(
@@ -27,6 +28,7 @@ async def test_bloom_560m(bloom_560, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m_all_params(bloom_560, response_snapshot):
     response = await bloom_560.generate(
@@ -49,6 +51,7 @@ async def test_bloom_560m_all_params(bloom_560, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m_load(bloom_560, generate_load, response_snapshot):
     responses = await generate_load(
diff --git a/integration-tests/models/test_bloom_560m_sharded.py b/integration-tests/models/test_bloom_560m_sharded.py
index 3995f9e5..f9e8ed9c 100644
--- a/integration-tests/models/test_bloom_560m_sharded.py
+++ b/integration-tests/models/test_bloom_560m_sharded.py
@@ -13,6 +13,7 @@ async def bloom_560m_sharded(bloom_560m_sharded_handle):
     return bloom_560m_sharded_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot):
     response = await bloom_560m_sharded.generate(
@@ -27,6 +28,7 @@ async def test_bloom_560m_sharded(bloom_560m_sharded, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_bloom_560m_sharded_load(
     bloom_560m_sharded, generate_load, response_snapshot
diff --git a/integration-tests/models/test_chat_llama.py b/integration-tests/models/test_chat_llama.py
new file mode 100644
index 00000000..10df6dbd
--- /dev/null
+++ b/integration-tests/models/test_chat_llama.py
@@ -0,0 +1,43 @@
+import pytest
+import json
+
+from text_generation.types import GrammarType
+
+
+@pytest.fixture(scope="module")
+def flash_llama_chat_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0", num_shard=2, disable_grammar_support=False
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_chat(flash_llama_chat_handle):
+    await flash_llama_chat_handle.health(300)
+    return flash_llama_chat_handle.client
+
+
+@pytest.mark.private
+async def test_flash_llama_simple(flash_llama_chat, response_snapshot):
+    response = await flash_llama_chat.chat(
+        max_tokens=100,
+        seed=1,
+        messages=[
+            {
+                "role": "system",
+                "content": "Youre a helpful assistant! Answer the users question best you can.",
+            },
+            {
+                "role": "user",
+                "content": "What is the weather like in Brooklyn, New York?",
+            },
+        ],
+    )
+
+    print(repr(response.choices[0].message.content))
+    assert (
+        response.choices[0].message.content
+        == "As of your last question, the weather in Brooklyn, New York, is typically hot and humid throughout the year. The suburbs around New York City are jealously sheltered, and at least in the Lower Bronx, there are very few outdoor environments to explore in the middle of urban confines. In fact, typical times for humidity levels in Brooklyn include:\n\n- Early morning: 80-85% humidity, with occas"
+    )
+    assert response == response_snapshot
diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py
new file mode 100644
index 00000000..0efb6693
--- /dev/null
+++ b/integration-tests/models/test_completion_prompts.py
@@ -0,0 +1,112 @@
+import pytest
+import requests
+import json
+from aiohttp import ClientSession
+
+from text_generation.types import (
+    Completion,
+)
+
+
+@pytest.fixture(scope="module")
+def flash_llama_completion_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_completion(flash_llama_completion_handle):
+    await flash_llama_completion_handle.health(300)
+    return flash_llama_completion_handle.client
+
+
+# NOTE: since `v1/completions` is a deprecated interface/endpoint we do not provide a convenience
+# method for it. Instead, we use the `requests` library to make the HTTP request directly.
+
+
+@pytest.mark.release
+def test_flash_llama_completion_single_prompt(
+    flash_llama_completion, response_snapshot
+):
+    response = requests.post(
+        f"{flash_llama_completion.base_url}/v1/completions",
+        json={
+            "model": "tgi",
+            "prompt": "Say this is a test",
+            "max_tokens": 5,
+            "seed": 0,
+        },
+        headers=flash_llama_completion.headers,
+        stream=False,
+    )
+    response = response.json()
+    assert len(response["choices"]) == 1
+
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
+    response = requests.post(
+        f"{flash_llama_completion.base_url}/v1/completions",
+        json={
+            "model": "tgi",
+            "prompt": ["Say", "this", "is", "a"],
+            "max_tokens": 10,
+            "seed": 0,
+        },
+        headers=flash_llama_completion.headers,
+        stream=False,
+    )
+    response = response.json()
+    assert len(response["choices"]) == 4
+
+    all_indexes = [choice["index"] for choice in response["choices"]]
+    all_indexes.sort()
+    assert all_indexes == [0, 1, 2, 3]
+
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+async def test_flash_llama_completion_many_prompts_stream(
+    flash_llama_completion, response_snapshot
+):
+    request = {
+        "model": "tgi",
+        "prompt": [
+            "What color is the sky?",
+            "Is water wet?",
+            "What is the capital of France?",
+            "def mai",
+        ],
+        "max_tokens": 10,
+        "seed": 0,
+        "stream": True,
+    }
+
+    url = f"{flash_llama_completion.base_url}/v1/completions"
+
+    chunks = []
+    async with ClientSession(headers=flash_llama_completion.headers) as session:
+        async with session.post(url, json=request) as response:
+            # iterate over the stream
+            async for chunk in response.content.iter_any():
+                # split the decoded payload on blank lines (one entry per event)
+                chunk = chunk.decode().split("\n\n")
+                # remove "data:" if present
+                chunk = [c.replace("data:", "") for c in chunk]
+                # remove empty strings
+                chunk = [c for c in chunk if c]
+                # parse json
+                chunk = [json.loads(c) for c in chunk]
+
+                for c in chunk:
+                    assert "choices" in c
+                    assert 0 <= c["choices"][0]["index"] < len(request["prompt"])
+                    chunks.append(Completion(**c))
+
+    assert response.status == 200
+    assert chunks == response_snapshot
diff --git a/integration-tests/models/test_flash_awq.py b/integration-tests/models/test_flash_awq.py
new file mode 100644
index 00000000..b500b15d
--- /dev/null
+++ b/integration-tests/models/test_flash_awq.py
@@ -0,0 +1,73 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_llama_awq_handle(launcher):
+    with launcher(
+        "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq",
+        num_shard=1,
+        quantize="awq",
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_awq(flash_llama_awq_handle):
+    await flash_llama_awq_handle.health(300)
+    return flash_llama_awq_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
+        "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert (
+        response.generated_text
+        == "\nWhat is the difference between Deep Learning and Machine"
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
+        "What is Deep Learning?",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_llama_awq_load(flash_llama_awq, generate_load, response_snapshot):
+    responses = await generate_load(
+        flash_llama_awq, "What is Deep Learning?", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all(
+        [
+            r.generated_text
+            == "\nWhat is the difference between Deep Learning and Machine"
+            for r in responses
+        ]
+    )
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_awq_sharded.py b/integration-tests/models/test_flash_awq_sharded.py
new file mode 100644
index 00000000..4cf9b171
--- /dev/null
+++ b/integration-tests/models/test_flash_awq_sharded.py
@@ -0,0 +1,53 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_llama_awq_handle_sharded(launcher):
+    with launcher(
+        "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq",
+        num_shard=2,
+        quantize="awq",
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded):
+    await flash_llama_awq_handle_sharded.health(300)
+    return flash_llama_awq_handle_sharded.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapshot):
+    response = await flash_llama_awq_sharded.generate(
+        "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert (
+        response.generated_text
+        == "\nWhat is the difference between Deep Learning and Machine"
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_llama_awq_load_sharded(
+    flash_llama_awq_sharded, generate_load, response_snapshot
+):
+    responses = await generate_load(
+        flash_llama_awq_sharded, "What is Deep Learning?", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all(
+        [
+            r.generated_text
+            == "\nWhat is the difference between Deep Learning and Machine"
+            for r in responses
+        ]
+    )
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_falcon.py b/integration-tests/models/test_flash_falcon.py
index eac91984..0fb40fe7 100644
--- a/integration-tests/models/test_flash_falcon.py
+++ b/integration-tests/models/test_flash_falcon.py
@@ -13,6 +13,7 @@ async def flash_falcon(flash_falcon_handle):
     return flash_falcon_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_falcon(flash_falcon, response_snapshot):
@@ -26,6 +27,7 @@ async def test_flash_falcon(flash_falcon, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_falcon_all_params(flash_falcon, response_snapshot):
@@ -49,6 +51,7 @@ async def test_flash_falcon_all_params(flash_falcon, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_falcon_load(flash_falcon, generate_load, response_snapshot):
diff --git a/integration-tests/models/test_flash_gemma.py b/integration-tests/models/test_flash_gemma.py
new file mode 100644
index 00000000..7bee8dea
--- /dev/null
+++ b/integration-tests/models/test_flash_gemma.py
@@ -0,0 +1,61 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_gemma_handle(launcher):
+    with launcher("google/gemma-2b", num_shard=1) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_gemma(flash_gemma_handle):
+    await flash_gemma_handle.health(300)
+    return flash_gemma_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_gemma(flash_gemma, response_snapshot):
+    response = await flash_gemma.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_gemma_all_params(flash_gemma, response_snapshot):
+    response = await flash_gemma.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_gemma_load(flash_gemma, generate_load, response_snapshot):
+    responses = await generate_load(flash_gemma, "Test request", max_new_tokens=10, n=4)
+
+    assert len(responses) == 4
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_gemma_gptq.py b/integration-tests/models/test_flash_gemma_gptq.py
new file mode 100644
index 00000000..79d4cf24
--- /dev/null
+++ b/integration-tests/models/test_flash_gemma_gptq.py
@@ -0,0 +1,67 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_gemma_gptq_handle(launcher):
+    with launcher("TechxGenus/gemma-2b-GPTQ", num_shard=1, quantize="gptq") as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_gemma_gptq(flash_gemma_gptq_handle):
+    await flash_gemma_gptq_handle.health(300)
+    return flash_gemma_gptq_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_gemma_gptq(flash_gemma_gptq, ignore_logprob_response_snapshot):
+    response = await flash_gemma_gptq.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == ignore_logprob_response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_gemma_gptq_all_params(
+    flash_gemma_gptq, ignore_logprob_response_snapshot
+):
+    response = await flash_gemma_gptq.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == ignore_logprob_response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_gemma_gptq_load(
+    flash_gemma_gptq, generate_load, ignore_logprob_response_snapshot
+):
+    responses = await generate_load(
+        flash_gemma_gptq, "Test request", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+    assert responses == ignore_logprob_response_snapshot
diff --git a/integration-tests/models/test_flash_gpt2.py b/integration-tests/models/test_flash_gpt2.py
new file mode 100644
index 00000000..cd73d0a3
--- /dev/null
+++ b/integration-tests/models/test_flash_gpt2.py
@@ -0,0 +1,46 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_gpt2_handle(launcher):
+    with launcher("openai-community/gpt2", num_shard=2) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_gpt2(flash_gpt2_handle):
+    await flash_gpt2_handle.health(300)
+    return flash_gpt2_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_gpt2(flash_gpt2, response_snapshot):
+    response = await flash_gpt2.generate(
+        "What is deep learning?",
+        max_new_tokens=10,
+        decoder_input_details=True,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_gpt2_load(flash_gpt2, generate_load, response_snapshot):
+    responses = await generate_load(
+        flash_gpt2,
+        "What is deep learning?",
+        max_new_tokens=10,
+        n=4,
+    )
+
+    generated_texts = [r.generated_text for r in responses]
+
+    assert len(generated_texts) == 4
+    assert all(
+        [text == generated_texts[0] for text in generated_texts]
+    ), generated_texts
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_grammar_llama.py b/integration-tests/models/test_flash_grammar_llama.py
new file mode 100644
index 00000000..ce1cf787
--- /dev/null
+++ b/integration-tests/models/test_flash_grammar_llama.py
@@ -0,0 +1,150 @@
+import pytest
+import json
+
+from text_generation.types import GrammarType
+
+
+@pytest.fixture(scope="module")
+def flash_llama_grammar_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0", num_shard=2, disable_grammar_support=False
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_grammar(flash_llama_grammar_handle):
+    await flash_llama_grammar_handle.health(300)
+    return flash_llama_grammar_handle.client
+
+
+@pytest.mark.asyncio
+async def test_flash_llama_grammar(flash_llama_grammar, response_snapshot):
+    response = await flash_llama_grammar.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.skip
+@pytest.mark.asyncio
+async def test_flash_llama_grammar_regex(flash_llama_grammar, response_snapshot):
+    response = await flash_llama_grammar.generate(
+        "Whats Googles DNS",
+        max_new_tokens=10,
+        decoder_input_details=True,
+        seed=0,
+        grammar={
+            "type": GrammarType.Regex,  # "regex"
+            "value": "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)",
+        },
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response.generated_text == "42.1.1.101"
+    assert response == response_snapshot
+
+
+@pytest.mark.skip
+@pytest.mark.asyncio
+async def test_flash_llama_grammar_json(flash_llama_grammar, response_snapshot):
+    response = await flash_llama_grammar.generate(
+        "info: david holtz like trees and has two cats. ",
+        max_new_tokens=100,
+        decoder_input_details=True,
+        seed=0,
+        grammar={
+            "type": GrammarType.Json,  # "json"
+            "value": json.dumps(
+                {
+                    "type": "object",
+                    "$id": "https://example.com/person.schema.json",
+                    "$schema": "https://json-schema.org/draft/2020-12/schema",
+                    "title": "Person",
+                    "properties": {
+                        "firstName": {
+                            "type": "string",
+                            "description": "The person'''s first name.",
+                        },
+                        "lastName": {
+                            "type": "string",
+                            "description": "The person'''s last name.",
+                        },
+                        "hobby": {
+                            "description": "The person'''s hobby.",
+                            "type": "string",
+                        },
+                        "numCats": {
+                            "description": "The number of cats the person has.",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                    },
+                    "required": ["firstName", "lastName", "hobby", "numCats"],
+                }
+            ),
+        },
+    )
+
+    assert response.details.generated_tokens == 30
+    assert (
+        response.generated_text
+        == '{"firstName":"David","hobby":"Trees","lastName":"Holtz","numCats":2}'
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.skip
+@pytest.mark.asyncio
+async def test_flash_llama_grammar_load(
+    flash_llama_grammar, generate_load, response_snapshot
+):
+    responses = await generate_load(
+        flash_llama_grammar,
+        "name: david. email:  ",
+        max_new_tokens=10,
+        n=4,
+        stop_sequences=[".com"],
+        seed=0,
+        grammar={
+            "type": GrammarType.Regex,  # "regex"
+            "value": "[\\w-]+@([\\w-]+\\.)+[\\w-]+",  # email regex
+        },
+    )
+
+    assert len(responses) == 4
+
+    expected = "123456@gmail.com"
+
+    for response in responses:
+        assert response.generated_text == expected
+
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+    assert responses == response_snapshot
+
+
+# this is the same as the above test, but only fires off a single request
+# this is only to ensure that the parallel and single inference produce the same result
+@pytest.mark.skip
+@pytest.mark.asyncio
+async def test_flash_llama_grammar_single_load_instance(
+    flash_llama_grammar, generate_load, response_snapshot
+):
+    response = await flash_llama_grammar.generate(
+        "name: david. email:  ",
+        max_new_tokens=10,
+        stop_sequences=[".com"],
+        seed=0,
+        grammar={
+            "type": GrammarType.Regex,  # "regex"
+            "value": "[\\w-]+@([\\w-]+\\.)+[\\w-]+",  # email regex
+        },
+    )
+
+    # assert response.details.generated_tokens == 30
+    assert response.generated_text == "123456@gmail.com"
+
+    assert response == response_snapshot
diff --git a/integration-tests/models/test_flash_llama_exl2.py b/integration-tests/models/test_flash_llama_exl2.py
new file mode 100644
index 00000000..7169c999
--- /dev/null
+++ b/integration-tests/models/test_flash_llama_exl2.py
@@ -0,0 +1,76 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_llama_exl2_handle(launcher):
+    with launcher(
+        "turboderp/Llama-3-8B-Instruct-exl2",
+        revision="2.5bpw",
+        # Set max input length to avoid OOM due to extremely large
+        # scratch buffer.
+        max_input_length=1024,
+        num_shard=1,
+        quantize="exl2",
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_exl2(flash_llama_exl2_handle):
+    await flash_llama_exl2_handle.health(300)
+    return flash_llama_exl2_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_exl2(flash_llama_exl2, ignore_logprob_response_snapshot):
+    response = await flash_llama_exl2.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == ignore_logprob_response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_exl2_all_params(
+    flash_llama_exl2, ignore_logprob_response_snapshot
+):
+    response = await flash_llama_exl2.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert (
+        response.generated_text == 'Test request. The server responds with a "200 OK"'
+    )
+    assert response == ignore_logprob_response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_exl2_load(
+    flash_llama_exl2, generate_load, ignore_logprob_response_snapshot
+):
+    responses = await generate_load(
+        flash_llama_exl2, "Test request", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+    assert responses == ignore_logprob_response_snapshot
diff --git a/integration-tests/models/test_flash_llama_gptq.py b/integration-tests/models/test_flash_llama_gptq.py
index b87f054b..94a48e49 100644
--- a/integration-tests/models/test_flash_llama_gptq.py
+++ b/integration-tests/models/test_flash_llama_gptq.py
@@ -3,7 +3,9 @@ import pytest
 
 @pytest.fixture(scope="module")
 def flash_llama_gptq_handle(launcher):
-    with launcher("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq") as handle:
+    with launcher(
+        "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit", num_shard=2, quantize="gptq"
+    ) as handle:
         yield handle
 
 
@@ -13,6 +15,7 @@ async def flash_llama_gptq(flash_llama_gptq_handle):
     return flash_llama_gptq_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
@@ -24,6 +27,7 @@ async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
@@ -46,6 +50,7 @@ async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_llama_gptq_load(
diff --git a/integration-tests/models/test_flash_llama_marlin.py b/integration-tests/models/test_flash_llama_marlin.py
new file mode 100644
index 00000000..a89a1e41
--- /dev/null
+++ b/integration-tests/models/test_flash_llama_marlin.py
@@ -0,0 +1,66 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_llama_marlin_handle(launcher):
+    with launcher(
+        "neuralmagic/llama-2-7b-chat-marlin", num_shard=2, quantize="marlin"
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_marlin(flash_llama_marlin_handle):
+    await flash_llama_marlin_handle.health(300)
+    return flash_llama_marlin_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot):
+    response = await flash_llama_marlin.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_marlin_all_params(flash_llama_marlin, response_snapshot):
+    response = await flash_llama_marlin.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_marlin_load(
+    flash_llama_marlin, generate_load, response_snapshot
+):
+    responses = await generate_load(
+        flash_llama_marlin, "Test request", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_medusa.py b/integration-tests/models/test_flash_medusa.py
new file mode 100644
index 00000000..27db5665
--- /dev/null
+++ b/integration-tests/models/test_flash_medusa.py
@@ -0,0 +1,64 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_medusa_handle(launcher):
+    with launcher(
+        "FasterDecoding/medusa-vicuna-7b-v1.3", num_shard=2, revision="refs/pr/1"
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_medusa(flash_medusa_handle):
+    await flash_medusa_handle.health(300)
+    return flash_medusa_handle.client
+
+
+@pytest.mark.asyncio
+async def test_flash_medusa_simple(flash_medusa, response_snapshot):
+    response = await flash_medusa.generate(
+        "What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+async def test_flash_medusa_all_params(flash_medusa, response_snapshot):
+    response = await flash_medusa.generate(
+        "What is Deep Learning?",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+async def test_flash_medusa_load(flash_medusa, generate_load, response_snapshot):
+    responses = await generate_load(
+        flash_medusa, "What is Deep Learning?", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all(
+        [r.generated_text == responses[0].generated_text for r in responses]
+    ), f"{[r.generated_text for r in responses]}"
+    assert (
+        responses[0].generated_text == "\nDeep learning is a subset of machine learning"
+    )
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_mistral.py b/integration-tests/models/test_flash_mistral.py
new file mode 100644
index 00000000..52b51928
--- /dev/null
+++ b/integration-tests/models/test_flash_mistral.py
@@ -0,0 +1,61 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_mistral_handle(launcher):
+    with launcher("mistralai/Mistral-7B-Instruct-v0.1") as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_mistral(flash_mistral_handle):
+    await flash_mistral_handle.health(300)
+    return flash_mistral_handle.client
+
+
+@pytest.mark.asyncio
+async def test_flash_mistral(flash_mistral, response_snapshot):
+    response = await flash_mistral.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response.generated_text == ": Let n = 10 - 1"
+    assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+async def test_flash_mistral_all_params(flash_mistral, response_snapshot):
+    response = await flash_mistral.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+async def test_flash_mistral_load(flash_mistral, generate_load, response_snapshot):
+    responses = await generate_load(
+        flash_mistral, "Test request", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all(
+        [r.generated_text == responses[0].generated_text for r in responses]
+    ), f"{[r.generated_text  for r in responses]}"
+    assert responses[0].generated_text == ": Let n = 10 - 1"
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_neox.py b/integration-tests/models/test_flash_neox.py
index 0289c61d..31848dae 100644
--- a/integration-tests/models/test_flash_neox.py
+++ b/integration-tests/models/test_flash_neox.py
@@ -13,6 +13,7 @@ async def flash_neox(flash_neox_handle):
     return flash_neox_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.skip
 @pytest.mark.asyncio
 async def test_flash_neox(flash_neox, response_snapshot):
@@ -26,6 +27,7 @@ async def test_flash_neox(flash_neox, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.skip
 @pytest.mark.asyncio
 async def test_flash_neox_load(flash_neox, generate_load, response_snapshot):
diff --git a/integration-tests/models/test_flash_neox_sharded.py b/integration-tests/models/test_flash_neox_sharded.py
index 8a491915..1f1e7225 100644
--- a/integration-tests/models/test_flash_neox_sharded.py
+++ b/integration-tests/models/test_flash_neox_sharded.py
@@ -13,6 +13,7 @@ async def flash_neox_sharded(flash_neox_sharded_handle):
     return flash_neox_sharded_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_neox(flash_neox_sharded, response_snapshot):
     response = await flash_neox_sharded.generate(
@@ -25,6 +26,7 @@ async def test_flash_neox(flash_neox_sharded, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_neox_load(flash_neox_sharded, generate_load, response_snapshot):
     responses = await generate_load(
diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py
new file mode 100644
index 00000000..3ead3150
--- /dev/null
+++ b/integration-tests/models/test_flash_pali_gemma.py
@@ -0,0 +1,64 @@
+import pytest
+import requests
+import io
+import base64
+
+
+@pytest.fixture(scope="module")
+def flash_pali_gemma_handle(launcher):
+    with launcher(
+        "google/paligemma-3b-pt-224",
+        num_shard=1,
+        revision="float16",
+        max_input_length=4000,
+        max_total_tokens=4096,
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_pali_gemma(flash_pali_gemma_handle):
+    await flash_pali_gemma_handle.health(300)
+    return flash_pali_gemma_handle.client
+
+
+def get_chicken():
+    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
+def get_cow_beach():
+    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
+    cow = get_cow_beach()
+    inputs = f"![]({cow})Where is the cow standing?\n"
+    response = await flash_pali_gemma.generate(inputs, max_new_tokens=20)
+
+    assert response.generated_text == "beach"
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot):
+    chicken = get_chicken()
+    cow_beach = get_cow_beach()
+    response = await flash_pali_gemma.generate(
+        f"caption![]({chicken})![]({cow_beach})\n",
+        max_new_tokens=20,
+    )
+    # Is PaliGemma not able to handle two separate images? At least we
+    # get output showing that both images are used.
+    assert (
+        response.generated_text == "image result for chicken on the beach"
+    ), f"{repr(response.generated_text)}"
+    assert response == response_snapshot
diff --git a/integration-tests/models/test_flash_phi.py b/integration-tests/models/test_flash_phi.py
new file mode 100644
index 00000000..73bb5edc
--- /dev/null
+++ b/integration-tests/models/test_flash_phi.py
@@ -0,0 +1,63 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_phi_handle(launcher):
+    with launcher("microsoft/phi-2", num_shard=1) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_phi(flash_phi_handle):
+    await flash_phi_handle.health(300)
+    return flash_phi_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_phi(flash_phi, response_snapshot):
+    response = await flash_phi.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response.generated_text == ': {request}")\n        response = self'
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_phi_all_params(flash_phi, response_snapshot):
+    response = await flash_phi.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["network"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 6
+    assert response.generated_text == "Test request to send data over a network"
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_phi_load(flash_phi, generate_load, response_snapshot):
+    responses = await generate_load(flash_phi, "Test request", max_new_tokens=10, n=4)
+
+    assert len(responses) == 4
+    assert all(
+        [r.generated_text == responses[0].generated_text for r in responses]
+    ), f"{[r.generated_text  for r in responses]}"
+    assert responses[0].generated_text == ': {request}")\n        response = self'
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_qwen2.py b/integration-tests/models/test_flash_qwen2.py
new file mode 100644
index 00000000..c64f8732
--- /dev/null
+++ b/integration-tests/models/test_flash_qwen2.py
@@ -0,0 +1,62 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_qwen2_handle(launcher):
+    with launcher("Qwen/Qwen1.5-0.5B") as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_qwen2(flash_qwen2_handle):
+    await flash_qwen2_handle.health(300)
+    return flash_qwen2_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_qwen2(flash_qwen2, response_snapshot):
+    response = await flash_qwen2.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response.generated_text == "\n# Create a request\nrequest = requests.get"
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_qwen2_all_params(flash_qwen2, response_snapshot):
+    response = await flash_qwen2.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_flash_qwen2_load(flash_qwen2, generate_load, response_snapshot):
+    responses = await generate_load(flash_qwen2, "Test request", max_new_tokens=10, n=4)
+
+    assert len(responses) == 4
+    assert all(
+        [r.generated_text == responses[0].generated_text for r in responses]
+    ), f"{[r.generated_text  for r in responses]}"
+    assert responses[0].generated_text == "\n# Create a request\nrequest = requests.get"
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_santacoder.py b/integration-tests/models/test_flash_santacoder.py
index 0f005f15..96a36aba 100644
--- a/integration-tests/models/test_flash_santacoder.py
+++ b/integration-tests/models/test_flash_santacoder.py
@@ -13,6 +13,7 @@ async def flash_santacoder(flash_santacoder_handle):
     return flash_santacoder_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_santacoder(flash_santacoder, response_snapshot):
     response = await flash_santacoder.generate(
@@ -23,6 +24,7 @@ async def test_flash_santacoder(flash_santacoder, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_flash_santacoder_load(
     flash_santacoder, generate_load, response_snapshot
diff --git a/integration-tests/models/test_flash_starcoder.py b/integration-tests/models/test_flash_starcoder.py
index 64e8b27c..dc5a8a53 100644
--- a/integration-tests/models/test_flash_starcoder.py
+++ b/integration-tests/models/test_flash_starcoder.py
@@ -13,6 +13,7 @@ async def flash_starcoder(flash_starcoder_handle):
     return flash_starcoder_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_starcoder(flash_starcoder, response_snapshot):
@@ -24,6 +25,7 @@ async def test_flash_starcoder(flash_starcoder, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_starcoder_default_params(flash_starcoder, response_snapshot):
@@ -40,6 +42,7 @@ async def test_flash_starcoder_default_params(flash_starcoder, response_snapshot
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 @pytest.mark.private
 async def test_flash_starcoder_load(flash_starcoder, generate_load, response_snapshot):
diff --git a/integration-tests/models/test_flash_starcoder2.py b/integration-tests/models/test_flash_starcoder2.py
new file mode 100644
index 00000000..88341cfe
--- /dev/null
+++ b/integration-tests/models/test_flash_starcoder2.py
@@ -0,0 +1,58 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def flash_starcoder2_handle(launcher):
+    with launcher("bigcode/starcoder2-3b", num_shard=2) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_starcoder2(flash_starcoder2_handle):
+    await flash_starcoder2_handle.health(300)
+    return flash_starcoder2_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_starcoder2(flash_starcoder2, response_snapshot):
+    response = await flash_starcoder2.generate(
+        "def print_hello", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_starcoder2_default_params(flash_starcoder2, response_snapshot):
+    response = await flash_starcoder2.generate(
+        "def print_hello",
+        max_new_tokens=60,
+        temperature=0.2,
+        top_p=0.95,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 60
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_starcoder2_load(
+    flash_starcoder2, generate_load, response_snapshot
+):
+    responses = await generate_load(
+        flash_starcoder2, "def print_hello", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_flash_starcoder_gptq.py b/integration-tests/models/test_flash_starcoder_gptq.py
index 608101fb..f1007d6e 100644
--- a/integration-tests/models/test_flash_starcoder_gptq.py
+++ b/integration-tests/models/test_flash_starcoder_gptq.py
@@ -13,22 +13,22 @@ async def flash_starcoder_gptq(flash_starcoder_gptq_handle):
     return flash_starcoder_gptq_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
-@pytest.mark.private
-async def test_flash_starcoder_gptq(flash_starcoder_gptq, response_snapshot):
+async def test_flash_starcoder_gptq(flash_starcoder_gptq, generous_response_snapshot):
     response = await flash_starcoder_gptq.generate(
         "def geometric_mean(L: List[float]):",
         max_new_tokens=20,
         decoder_input_details=True,
     )
     assert response.details.generated_tokens == 20
-    assert response == response_snapshot
+    assert response == generous_response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
-@pytest.mark.private
 async def test_flash_starcoder_gptq_default_params(
-    flash_starcoder_gptq, response_snapshot
+    flash_starcoder_gptq, generous_response_snapshot
 ):
     response = await flash_starcoder_gptq.generate(
         "def geometric_mean(L: List[float]):",
@@ -39,13 +39,13 @@ async def test_flash_starcoder_gptq_default_params(
         seed=0,
     )
     assert response.details.generated_tokens == 20
-    assert response == response_snapshot
+    assert response == generous_response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
-@pytest.mark.private
 async def test_flash_starcoder_gptq_load(
-    flash_starcoder_gptq, generate_load, response_snapshot
+    flash_starcoder_gptq, generate_load, generous_response_snapshot
 ):
     responses = await generate_load(
         flash_starcoder_gptq,
@@ -57,4 +57,4 @@ async def test_flash_starcoder_gptq_load(
     assert len(responses) == 4
     assert all([r.generated_text == responses[0].generated_text for r in responses])
 
-    assert responses == response_snapshot
+    assert responses == generous_response_snapshot
diff --git a/integration-tests/models/test_grammar_llama.py b/integration-tests/models/test_grammar_llama.py
new file mode 100644
index 00000000..4face9e1
--- /dev/null
+++ b/integration-tests/models/test_grammar_llama.py
@@ -0,0 +1,71 @@
+import pytest
+import json
+
+from text_generation.types import GrammarType
+
+
+@pytest.fixture(scope="module")
+def non_flash_llama_grammar_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        num_shard=1,
+        disable_grammar_support=False,
+        use_flash_attention=False,
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def non_flash_llama_grammar(non_flash_llama_grammar_handle):
+    await non_flash_llama_grammar_handle.health(300)
+    return non_flash_llama_grammar_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.skip
+@pytest.mark.asyncio
+async def test_non_flash_llama_grammar_json(non_flash_llama_grammar, response_snapshot):
+    response = await non_flash_llama_grammar.generate(
+        "info: david holtz like trees and has two cats. ",
+        max_new_tokens=100,
+        decoder_input_details=True,
+        seed=0,
+        grammar={
+            "type": GrammarType.Json,
+            "value": json.dumps(
+                {
+                    "type": "object",
+                    "$id": "https://example.com/person.schema.json",
+                    "$schema": "https://json-schema.org/draft/2020-12/schema",
+                    "title": "Person",
+                    "properties": {
+                        "firstName": {
+                            "type": "string",
+                            "description": "The person'''s first name.",
+                        },
+                        "lastName": {
+                            "type": "string",
+                            "description": "The person'''s last name.",
+                        },
+                        "hobby": {
+                            "description": "The person'''s hobby.",
+                            "type": "string",
+                        },
+                        "numCats": {
+                            "description": "The number of cats the person has.",
+                            "type": "integer",
+                            "minimum": 0,
+                        },
+                    },
+                    "required": ["firstName", "lastName", "hobby", "numCats"],
+                }
+            ),
+        },
+    )
+
+    assert response.details.generated_tokens == 30
+    assert (
+        response.generated_text
+        == '{"firstName":"David","hobby":"Trees","lastName":"Holtz","numCats":2}'
+    )
+    assert response == response_snapshot
diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py
new file mode 100644
index 00000000..ea25fa1c
--- /dev/null
+++ b/integration-tests/models/test_grammar_response_format_llama.py
@@ -0,0 +1,103 @@
+import pytest
+import requests
+from pydantic import BaseModel
+from typing import List
+
+
+@pytest.fixture(scope="module")
+def llama_grammar_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        num_shard=1,
+        disable_grammar_support=False,
+        use_flash_attention=False,
+        max_batch_prefill_tokens=3000,
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def llama_grammar(llama_grammar_handle):
+    await llama_grammar_handle.health(300)
+    return llama_grammar_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot):
+
+    class Weather(BaseModel):
+        unit: str
+        temperature: List[int]
+
+    # send the request
+    response = requests.post(
+        f"{llama_grammar.base_url}/v1/chat/completions",
+        headers=llama_grammar.headers,
+        json={
+            "model": "tgi",
+            "messages": [
+                {
+                    "role": "system",
+                    "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}",
+                },
+                {
+                    "role": "user",
+                    "content": "What's the weather like the next 3 days in San Francisco, CA?",
+                },
+            ],
+            "seed": 42,
+            "max_tokens": 500,
+            "response_format": {"type": "json_object", "value": Weather.schema()},
+        },
+    )
+
+    chat_completion = response.json()
+    called = chat_completion["choices"][0]["message"]["content"]
+
+    assert response.status_code == 200
+    assert (
+        called
+        == '{\n  "temperature": [\n    35,\n    34,\n    36\n  ],\n  "unit": "°c"\n}'
+    )
+    assert chat_completion == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_grammar_response_format_llama_error_if_tools_not_installed(
+    llama_grammar,
+):
+    class Weather(BaseModel):
+        unit: str
+        temperature: List[int]
+
+    # send the request
+    response = requests.post(
+        f"{llama_grammar.base_url}/v1/chat/completions",
+        headers=llama_grammar.headers,
+        json={
+            "model": "tgi",
+            "messages": [
+                {
+                    "role": "system",
+                    "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}",
+                },
+                {
+                    "role": "user",
+                    "content": "What's the weather like the next 3 days in San Francisco, CA?",
+                },
+            ],
+            "seed": 42,
+            "max_tokens": 500,
+            "tools": [],
+            "response_format": {"type": "json_object", "value": Weather.schema()},
+        },
+    )
+
+    # 422 means the server was unable to process the request because it contains invalid data.
+    assert response.status_code == 422
+    assert response.json() == {
+        "error": "Grammar and tools are mutually exclusive",
+        "error_type": "grammar and tools",
+    }
diff --git a/integration-tests/models/test_idefics.py b/integration-tests/models/test_idefics.py
index 5659dd5c..b7725f0b 100644
--- a/integration-tests/models/test_idefics.py
+++ b/integration-tests/models/test_idefics.py
@@ -1,10 +1,11 @@
 import pytest
+import base64
 
 
 @pytest.fixture(scope="module")
 def idefics_handle(launcher):
     with launcher(
-        "HuggingFaceM4/idefics-9b-instruct", num_shard=2
+        "HuggingFaceM4/idefics-9b-instruct", num_shard=2, dtype="float16"
     ) as handle:
         yield handle
 
@@ -15,29 +16,67 @@ async def idefics(idefics_handle):
     return idefics_handle.client
 
 
+# TODO fix the server parser to count inline image tokens correctly
+def get_chicken():
+    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
+def get_cow_beach():
+    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
 @pytest.mark.asyncio
 async def test_idefics(idefics, response_snapshot):
+    chicken = get_chicken()
     response = await idefics.generate(
-        "User:![](https://temp-5681.s3.us-west-2.amazonaws.com/chicken_on_money.png)Can you tell me a very short story based on the image?",
+        f"User:![]({chicken})Can you tell me a very short story based on the image?",
         max_new_tokens=10,
         decoder_input_details=True,
     )
 
     assert response.details.generated_tokens == 10
+    assert (
+        response.generated_text == " \nAssistant: A rooster stands"
+    ), f"{repr(response.generated_text)}"
     assert response == response_snapshot
 
 
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_idefics_two_images(idefics, response_snapshot):
+    chicken = get_chicken()
+    cow_beach = get_cow_beach()
+    response = await idefics.generate(
+        f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
+        max_new_tokens=20,
+    )
+    assert (
+        response.generated_text == " The cow and chicken are on a beach."
+    ), f"{repr(response.generated_text)}"
+    assert response == response_snapshot
+
+
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_idefics_load(idefics, generate_load, response_snapshot):
+    chicken = get_chicken()
     responses = await generate_load(
         idefics,
-        "User:![](https://temp-5681.s3.us-west-2.amazonaws.com/chicken_on_money.png)Can you tell me a very short story based on the image?",
+        f"User:![]({chicken})Can you tell me a very short story based on the image?",
         max_new_tokens=10,
         n=4,
     )
 
     generated_texts = [r.generated_text for r in responses]
 
+    assert (
+        generated_texts[0] == " \nAssistant: A rooster stands"
+    ), f"{repr(generated_texts[0])}"  # report the actual text on failure
     assert len(generated_texts) == 4
     assert generated_texts, all(
         [text == generated_texts[0] for text in generated_texts]
diff --git a/integration-tests/models/test_idefics2.py b/integration-tests/models/test_idefics2.py
new file mode 100644
index 00000000..c5f48da3
--- /dev/null
+++ b/integration-tests/models/test_idefics2.py
@@ -0,0 +1,104 @@
+import pytest
+import base64
+
+
+# TODO fix the server parser to count inline image tokens correctly
+def get_chicken():
+    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
+def get_cow_beach():
+    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
+@pytest.fixture(scope="module")
+def flash_idefics2_next_handle(launcher):
+    with launcher(
+        "HuggingFaceM4/idefics2-8b",
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_idefics2_next(flash_idefics2_next_handle):
+    await flash_idefics2_next_handle.health(300)
+    return flash_idefics2_next_handle.client
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot):
+    chicken = get_chicken()
+    response = await flash_idefics2_next.generate(
+        f"User:![]({chicken})Write me a short story<end_of_utterance> \nAssistant:",
+        max_new_tokens=10,
+    )
+    assert (
+        response.generated_text == " A chicken is sitting on a pile of money."
+    ), f"{repr(response.generated_text)}"
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot):
+    chicken = get_chicken()
+    cow_beach = get_cow_beach()
+    response = await flash_idefics2_next.generate(
+        f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
+        max_new_tokens=20,
+    )
+    assert (
+        response.generated_text
+        == " The cow is standing on the beach and the chicken is sitting on a pile of money."
+    ), f"{repr(response.generated_text)}"
+    assert response.details.generated_tokens == 19
+    assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snapshot):
+    response = await flash_idefics2_next.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_idefics2_next_load(
+    flash_idefics2_next, generate_load, response_snapshot
+):
+    chicken = get_chicken()
+    responses = await generate_load(
+        flash_idefics2_next,
+        f"User:![]({chicken})Write me a short story<end_of_utterance> \nAssistant:",
+        max_new_tokens=10,
+        n=4,
+    )
+    generated_texts = [r.generated_text for r in responses]
+    assert generated_texts[0] == " A chicken is sitting on a pile of money."
+    assert len(generated_texts) == 4
+    assert all([r.generated_text == generated_texts[0] for r in responses])
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_llava_next.py b/integration-tests/models/test_llava_next.py
new file mode 100644
index 00000000..ea277d71
--- /dev/null
+++ b/integration-tests/models/test_llava_next.py
@@ -0,0 +1,87 @@
+import pytest
+import base64
+
+
+# TODO: fix the server parser to count inline image tokens correctly
+def get_chicken():  # -> "data:image/png;base64,..." URI built from the chicken test image
+    with open("integration-tests/images/chicken_on_money.png", "rb") as png:
+        b64_text = base64.b64encode(png.read()).decode("utf-8")
+    return f"data:image/png;base64,{b64_text}"
+
+
+@pytest.fixture(scope="module")
+def flash_llava_next_handle(launcher):
+    with launcher(
+        "llava-hf/llava-v1.6-mistral-7b-hf",
+        num_shard=4,
+        max_input_length=4000,
+        max_total_tokens=4096,  # 96 more than max_input_length
+    ) as handle:
+        yield handle  # scope="module": one launched handle is shared by the tests in this file
+
+
+@pytest.fixture(scope="module")
+async def flash_llava_next(flash_llava_next_handle):
+    await flash_llava_next_handle.health(300)  # presumably waits for readiness, 300 s timeout - confirm
+    return flash_llava_next_handle.client  # the tests call .generate() on this client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llava_next_simple(flash_llava_next, response_snapshot):
+    chicken = get_chicken()  # data URI embedded below as a markdown image
+    response = await flash_llava_next.generate(
+        f"User:![]({chicken})Can you tell me a very short story based on the image?",
+        max_new_tokens=10,
+    )
+    assert (
+        response.generated_text == "\n\nOnce upon a time, there was a"
+    ), f"{repr(response.generated_text)}"  # on failure, show the actual text with escapes visible
+    assert response.details.generated_tokens == 10  # same value as max_new_tokens
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llava_next_all_params(flash_llava_next, response_snapshot):
+    response = await flash_llava_next.generate(
+        "Test request",  # text-only prompt (no inline image)
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,  # fixed seed; presumably keeps the sampled output reproducible - confirm
+    )
+
+    assert response.details.generated_tokens == 6  # fewer than max_new_tokens=10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llava_next_load(
+    flash_llava_next, generate_load, response_snapshot
+):
+    chicken = get_chicken()  # data URI embedded below as a markdown image
+    responses = await generate_load(
+        flash_llava_next,
+        f"User:![]({chicken})Can you tell me a very short story based on the image?",
+        max_new_tokens=10,
+        n=4,  # number of responses expected back (asserted below)
+    )
+    generated_texts = [r.generated_text for r in responses]
+    assert len(generated_texts) == 4  # check the count before indexing [0]
+    assert generated_texts[0] == "\n\nOnce upon a time, there was a"
+    assert all(text == generated_texts[0] for text in generated_texts)
+
+    assert responses == response_snapshot
diff --git a/integration-tests/models/test_mamba.py b/integration-tests/models/test_mamba.py
new file mode 100644
index 00000000..bc946de8
--- /dev/null
+++ b/integration-tests/models/test_mamba.py
@@ -0,0 +1,68 @@
+import pytest
+
+
+@pytest.fixture(scope="module")
+def fused_kernel_mamba_handle(launcher):
+    with launcher("state-spaces/mamba-130m", num_shard=1) as handle:
+        yield handle  # scope="module": one launched handle is shared by the tests in this file
+
+
+@pytest.fixture(scope="module")
+async def fused_kernel_mamba(fused_kernel_mamba_handle):
+    await fused_kernel_mamba_handle.health(300)  # presumably waits for readiness, 300 s timeout - confirm
+    return fused_kernel_mamba_handle.client  # the tests call .generate() on this client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_mamba(fused_kernel_mamba, response_snapshot):
+    response = await fused_kernel_mamba.generate(
+        "What is Deep Learning?", max_new_tokens=10
+    )
+
+    assert response.details.generated_tokens == 10  # same value as max_new_tokens
+    assert response.generated_text == "\n\nDeep learning is a new type of machine"
+    assert response == response_snapshot  # full response is also compared to the snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_mamba_all_params(fused_kernel_mamba, response_snapshot):
+    response = await fused_kernel_mamba.generate(
+        "blue, red, yellow, ",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,  # the expected text below starts with the prompt
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,  # fixed seed; presumably keeps the sampled output reproducible - confirm
+    )
+
+    assert response.details.generated_tokens == 10  # same value as max_new_tokens
+    assert (
+        response.generated_text
+        == "blue, red, yellow, \nand blue colors. A number of different color"
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+async def test_mamba_load(
+    fused_kernel_mamba, generate_load, generous_response_snapshot
+):
+    responses = await generate_load(
+        fused_kernel_mamba, "What is Deep Learning?", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4  # matches n=4 above
+    assert all(r.generated_text == responses[0].generated_text for r in responses)
+    assert responses[0].generated_text == "\n\nDeep learning is a new type of machine"
+
+    assert responses == generous_response_snapshot  # not response_snapshot; presumably looser - confirm
diff --git a/integration-tests/models/test_mpt.py b/integration-tests/models/test_mpt.py
index d58a8c5a..1832910a 100644
--- a/integration-tests/models/test_mpt.py
+++ b/integration-tests/models/test_mpt.py
@@ -13,6 +13,7 @@ async def mpt_sharded(mpt_sharded_handle):
     return mpt_sharded_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_mpt(mpt_sharded, response_snapshot):
     response = await mpt_sharded.generate(
@@ -29,6 +30,7 @@ async def test_mpt(mpt_sharded, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_mpt_load(mpt_sharded, generate_load, response_snapshot):
     responses = await generate_load(
diff --git a/integration-tests/models/test_mt0_base.py b/integration-tests/models/test_mt0_base.py
index 12f23e4c..e53d8ed4 100644
--- a/integration-tests/models/test_mt0_base.py
+++ b/integration-tests/models/test_mt0_base.py
@@ -13,6 +13,7 @@ async def mt0_base(mt0_base_handle):
     return mt0_base_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_mt0_base(mt0_base, response_snapshot):
     response = await mt0_base.generate(
@@ -27,6 +28,7 @@ async def test_mt0_base(mt0_base, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_mt0_base_all_params(mt0_base, response_snapshot):
     response = await mt0_base.generate(
@@ -45,10 +47,11 @@ async def test_mt0_base_all_params(mt0_base, response_snapshot):
         seed=0,
     )
 
-    assert response.details.generated_tokens == 9
+    assert response.details.generated_tokens == 10
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_mt0_base_load(mt0_base, generate_load, response_snapshot):
     responses = await generate_load(
diff --git a/integration-tests/models/test_neox.py b/integration-tests/models/test_neox.py
index 7b88f86a..ee60441d 100644
--- a/integration-tests/models/test_neox.py
+++ b/integration-tests/models/test_neox.py
@@ -15,6 +15,7 @@ async def neox(neox_handle):
     return neox_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.skip
 @pytest.mark.asyncio
 async def test_neox(neox, response_snapshot):
@@ -28,6 +29,7 @@ async def test_neox(neox, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.skip
 @pytest.mark.asyncio
 async def test_neox_load(neox, generate_load, response_snapshot):
diff --git a/integration-tests/models/test_neox_sharded.py b/integration-tests/models/test_neox_sharded.py
index 8cee8765..a69227c9 100644
--- a/integration-tests/models/test_neox_sharded.py
+++ b/integration-tests/models/test_neox_sharded.py
@@ -15,6 +15,7 @@ async def neox_sharded(neox_sharded_handle):
     return neox_sharded_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.skip
 @pytest.mark.asyncio
 async def test_neox(neox_sharded, response_snapshot):
@@ -28,6 +29,7 @@ async def test_neox(neox_sharded, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.skip
 @pytest.mark.asyncio
 async def test_neox_load(neox_sharded, generate_load, response_snapshot):
diff --git a/integration-tests/models/test_t5_sharded.py b/integration-tests/models/test_t5_sharded.py
index 7c288b23..24003024 100644
--- a/integration-tests/models/test_t5_sharded.py
+++ b/integration-tests/models/test_t5_sharded.py
@@ -3,7 +3,7 @@ import pytest
 
 @pytest.fixture(scope="module")
 def t5_sharded_handle(launcher):
-    with launcher("google/flan-t5-xxl", num_shard=2) as handle:
+    with launcher("google/flan-t5-xxl", num_shard=4) as handle:
         yield handle
 
 
@@ -13,6 +13,7 @@ async def t5_sharded(t5_sharded_handle):
     return t5_sharded_handle.client
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_t5_sharded(t5_sharded, response_snapshot):
     response = await t5_sharded.generate(
@@ -24,6 +25,7 @@ async def test_t5_sharded(t5_sharded, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.release
 @pytest.mark.asyncio
 async def test_t5_sharded_load(t5_sharded, generate_load, response_snapshot):
     responses = await generate_load(
diff --git a/integration-tests/models/test_tools_llama.py b/integration-tests/models/test_tools_llama.py
new file mode 100644
index 00000000..0af3f66a
--- /dev/null
+++ b/integration-tests/models/test_tools_llama.py
@@ -0,0 +1,259 @@
+import pytest
+import json
+
+from text_generation.types import GrammarType
+
+
+@pytest.fixture(scope="module")
+def flash_llama_grammar_tools_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0", num_shard=2, disable_grammar_support=False
+    ) as handle:
+        yield handle  # scope="module": one launched handle is shared by the tests in this file
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_grammar_tools(flash_llama_grammar_tools_handle):
+    await flash_llama_grammar_tools_handle.health(300)  # presumably waits for readiness, 300 s timeout - confirm
+    return flash_llama_grammar_tools_handle.client  # the tests call .chat() on this client
+
+
+# tools to be used in the following tests
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "format": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                        "description": "The temperature unit to use. Infer this from the users location.",
+                    },
+                },
+                "required": ["location", "format"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_n_day_weather_forecast",
+            "description": "Get an N-day weather forecast",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "format": {
+                        "type": "string",
+                        "enum": ["celsius", "fahrenheit"],
+                        "description": "The temperature unit to use. Infer this from the users location.",
+                    },
+                    "num_days": {
+                        "type": "integer",
+                        "description": "The number of days to forecast",
+                    },
+                },
+                "required": ["location", "format", "num_days"],
+            },
+        },
+    },
+]
+
+
+@pytest.mark.skip(reason="Takes too long to run")
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_grammar_tools(flash_llama_grammar_tools, response_snapshot):
+    response = await flash_llama_grammar_tools.chat(
+        max_tokens=100,
+        seed=1,
+        tools=tools,  # module-level tool list; no tool_choice is passed in this test
+        presence_penalty=-1.1,
+        messages=[
+            {
+                "role": "system",
+                "content": "Youre a helpful assistant! Answer the users question best you can.",
+            },
+            {
+                "role": "user",
+                "content": "What is the weather like in Brooklyn, New York?",
+            },
+        ],
+    )
+    assert response.choices[0].message.content is None  # a tool call is expected instead of text
+    assert response.choices[0].message.tool_calls == [
+        {
+            "id": 0,
+            "type": "function",
+            "function": {
+                "description": None,
+                "name": "get_current_weather",
+                "arguments": {"format": "celsius", "location": "New York, NY"},
+            },
+        }
+    ]
+    assert response == response_snapshot
+
+
+@pytest.mark.skip(reason="Takes too long to run")
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_grammar_tools_auto(
+    flash_llama_grammar_tools, response_snapshot
+):
+    response = await flash_llama_grammar_tools.chat(
+        max_tokens=100,
+        seed=1,
+        tools=tools,
+        tool_choice="auto",  # same request as test_flash_llama_grammar_tools plus an explicit "auto"
+        presence_penalty=-1.1,
+        messages=[
+            {
+                "role": "system",
+                "content": "Youre a helpful assistant! Answer the users question best you can.",
+            },
+            {
+                "role": "user",
+                "content": "What is the weather like in Brooklyn, New York?",
+            },
+        ],
+    )
+    assert response.choices[0].message.content is None  # a tool call is expected instead of text
+    assert response.choices[0].message.tool_calls == [
+        {
+            "id": 0,
+            "type": "function",
+            "function": {
+                "description": None,
+                "name": "get_current_weather",
+                "arguments": {"format": "celsius", "location": "New York, NY"},
+            },
+        }
+    ]
+
+    assert response == response_snapshot
+
+
+@pytest.mark.skip(reason="Takes too long to run")
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_grammar_tools_choice(
+    flash_llama_grammar_tools, response_snapshot
+):
+    response = await flash_llama_grammar_tools.chat(
+        max_tokens=100,
+        seed=1,
+        tools=tools,
+        tool_choice="get_current_weather",  # names one function from the module-level tools list
+        presence_penalty=-1.1,
+        messages=[
+            {
+                "role": "system",
+                "content": "Youre a helpful assistant! Answer the users question best you can.",
+            },
+            {
+                "role": "user",
+                "content": "What is the weather like in Brooklyn, New York?",
+            },
+        ],
+    )
+    assert response.choices[0].message.content is None  # a tool call is expected instead of text
+    assert response.choices[0].message.tool_calls == [
+        {
+            "id": 0,
+            "type": "function",
+            "function": {
+                "description": None,
+                "name": "get_current_weather",
+                "arguments": {"format": "celsius", "location": "New York, NY"},
+            },
+        }
+    ]
+
+    assert response == response_snapshot
+
+
+@pytest.mark.skip(reason="Takes too long to run")
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_grammar_tools_stream(
+    flash_llama_grammar_tools, response_snapshot
+):
+    responses = await flash_llama_grammar_tools.chat(
+        max_tokens=100,
+        seed=1,
+        tools=tools,
+        tool_choice="get_current_weather",  # names one function from the module-level tools list
+        presence_penalty=-1.1,
+        messages=[
+            {
+                "role": "system",
+                "content": "Youre a helpful assistant! Answer the users question best you can.",
+            },
+            {
+                "role": "user",
+                "content": "What is the weather like in Paris, France?",
+            },
+        ],
+        stream=True,  # the result is consumed with `async for` below
+    )
+
+    count, response = 0, None  # pre-bind: an empty stream must fail an assert, not raise NameError
+    async for response in responses:
+        count += 1
+
+    assert count == 38  # number of streamed items received
+    assert response == response_snapshot  # only the last streamed item is compared
+
+
+@pytest.mark.skip(reason="Takes too long to run")
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_grammar_tools_insufficient_information(
+    flash_llama_grammar_tools, response_snapshot
+):
+    responses = await flash_llama_grammar_tools.chat(
+        max_tokens=100,
+        seed=8,
+        tools=tools,
+        tool_choice="auto",
+        messages=[
+            {
+                "role": "system",
+                "content": "ONLY RESPOND IF THE USER ASKS A WEATHER RELATED QUESTION",
+            },
+            {
+                "role": "user",
+                "content": "Tell me a story about 3 sea creatures",  # deliberately not about weather
+            },
+        ],
+        stream=False,
+    )
+
+    assert responses.choices[0].message.content is None  # a tool call is expected instead of text
+    assert responses.choices[0].message.tool_calls == [
+        {
+            "function": {
+                "arguments": {
+                    "error": "Cannot get current weather forecast from specified location and temperature unit. Please try again with different options."
+                },
+                "description": None,
+                "name": "notify_error",  # not in the module-level tools list; presumably added server-side - confirm
+            },
+            "id": 0,
+            "type": "function",
+        }
+    ]
+
+    assert responses == response_snapshot
diff --git a/integration-tests/poetry.lock b/integration-tests/poetry.lock
index e156c161..3af99942 100644
--- a/integration-tests/poetry.lock
+++ b/integration-tests/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -122,6 +122,17 @@ files = [
 [package.dependencies]
 frozenlist = ">=1.1.0"
 
+[[package]]
+name = "annotated-types"
+version = "0.6.0"
+description = "Reusable constraint types to use with typing.Annotated"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"},
+    {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"},
+]
+
 [[package]]
 name = "async-timeout"
 version = "4.0.3"
@@ -590,55 +601,113 @@ testing = ["pytest", "pytest-benchmark"]
 
 [[package]]
 name = "pydantic"
-version = "1.10.12"
-description = "Data validation and settings management using python type hints"
+version = "2.6.4"
+description = "Data validation using Python type hints"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "pydantic-1.10.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1fcb59f2f355ec350073af41d927bf83a63b50e640f4dbaa01053a28b7a7718"},
-    {file = "pydantic-1.10.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7ccf02d7eb340b216ec33e53a3a629856afe1c6e0ef91d84a4e6f2fb2ca70fe"},
-    {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fb2aa3ab3728d950bcc885a2e9eff6c8fc40bc0b7bb434e555c215491bcf48b"},
-    {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:771735dc43cf8383959dc9b90aa281f0b6092321ca98677c5fb6125a6f56d58d"},
-    {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca48477862372ac3770969b9d75f1bf66131d386dba79506c46d75e6b48c1e09"},
-    {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a5e7add47a5b5a40c49b3036d464e3c7802f8ae0d1e66035ea16aa5b7a3923ed"},
-    {file = "pydantic-1.10.12-cp310-cp310-win_amd64.whl", hash = "sha256:e4129b528c6baa99a429f97ce733fff478ec955513630e61b49804b6cf9b224a"},
-    {file = "pydantic-1.10.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0d191db0f92dfcb1dec210ca244fdae5cbe918c6050b342d619c09d31eea0cc"},
-    {file = "pydantic-1.10.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:795e34e6cc065f8f498c89b894a3c6da294a936ee71e644e4bd44de048af1405"},
-    {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69328e15cfda2c392da4e713443c7dbffa1505bc9d566e71e55abe14c97ddc62"},
-    {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2031de0967c279df0d8a1c72b4ffc411ecd06bac607a212892757db7462fc494"},
-    {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ba5b2e6fe6ca2b7e013398bc7d7b170e21cce322d266ffcd57cca313e54fb246"},
-    {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a7bac939fa326db1ab741c9d7f44c565a1d1e80908b3797f7f81a4f86bc8d33"},
-    {file = "pydantic-1.10.12-cp311-cp311-win_amd64.whl", hash = "sha256:87afda5539d5140cb8ba9e8b8c8865cb5b1463924d38490d73d3ccfd80896b3f"},
-    {file = "pydantic-1.10.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:549a8e3d81df0a85226963611950b12d2d334f214436a19537b2efed61b7639a"},
-    {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:598da88dfa127b666852bef6d0d796573a8cf5009ffd62104094a4fe39599565"},
-    {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba5c4a8552bff16c61882db58544116d021d0b31ee7c66958d14cf386a5b5350"},
-    {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c79e6a11a07da7374f46970410b41d5e266f7f38f6a17a9c4823db80dadf4303"},
-    {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab26038b8375581dc832a63c948f261ae0aa21f1d34c1293469f135fa92972a5"},
-    {file = "pydantic-1.10.12-cp37-cp37m-win_amd64.whl", hash = "sha256:e0a16d274b588767602b7646fa05af2782576a6cf1022f4ba74cbb4db66f6ca8"},
-    {file = "pydantic-1.10.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a9dfa722316f4acf4460afdf5d41d5246a80e249c7ff475c43a3a1e9d75cf62"},
-    {file = "pydantic-1.10.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a73f489aebd0c2121ed974054cb2759af8a9f747de120acd2c3394cf84176ccb"},
-    {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b30bcb8cbfccfcf02acb8f1a261143fab622831d9c0989707e0e659f77a18e0"},
-    {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcfb5296d7877af406ba1547dfde9943b1256d8928732267e2653c26938cd9c"},
-    {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2f9a6fab5f82ada41d56b0602606a5506aab165ca54e52bc4545028382ef1c5d"},
-    {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dea7adcc33d5d105896401a1f37d56b47d443a2b2605ff8a969a0ed5543f7e33"},
-    {file = "pydantic-1.10.12-cp38-cp38-win_amd64.whl", hash = "sha256:1eb2085c13bce1612da8537b2d90f549c8cbb05c67e8f22854e201bde5d98a47"},
-    {file = "pydantic-1.10.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef6c96b2baa2100ec91a4b428f80d8f28a3c9e53568219b6c298c1125572ebc6"},
-    {file = "pydantic-1.10.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c076be61cd0177a8433c0adcb03475baf4ee91edf5a4e550161ad57fc90f523"},
-    {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d5a58feb9a39f481eda4d5ca220aa8b9d4f21a41274760b9bc66bfd72595b86"},
-    {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5f805d2d5d0a41633651a73fa4ecdd0b3d7a49de4ec3fadf062fe16501ddbf1"},
-    {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1289c180abd4bd4555bb927c42ee42abc3aee02b0fb2d1223fb7c6e5bef87dbe"},
-    {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d1197e462e0364906cbc19681605cb7c036f2475c899b6f296104ad42b9f5fb"},
-    {file = "pydantic-1.10.12-cp39-cp39-win_amd64.whl", hash = "sha256:fdbdd1d630195689f325c9ef1a12900524dceb503b00a987663ff4f58669b93d"},
-    {file = "pydantic-1.10.12-py3-none-any.whl", hash = "sha256:b749a43aa51e32839c9d71dc67eb1e4221bb04af1033a32e3923d46f9effa942"},
-    {file = "pydantic-1.10.12.tar.gz", hash = "sha256:0fe8a415cea8f340e7a9af9c54fc71a649b43e8ca3cc732986116b3cb135d303"},
+    {file = "pydantic-2.6.4-py3-none-any.whl", hash = "sha256:cc46fce86607580867bdc3361ad462bab9c222ef042d3da86f2fb333e1d916c5"},
+    {file = "pydantic-2.6.4.tar.gz", hash = "sha256:b1704e0847db01817624a6b86766967f552dd9dbf3afba4004409f908dcc84e6"},
 ]
 
 [package.dependencies]
-typing-extensions = ">=4.2.0"
+annotated-types = ">=0.4.0"
+pydantic-core = "2.16.3"
+typing-extensions = ">=4.6.1"
 
 [package.extras]
-dotenv = ["python-dotenv (>=0.10.4)"]
-email = ["email-validator (>=1.0.3)"]
+email = ["email-validator (>=2.0.0)"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.16.3"
+description = ""
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic_core-2.16.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:75b81e678d1c1ede0785c7f46690621e4c6e63ccd9192af1f0bd9d504bbb6bf4"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9c865a7ee6f93783bd5d781af5a4c43dadc37053a5b42f7d18dc019f8c9d2bd1"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:162e498303d2b1c036b957a1278fa0899d02b2842f1ff901b6395104c5554a45"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2f583bd01bbfbff4eaee0868e6fc607efdfcc2b03c1c766b06a707abbc856187"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b926dd38db1519ed3043a4de50214e0d600d404099c3392f098a7f9d75029ff8"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:716b542728d4c742353448765aa7cdaa519a7b82f9564130e2b3f6766018c9ec"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4ad7f7ee1a13d9cb49d8198cd7d7e3aa93e425f371a68235f784e99741561f"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bd87f48924f360e5d1c5f770d6155ce0e7d83f7b4e10c2f9ec001c73cf475c99"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0df446663464884297c793874573549229f9eca73b59360878f382a0fc085979"},
+    {file = "pydantic_core-2.16.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4df8a199d9f6afc5ae9a65f8f95ee52cae389a8c6b20163762bde0426275b7db"},
+    {file = "pydantic_core-2.16.3-cp310-none-win32.whl", hash = "sha256:456855f57b413f077dff513a5a28ed838dbbb15082ba00f80750377eed23d132"},
+    {file = "pydantic_core-2.16.3-cp310-none-win_amd64.whl", hash = "sha256:732da3243e1b8d3eab8c6ae23ae6a58548849d2e4a4e03a1924c8ddf71a387cb"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:519ae0312616026bf4cedc0fe459e982734f3ca82ee8c7246c19b650b60a5ee4"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b3992a322a5617ded0a9f23fd06dbc1e4bd7cf39bc4ccf344b10f80af58beacd"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d62da299c6ecb04df729e4b5c52dc0d53f4f8430b4492b93aa8de1f541c4aac"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2acca2be4bb2f2147ada8cac612f8a98fc09f41c89f87add7256ad27332c2fda"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1b662180108c55dfbf1280d865b2d116633d436cfc0bba82323554873967b340"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7c6ed0dc9d8e65f24f5824291550139fe6f37fac03788d4580da0d33bc00c97"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6b1bb0827f56654b4437955555dc3aeeebeddc47c2d7ed575477f082622c49e"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e56f8186d6210ac7ece503193ec84104da7ceb98f68ce18c07282fcc2452e76f"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:936e5db01dd49476fa8f4383c259b8b1303d5dd5fb34c97de194560698cc2c5e"},
+    {file = "pydantic_core-2.16.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:33809aebac276089b78db106ee692bdc9044710e26f24a9a2eaa35a0f9fa70ba"},
+    {file = "pydantic_core-2.16.3-cp311-none-win32.whl", hash = "sha256:ded1c35f15c9dea16ead9bffcde9bb5c7c031bff076355dc58dcb1cb436c4721"},
+    {file = "pydantic_core-2.16.3-cp311-none-win_amd64.whl", hash = "sha256:d89ca19cdd0dd5f31606a9329e309d4fcbb3df860960acec32630297d61820df"},
+    {file = "pydantic_core-2.16.3-cp311-none-win_arm64.whl", hash = "sha256:6162f8d2dc27ba21027f261e4fa26f8bcb3cf9784b7f9499466a311ac284b5b9"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:0f56ae86b60ea987ae8bcd6654a887238fd53d1384f9b222ac457070b7ac4cff"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9bd22a2a639e26171068f8ebb5400ce2c1bc7d17959f60a3b753ae13c632975"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4204e773b4b408062960e65468d5346bdfe139247ee5f1ca2a378983e11388a2"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f651dd19363c632f4abe3480a7c87a9773be27cfe1341aef06e8759599454120"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aaf09e615a0bf98d406657e0008e4a8701b11481840be7d31755dc9f97c44053"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8e47755d8152c1ab5b55928ab422a76e2e7b22b5ed8e90a7d584268dd49e9c6b"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:500960cb3a0543a724a81ba859da816e8cf01b0e6aaeedf2c3775d12ee49cade"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf6204fe865da605285c34cf1172879d0314ff267b1c35ff59de7154f35fdc2e"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d33dd21f572545649f90c38c227cc8631268ba25c460b5569abebdd0ec5974ca"},
+    {file = "pydantic_core-2.16.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:49d5d58abd4b83fb8ce763be7794d09b2f50f10aa65c0f0c1696c677edeb7cbf"},
+    {file = "pydantic_core-2.16.3-cp312-none-win32.whl", hash = "sha256:f53aace168a2a10582e570b7736cc5bef12cae9cf21775e3eafac597e8551fbe"},
+    {file = "pydantic_core-2.16.3-cp312-none-win_amd64.whl", hash = "sha256:0d32576b1de5a30d9a97f300cc6a3f4694c428d956adbc7e6e2f9cad279e45ed"},
+    {file = "pydantic_core-2.16.3-cp312-none-win_arm64.whl", hash = "sha256:ec08be75bb268473677edb83ba71e7e74b43c008e4a7b1907c6d57e940bf34b6"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:b1f6f5938d63c6139860f044e2538baeee6f0b251a1816e7adb6cbce106a1f01"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2a1ef6a36fdbf71538142ed604ad19b82f67b05749512e47f247a6ddd06afdc7"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:704d35ecc7e9c31d48926150afada60401c55efa3b46cd1ded5a01bdffaf1d48"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d937653a696465677ed583124b94a4b2d79f5e30b2c46115a68e482c6a591c8a"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9803edf8e29bd825f43481f19c37f50d2b01899448273b3a7758441b512acf8"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:72282ad4892a9fb2da25defeac8c2e84352c108705c972db82ab121d15f14e6d"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f752826b5b8361193df55afcdf8ca6a57d0232653494ba473630a83ba50d8c9"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4384a8f68ddb31a0b0c3deae88765f5868a1b9148939c3f4121233314ad5532c"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a4b2bf78342c40b3dc830880106f54328928ff03e357935ad26c7128bbd66ce8"},
+    {file = "pydantic_core-2.16.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:13dcc4802961b5f843a9385fc821a0b0135e8c07fc3d9949fd49627c1a5e6ae5"},
+    {file = "pydantic_core-2.16.3-cp38-none-win32.whl", hash = "sha256:e3e70c94a0c3841e6aa831edab1619ad5c511199be94d0c11ba75fe06efe107a"},
+    {file = "pydantic_core-2.16.3-cp38-none-win_amd64.whl", hash = "sha256:ecdf6bf5f578615f2e985a5e1f6572e23aa632c4bd1dc67f8f406d445ac115ed"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:bda1ee3e08252b8d41fa5537413ffdddd58fa73107171a126d3b9ff001b9b820"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:21b888c973e4f26b7a96491c0965a8a312e13be108022ee510248fe379a5fa23"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be0ec334369316fa73448cc8c982c01e5d2a81c95969d58b8f6e272884df0074"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b5b6079cc452a7c53dd378c6f881ac528246b3ac9aae0f8eef98498a75657805"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ee8d5f878dccb6d499ba4d30d757111847b6849ae07acdd1205fffa1fc1253c"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7233d65d9d651242a68801159763d09e9ec96e8a158dbf118dc090cd77a104c9"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6119dc90483a5cb50a1306adb8d52c66e447da88ea44f323e0ae1a5fcb14256"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:578114bc803a4c1ff9946d977c221e4376620a46cf78da267d946397dc9514a8"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d8f99b147ff3fcf6b3cc60cb0c39ea443884d5559a30b1481e92495f2310ff2b"},
+    {file = "pydantic_core-2.16.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4ac6b4ce1e7283d715c4b729d8f9dab9627586dafce81d9eaa009dd7f25dd972"},
+    {file = "pydantic_core-2.16.3-cp39-none-win32.whl", hash = "sha256:e7774b570e61cb998490c5235740d475413a1f6de823169b4cf94e2fe9e9f6b2"},
+    {file = "pydantic_core-2.16.3-cp39-none-win_amd64.whl", hash = "sha256:9091632a25b8b87b9a605ec0e61f241c456e9248bfdcf7abdf344fdb169c81cf"},
+    {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:36fa178aacbc277bc6b62a2c3da95226520da4f4e9e206fdf076484363895d2c"},
+    {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:dcca5d2bf65c6fb591fff92da03f94cd4f315972f97c21975398bd4bd046854a"},
+    {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a72fb9963cba4cd5793854fd12f4cfee731e86df140f59ff52a49b3552db241"},
+    {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b60cc1a081f80a2105a59385b92d82278b15d80ebb3adb200542ae165cd7d183"},
+    {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cbcc558401de90a746d02ef330c528f2e668c83350f045833543cd57ecead1ad"},
+    {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:fee427241c2d9fb7192b658190f9f5fd6dfe41e02f3c1489d2ec1e6a5ab1e04a"},
+    {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f4cb85f693044e0f71f394ff76c98ddc1bc0953e48c061725e540396d5c8a2e1"},
+    {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b29eeb887aa931c2fcef5aa515d9d176d25006794610c264ddc114c053bf96fe"},
+    {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a425479ee40ff021f8216c9d07a6a3b54b31c8267c6e17aa88b70d7ebd0e5e5b"},
+    {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:5c5cbc703168d1b7a838668998308018a2718c2130595e8e190220238addc96f"},
+    {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99b6add4c0b39a513d323d3b93bc173dac663c27b99860dd5bf491b240d26137"},
+    {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f76ee558751746d6a38f89d60b6228fa174e5172d143886af0f85aa306fd89"},
+    {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:00ee1c97b5364b84cb0bd82e9bbf645d5e2871fb8c58059d158412fee2d33d8a"},
+    {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:287073c66748f624be4cef893ef9174e3eb88fe0b8a78dc22e88eca4bc357ca6"},
+    {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ed25e1835c00a332cb10c683cd39da96a719ab1dfc08427d476bce41b92531fc"},
+    {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:86b3d0033580bd6bbe07590152007275bd7af95f98eaa5bd36f3da219dcd93da"},
+    {file = "pydantic_core-2.16.3.tar.gz", hash = "sha256:1cac689f80a3abab2d3c0048b29eea5751114054f032a941a32de4c852c59cad"},
+]
+
+[package.dependencies]
+typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
 
 [[package]]
 name = "pytest"
@@ -728,6 +797,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -800,19 +870,19 @@ pytest = ">=7.0.0,<8.0.0"
 
 [[package]]
 name = "text-generation"
-version = "0.6.0"
+version = "0.6.1"
 description = "Hugging Face Text Generation Python Client"
 optional = false
 python-versions = ">=3.7,<4.0"
 files = [
-    {file = "text-generation-0.6.0.tar.gz", hash = "sha256:48560e7a67b9a88b38335382d357f66e23b5a75f53971ccd436fc6f696a00815"},
-    {file = "text_generation-0.6.0-py3-none-any.whl", hash = "sha256:42ae7f7c9ff11f3a6c9d210f94fe708fe693eede79c6776da727456da1606ef9"},
+    {file = "text_generation-0.6.1-py3-none-any.whl", hash = "sha256:ebca00587eeabc0f5118f66ee1048bf690bd7735a9a10361c533c31c8c0bf994"},
+    {file = "text_generation-0.6.1.tar.gz", hash = "sha256:730e662aa7812f73c08ab953e008e90455f3d046f81efa0ef3de462bd4cf63d9"},
 ]
 
 [package.dependencies]
 aiohttp = ">=3.8,<4.0"
 huggingface-hub = ">=0.12,<1.0"
-pydantic = ">=1.10,<2.0"
+pydantic = ">1.10,<3"
 
 [[package]]
 name = "tomli"
@@ -979,4 +1049,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "bdad1d22d29138010cd6b11e1b92dc0630b35634422413a8456dc85a15bee05e"
+content-hash = "421fbce065cb1499c666599cf0fd83a5ce8fb3bed09e83c16c3a3d6953b34026"
diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml
index bb881d8e..88e9761a 100644
--- a/integration-tests/pyproject.toml
+++ b/integration-tests/pyproject.toml
@@ -1,10 +1,11 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.0.3"
+version = "2.0.1"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry <nicolas@huggingface.co>"]
 
 [tool.poetry.dependencies]
+pydantic = "> 2, < 3"
 python = ">=3.9,<3.13"
 syrupy = "4.0.1"
 text-generation = "^0.6.0"
diff --git a/integration-tests/pytest.ini b/integration-tests/pytest.ini
index 7dcae663..bab689d7 100644
--- a/integration-tests/pytest.ini
+++ b/integration-tests/pytest.ini
@@ -2,4 +2,4 @@
 addopts = --snapshot-warn-unused
 asyncio_mode = auto
 markers =
-    private: marks tests as requiring an admin hf token (deselect with '-m "not private"')
\ No newline at end of file
+    private: marks tests as requiring an admin hf token (deselect with '-m "not private"')
diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt
index 3f779a90..3c2ce11b 100644
--- a/integration-tests/requirements.txt
+++ b/integration-tests/requirements.txt
@@ -1,5 +1,6 @@
 aiohttp==3.8.5 ; python_version >= "3.9" and python_version < "3.13"
 aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "3.13"
+annotated-types==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 async-timeout==4.0.3 ; python_version >= "3.9" and python_version < "3.13"
 attrs==23.1.0 ; python_version >= "3.9" and python_version < "3.13"
 certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
@@ -17,14 +18,15 @@ iniconfig==2.0.0 ; python_version >= "3.9" and python_version < "3.13"
 multidict==6.0.4 ; python_version >= "3.9" and python_version < "3.13"
 packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
 pluggy==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
-pydantic==1.10.12 ; python_version >= "3.9" and python_version < "3.13"
+pydantic-core==2.16.3 ; python_version >= "3.9" and python_version < "3.13"
+pydantic==2.6.4 ; python_version >= "3.9" and python_version < "3.13"
 pytest-asyncio==0.21.1 ; python_version >= "3.9" and python_version < "3.13"
 pytest==7.4.0 ; python_version >= "3.9" and python_version < "3.13"
 pywin32==306 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
 requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
 syrupy==4.0.1 ; python_version >= "3.9" and python_version < "3.13"
-text-generation==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
+text-generation==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 tomli==2.0.1 ; python_version >= "3.9" and python_version < "3.11"
 tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml
index 3e7f86d4..eb219423 100644
--- a/launcher/Cargo.toml
+++ b/launcher/Cargo.toml
@@ -7,17 +7,20 @@ authors.workspace = true
 homepage.workspace = true
 
 [dependencies]
-clap = { version = "4.1.4", features = ["derive", "env"] }
-ctrlc = { version = "3.2.5", features = ["termination"] }
-nix = "0.26.2"
-serde = { version = "1.0.152", features = ["derive"]  }
-serde_json = "1.0.93"
+clap = { version = "4.4.5", features = ["derive", "env"] }
+ctrlc = { version = "3.4.1", features = ["termination"] }
+hf-hub = "0.3.2"
+nix = { version = "0.28.0", features = ["signal"] }
+once_cell = "1.19.0"
+serde = { version = "1.0.188", features = ["derive"] }
+serde_json = "1.0.107"
+thiserror = "1.0.59"
 tracing = "0.1.37"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
+tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
 
 [dev-dependencies]
 float_eq = "1.0.1"
-reqwest = { version = "0.11.14", features = ["blocking", "json"] }
+reqwest = { version = "0.11.20", features = ["blocking", "json"] }
 
 [build-dependencies]
-vergen = { version = "8.0.0", features = ["build", "cargo", "git", "gitcl", "rustc", "si"] }
+vergen = { version = "8.2.5", features = ["build", "cargo", "git", "gitcl", "rustc", "si"] }
diff --git a/launcher/src/env_runtime.rs b/launcher/src/env_runtime.rs
index 9dbc83f7..08fb301c 100644
--- a/launcher/src/env_runtime.rs
+++ b/launcher/src/env_runtime.rs
@@ -7,14 +7,17 @@ pub(crate) struct Env {
     git_sha: &'static str,
     docker_label: &'static str,
     nvidia_env: String,
+    xpu_env: String,
 }
 
 impl Env {
     pub fn new() -> Self {
         let nvidia_env = nvidia_smi();
+        let xpu_env = xpu_smi();
 
         Self {
             nvidia_env: nvidia_env.unwrap_or("N/A".to_string()),
+            xpu_env: xpu_env.unwrap_or("N/A".to_string()),
             cargo_target: env!("VERGEN_CARGO_TARGET_TRIPLE"),
             cargo_version: env!("VERGEN_RUSTC_SEMVER"),
             git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
@@ -31,7 +34,8 @@ impl fmt::Display for Env {
         writeln!(f, "Cargo version: {}", self.cargo_version)?;
         writeln!(f, "Commit sha: {}", self.git_sha)?;
         writeln!(f, "Docker label: {}", self.docker_label)?;
-        write!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
+        writeln!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
+        write!(f, "xpu-smi:\n{}", self.xpu_env)?;
 
         Ok(())
     }
@@ -43,3 +47,10 @@ fn nvidia_smi() -> Option<String> {
     let output = nvidia_smi.replace('\n', "\n   ");
     Some(output.trim().to_string())
 }
+
+fn xpu_smi() -> Option<String> {
+    let output = Command::new("xpu-smi").arg("discovery").output().ok()?;
+    let xpu_smi = String::from_utf8(output.stdout).ok()?;
+    let output = xpu_smi.replace('\n', "\n   ");
+    Some(output.trim().to_string())
+}
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index cbb6f25d..d2ca38e5 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -1,10 +1,11 @@
 use clap::{Parser, ValueEnum};
+use hf_hub::{api::sync::Api, Repo, RepoType};
 use nix::sys::signal::{self, Signal};
 use nix::unistd::Pid;
 use serde::Deserialize;
 use std::env;
 use std::ffi::OsString;
-use std::io::{BufRead, BufReader, Lines, Read};
+use std::io::{BufRead, BufReader, Lines};
 use std::os::unix::process::{CommandExt, ExitStatusExt};
 use std::path::Path;
 use std::process::{Child, Command, ExitStatus, Stdio};
@@ -15,22 +16,82 @@ use std::thread;
 use std::thread::sleep;
 use std::time::{Duration, Instant};
 use std::{fs, io};
-use tracing_subscriber::EnvFilter;
+use thiserror::Error;
+use tracing_subscriber::{filter::LevelFilter, EnvFilter};
 
 mod env_runtime;
 
+#[derive(Deserialize)]
+struct RawConfig {
+    max_position_embeddings: Option<usize>,
+    n_positions: Option<usize>,
+    max_seq_len: Option<usize>,
+}
+
+#[derive(Deserialize)]
+struct Config {
+    max_position_embeddings: Option<usize>,
+}
+
+impl From<RawConfig> for Config {
+    fn from(other: RawConfig) -> Self {
+        let max_position_embeddings = other
+            .max_position_embeddings
+            .or(other.max_seq_len)
+            .or(other.n_positions);
+        Config {
+            max_position_embeddings,
+        }
+    }
+}
+
 #[derive(Clone, Copy, Debug, ValueEnum)]
 enum Quantization {
-    Bitsandbytes,
-    BitsandbytesNF4,
-    BitsandbytesFP4,
+    /// 4 bit quantization. Requires a specific AWQ quantized model:
+    ///   <https://hf.co/models?search=awq>.
+    /// Should replace GPTQ models wherever possible because of the better latency
+    Awq,
+    /// 8 bit quantization, doesn't require specific model.
+    /// Should be a drop-in replacement to bitsandbytes with much better performance.
+    /// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
+    Eetq,
+    /// Variable bit quantization. Requires a specific EXL2 quantized model:
+    /// <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does
+    /// not support tensor parallelism (num_shard > 1).
+    Exl2,
+    /// 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>.
+    /// text-generation-inference will use exllama (faster) kernels wherever possible, and use
+    /// triton kernel (wider support) when it's not.
+    /// AWQ has faster kernels.
     Gptq,
+    /// 4 bit quantization. Requires a specific Marlin quantized model: <https://hf.co/models?search=marlin>.
+    Marlin,
+    /// Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half,
+    /// but it is known that the model will be much slower to run than the native f16.
+    #[deprecated(
+        since = "1.1.0",
+        note = "Use `eetq` instead, which provides better latencies overall and is drop-in in most cases"
+    )]
+    Bitsandbytes,
+    /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
+    /// but it is known that the model will be much slower to run than the native f16.
+    BitsandbytesNF4,
+    /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
+    /// perplexity performance for your model
+    BitsandbytesFP4,
+    /// [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above
+    /// This dtype has native ops and should be the fastest if available.
+    /// This is currently not the fastest because of local unpacking + padding to satisfy matrix
+    /// multiplication limitations.
+    Fp8,
 }
 
 impl std::fmt::Display for Quantization {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         // To keep in track with `server`.
         match self {
+            #[allow(deprecated)]
+            // Use `eetq` instead, which provides better latencies overall and is drop-in in most cases
             Quantization::Bitsandbytes => {
                 write!(f, "bitsandbytes")
             }
@@ -40,9 +101,24 @@ impl std::fmt::Display for Quantization {
             Quantization::BitsandbytesFP4 => {
                 write!(f, "bitsandbytes-fp4")
             }
+            Quantization::Exl2 => {
+                write!(f, "exl2")
+            }
             Quantization::Gptq => {
                 write!(f, "gptq")
             }
+            Quantization::Marlin => {
+                write!(f, "marlin")
+            }
+            Quantization::Awq => {
+                write!(f, "awq")
+            }
+            Quantization::Eetq => {
+                write!(f, "eetq")
+            }
+            Quantization::Fp8 => {
+                write!(f, "fp8")
+            }
         }
     }
 }
@@ -123,12 +199,17 @@ struct Args {
     #[clap(long, env)]
     num_shard: Option<usize>,
 
-    /// Whether you want the model to be quantized. This will use `bitsandbytes` for
-    /// quantization on the fly, or `gptq`. 4bit quantization is available through
-    /// `bitsandbytes` by providing the `bitsandbytes-fp4` or `bitsandbytes-nf4` options.
+    /// Whether you want the model to be quantized.
     #[clap(long, env, value_enum)]
     quantize: Option<Quantization>,
 
+    /// The number of input_ids to speculate on
+    /// If using a medusa model, the heads will be picked up automatically
+    /// Other wise, it will use n-gram speculation which is relatively free
+    /// in terms of compute, but the speedup heavily depends on the task.
+    #[clap(long, env)]
+    speculate: Option<usize>,
+
     /// The dtype to be forced upon the model. This option cannot be used with `--quantize`.
     #[clap(long, env, value_enum)]
     dtype: Option<Dtype>,
@@ -160,7 +241,7 @@ struct Args {
     max_stop_sequences: usize,
 
     /// This is the maximum allowed value for clients to set `top_n_tokens`.
-    /// `top_n_tokens is used to return information about the the `n` most likely
+    /// `top_n_tokens` is used to return information about the `n` most likely
     /// tokens at each generation step, instead of just the sampled token. This
     /// information can be used for downstream tasks like for classification or
     /// ranking.
@@ -171,8 +252,13 @@ struct Args {
     /// for users. The larger this value, the longer prompt users can send which
     /// can impact the overall memory required to handle the load.
     /// Please note that some models have a finite range of sequence they can handle.
-    #[clap(default_value = "1024", long, env)]
-    max_input_length: usize,
+    /// Default to min(max_position_embeddings - 1, 4095)
+    #[clap(long, env)]
+    max_input_tokens: Option<usize>,
+
+    /// Legacy version of [`Args::max_input_tokens`].
+    #[clap(long, env)]
+    max_input_length: Option<usize>,
 
     /// This is the most important value to set as it defines the "memory budget"
     /// of running clients requests.
@@ -182,8 +268,9 @@ struct Args {
     /// `1511` max_new_tokens.
     /// The larger this value, the larger amount each request will be in your RAM
     /// and the less effective batching can be.
-    #[clap(default_value = "2048", long, env)]
-    max_total_tokens: usize,
+    /// Default to min(max_position_embeddings, 4096)
+    #[clap(long, env)]
+    max_total_tokens: Option<usize>,
 
     /// This represents the ratio of waiting queries vs running queries where
     /// you want to start considering pausing the running queries to include the waiting
@@ -195,14 +282,15 @@ struct Args {
     ///
     /// This setting is only applied if there is room in the batch
     /// as defined by `max_batch_total_tokens`.
-    #[clap(default_value = "1.2", long, env)]
+    #[clap(default_value = "0.3", long, env)]
     waiting_served_ratio: f32,
 
     /// Limits the number of tokens for the prefill operation.
     /// Since this operation take the most memory and is compute bound, it is interesting
     /// to limit the number of requests that can be sent.
-    #[clap(default_value = "4096", long, env)]
-    max_batch_prefill_tokens: u32,
+    /// Default to `max_input_tokens + 50` to give a bit of room.
+    #[clap(long, env)]
+    max_batch_prefill_tokens: Option<u32>,
 
     /// **IMPORTANT** This is one critical control to allow maximum usage
     /// of the available hardware.
@@ -244,6 +332,17 @@ struct Args {
     #[clap(default_value = "20", long, env)]
     max_waiting_tokens: usize,
 
+    /// Enforce a maximum number of requests per batch
+    /// Specific flag for hardware targets that do not support unpadded inference
+    #[clap(long, env)]
+    max_batch_size: Option<usize>,
+
+    /// Specify the batch sizes to compute cuda graphs for.
+    /// Use "0" to disable.
+    /// Default = "1,2,4,8,16,32"
+    #[clap(long, env, value_delimiter = ',')]
+    cuda_graphs: Option<Vec<usize>>,
+
     /// The IP address to listen on
     #[clap(default_value = "0.0.0.0", long, env)]
     hostname: String,
@@ -314,6 +413,9 @@ struct Args {
     #[clap(long, env)]
     otlp_endpoint: Option<String>,
 
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
+
     #[clap(long, env)]
     cors_allow_origin: Vec<String>,
     #[clap(long, env)]
@@ -333,9 +435,28 @@ struct Args {
     #[clap(long, env)]
     ngrok_edge: Option<String>,
 
+    /// The path to the tokenizer config file. This path is used to load the tokenizer configuration which may
+    /// include a `chat_template`. If not provided, the default config will be used from the model hub.
+    #[clap(long, env)]
+    tokenizer_config_path: Option<String>,
+
+    /// Disable outlines grammar constrained generation.
+    /// This is a feature that allows you to generate text that follows a specific grammar.
+    #[clap(long, env)]
+    disable_grammar_support: bool,
+
     /// Display a lot of information about your runtime environment
     #[clap(long, short, action)]
     env: bool,
+
+    /// Control the maximum number of inputs that a client can send in a single request
+    #[clap(default_value = "4", long, env)]
+    max_client_batch_size: usize,
+
+    /// Lora Adapters: a list of adapter ids, e.g. `repo/adapter1,repo/adapter2`, to load during
+    /// startup that will be available to callers via the `adapter_id` field in a request.
+    #[clap(long, env)]
+    lora_adapters: Option<String>,
 }
 
 #[derive(Debug)]
@@ -349,6 +470,7 @@ fn shard_manager(
     model_id: String,
     revision: Option<String>,
     quantize: Option<Quantization>,
+    speculate: Option<usize>,
     dtype: Option<Dtype>,
     trust_remote_code: bool,
     uds_path: String,
@@ -361,10 +483,17 @@ fn shard_manager(
     disable_custom_kernels: bool,
     watermark_gamma: Option<f32>,
     watermark_delta: Option<f32>,
+    cuda_graphs: Vec<usize>,
     cuda_memory_fraction: f32,
     rope_scaling: Option<RopeScaling>,
     rope_factor: Option<f32>,
+    max_total_tokens: usize,
+    max_batch_size: Option<usize>,
+    max_input_tokens: usize,
+    lora_adapters: Option<String>,
     otlp_endpoint: Option<String>,
+    otlp_service_name: String,
+    log_level: LevelFilter,
     status_sender: mpsc::Sender<ShardStatus>,
     shutdown: Arc<AtomicBool>,
     _shutdown_sender: mpsc::Sender<()>,
@@ -387,7 +516,7 @@ fn shard_manager(
         "--uds-path".to_string(),
         uds_path,
         "--logger-level".to_string(),
-        "INFO".to_string(),
+        log_level.to_string().to_uppercase(),
         "--json-output".to_string(),
     ];
 
@@ -406,6 +535,11 @@ fn shard_manager(
         shard_args.push(quantize.to_string())
     }
 
+    if let Some(speculate) = speculate {
+        shard_args.push("--speculate".to_string());
+        shard_args.push(speculate.to_string())
+    }
+
     if let Some(dtype) = dtype {
         shard_args.push("--dtype".to_string());
         shard_args.push(dtype.to_string())
@@ -423,21 +557,33 @@ fn shard_manager(
         (Some(scaling), Some(factor)) => Some((scaling, factor)),
         (None, Some(factor)) => Some((RopeScaling::Linear, factor)),
     };
-    // OpenTelemetry
+
+    // OpenTelemetry Endpoint
     if let Some(otlp_endpoint) = otlp_endpoint {
         shard_args.push("--otlp-endpoint".to_string());
         shard_args.push(otlp_endpoint);
     }
 
+    // OpenTelemetry Service Name
+    shard_args.push("--otlp-service-name".to_string());
+    shard_args.push(otlp_service_name);
+
+    // In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter.
+    shard_args.push("--max-input-tokens".to_string());
+    shard_args.push(max_input_tokens.to_string());
+
     // Copy current process env
     let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();
 
+    // Remove LOG_LEVEL if present
+    envs.retain(|(name, _)| name != "LOG_LEVEL");
+
     // Torch Distributed Env vars
     envs.push(("RANK".into(), rank.to_string().into()));
     envs.push(("WORLD_SIZE".into(), world_size.to_string().into()));
     envs.push(("MASTER_ADDR".into(), master_addr.into()));
     envs.push(("MASTER_PORT".into(), master_port.to_string().into()));
-    envs.push(("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()));
+    envs.push(("TORCH_NCCL_AVOID_RECORD_STREAMS".into(), "1".into()));
 
     // CUDA memory fraction
     envs.push((
@@ -448,6 +594,9 @@ fn shard_manager(
     // Safetensors load fast
     envs.push(("SAFETENSORS_FAST_GPU".into(), "1".into()));
 
+    // Disable progress bar
+    envs.push(("HF_HUB_DISABLE_PROGRESS_BARS".into(), "1".into()));
+
     // Enable hf transfer for insane download speeds
     let enable_hf_transfer = env::var("HF_HUB_ENABLE_HF_TRANSFER").unwrap_or("1".to_string());
     envs.push((
@@ -457,7 +606,7 @@ fn shard_manager(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // Detect rope scaling
@@ -469,6 +618,19 @@ fn shard_manager(
         envs.push(("ROPE_FACTOR".into(), factor.to_string().into()));
     }
 
+    envs.push((
+        "MAX_TOTAL_TOKENS".into(),
+        max_total_tokens.to_string().into(),
+    ));
+    if let Some(max_batch_size) = max_batch_size {
+        envs.push(("MAX_BATCH_SIZE".into(), max_batch_size.to_string().into()));
+    }
+
+    // Lora Adapters
+    if let Some(lora_adapters) = lora_adapters {
+        envs.push(("LORA_ADAPTERS".into(), lora_adapters.into()));
+    }
+
     // If huggingface_hub_cache is some, pass it to the shard
     // Useful when running inside a docker container
     if let Some(huggingface_hub_cache) = huggingface_hub_cache {
@@ -484,6 +646,19 @@ fn shard_manager(
         ));
     };
 
+    // Enable experimental support for cuda graphs
+    if !cuda_graphs.is_empty() {
+        envs.push((
+            "CUDA_GRAPHS".into(),
+            cuda_graphs
+                .into_iter()
+                .map(|c| c.to_string())
+                .collect::<Vec<_>>()
+                .join(",")
+                .into(),
+        ));
+    }
+
     // If disable_custom_kernels is true, pass it to the shard as an env var
     if disable_custom_kernels {
         envs.push(("DISABLE_CUSTOM_KERNELS".into(), "True".into()))
@@ -503,6 +678,7 @@ fn shard_manager(
     tracing::info!("Starting shard");
     let mut p = match Command::new("text-generation-server")
         .args(shard_args)
+        .env_clear()
         .envs(envs)
         .stdout(Stdio::piped())
         .stderr(Stdio::piped())
@@ -532,6 +708,13 @@ fn shard_manager(
     thread::spawn(move || {
         log_lines(shard_stdout_reader.lines());
     });
+    // We read stderr in another thread as it seems that lines() can block in some cases
+    let (err_sender, err_receiver) = mpsc::channel();
+    thread::spawn(move || {
+        for line in shard_stderr_reader.lines().map_while(Result::ok) {
+            err_sender.send(line).unwrap_or(());
+        }
+    });
 
     let mut ready = false;
     let start_time = Instant::now();
@@ -539,13 +722,6 @@ fn shard_manager(
     loop {
         // Process exited
         if let Some(exit_status) = p.try_wait().unwrap() {
-            // We read stderr in another thread as it seems that lines() can block in some cases
-            let (err_sender, err_receiver) = mpsc::channel();
-            thread::spawn(move || {
-                for line in shard_stderr_reader.lines().flatten() {
-                    err_sender.send(line).unwrap_or(());
-                }
-            });
             let mut err = String::new();
             while let Ok(line) = err_receiver.recv_timeout(Duration::from_millis(10)) {
                 err = err + "\n" + &line;
@@ -563,9 +739,7 @@ fn shard_manager(
 
         // We received a shutdown signal
         if shutdown.load(Ordering::SeqCst) {
-            p.kill().unwrap();
-            let _ = p.wait();
-            tracing::info!("Shard terminated");
+            terminate("shard", p, Duration::from_secs(90)).unwrap();
             return;
         }
 
@@ -596,7 +770,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
 fn num_cuda_devices() -> Option<usize> {
     let devices = match env::var("CUDA_VISIBLE_DEVICES") {
         Ok(devices) => devices,
-        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
+            Ok(devices) => devices,
+            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
+        },
     };
     let n_devices = devices.split(',').count();
     Some(n_devices)
@@ -633,13 +810,13 @@ struct PythonLogMessage {
 impl PythonLogMessage {
     fn trace(&self) {
         match self.record.level.name {
-            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text),
-            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text),
-            PythonLogLevelEnum::Info => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Success => tracing::info!("{}", self.text),
-            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text),
-            PythonLogLevelEnum::Error => tracing::error!("{}", self.text),
-            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text),
+            PythonLogLevelEnum::Trace => tracing::trace!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Debug => tracing::debug!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Info => tracing::info!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Success => tracing::info!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Warning => tracing::warn!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Error => tracing::error!("{}", self.text.trim_end()),
+            PythonLogLevelEnum::Critical => tracing::error!("{}", self.text.trim_end()),
         }
     }
 }
@@ -653,7 +830,7 @@ impl TryFrom<&String> for PythonLogMessage {
 }
 
 fn log_lines<S: Sized + BufRead>(lines: Lines<S>) {
-    for line in lines.flatten() {
+    for line in lines.map_while(Result::ok) {
         match PythonLogMessage::try_from(&line) {
             Ok(log) => log.trace(),
             Err(_) => tracing::debug!("{line}"),
@@ -669,9 +846,9 @@ fn find_num_shards(
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
             let n_devices = num_cuda_devices()
-                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
             if n_devices <= 1 {
                 return Err(LauncherError::NotEnoughCUDADevices(format!(
                     "`sharded` is true but only found {n_devices} CUDA devices"
@@ -701,25 +878,40 @@ fn find_num_shards(
     Ok(num_shard)
 }
 
-#[derive(Debug)]
+#[derive(Debug, Error)]
 enum LauncherError {
+    #[error("Invalid argument: {0}")]
     ArgumentValidation(String),
+    #[error("not enough cuda devices: {0}")]
     NotEnoughCUDADevices(String),
+    #[error("Download error")]
     DownloadError,
+    #[error("Shard cannot start")]
     ShardCannotStart,
+    #[error("Shard disconnected")]
     ShardDisconnected,
+    #[error("Shard failed")]
     ShardFailed,
+    #[error("Webserver failed")]
     WebserverFailed,
+    #[error("Webserver cannot start")]
     WebserverCannotStart,
 }
 
-fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), LauncherError> {
+fn download_convert_model(
+    model_id: &str,
+    revision: Option<&str>,
+    trust_remote_code: bool,
+    huggingface_hub_cache: Option<&str>,
+    weights_cache_override: Option<&str>,
+    running: Arc<AtomicBool>,
+) -> Result<(), LauncherError> {
     // Enter download tracing span
     let _span = tracing::span!(tracing::Level::INFO, "download").entered();
 
     let mut download_args = vec![
         "download-weights".to_string(),
-        args.model_id.to_string(),
+        model_id.to_string(),
         "--extension".to_string(),
         ".safetensors".to_string(),
         "--logger-level".to_string(),
@@ -728,22 +920,28 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
     ];
 
     // Model optional revision
-    if let Some(revision) = &args.revision {
+    if let Some(revision) = &revision {
         download_args.push("--revision".to_string());
         download_args.push(revision.to_string())
     }
 
     // Trust remote code for automatic peft fusion
-    if args.trust_remote_code {
+    if trust_remote_code {
         download_args.push("--trust-remote-code".to_string());
     }
 
     // Copy current process env
     let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();
 
+    // Remove LOG_LEVEL if present
+    envs.retain(|(name, _)| name != "LOG_LEVEL");
+
+    // Disable progress bar
+    envs.push(("HF_HUB_DISABLE_PROGRESS_BARS".into(), "1".into()));
+
     // If huggingface_hub_cache is set, pass it to the download process
     // Useful when running inside a docker container
-    if let Some(ref huggingface_hub_cache) = args.huggingface_hub_cache {
+    if let Some(ref huggingface_hub_cache) = huggingface_hub_cache {
         envs.push(("HUGGINGFACE_HUB_CACHE".into(), huggingface_hub_cache.into()));
     };
 
@@ -756,12 +954,12 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
     // If args.weights_cache_override is some, pass it to the download process
     // Useful when running inside a HuggingFace Inference Endpoint
-    if let Some(weights_cache_override) = &args.weights_cache_override {
+    if let Some(weights_cache_override) = &weights_cache_override {
         envs.push((
             "WEIGHTS_CACHE_OVERRIDE".into(),
             weights_cache_override.into(),
@@ -769,9 +967,10 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
     };
 
     // Start process
-    tracing::info!("Starting download process.");
+    tracing::info!("Starting check and download process for {model_id}");
     let mut download_process = match Command::new("text-generation-server")
         .args(download_args)
+        .env_clear()
         .envs(envs)
         .stdout(Stdio::piped())
         .stderr(Stdio::piped())
@@ -791,28 +990,34 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
         }
     };
 
-    // Redirect STDOUT to the console
-    let download_stdout = download_process.stdout.take().unwrap();
-    let stdout = BufReader::new(download_stdout);
+    let download_stdout = BufReader::new(download_process.stdout.take().unwrap());
 
     thread::spawn(move || {
-        log_lines(stdout.lines());
+        log_lines(download_stdout.lines());
+    });
+
+    let download_stderr = BufReader::new(download_process.stderr.take().unwrap());
+
+    // We read stderr in another thread as it seems that lines() can block in some cases
+    let (err_sender, err_receiver) = mpsc::channel();
+    thread::spawn(move || {
+        for line in download_stderr.lines().map_while(Result::ok) {
+            err_sender.send(line).unwrap_or(());
+        }
     });
 
     loop {
         if let Some(status) = download_process.try_wait().unwrap() {
             if status.success() {
-                tracing::info!("Successfully downloaded weights.");
+                tracing::info!("Successfully downloaded weights for {model_id}");
                 break;
             }
 
             let mut err = String::new();
-            download_process
-                .stderr
-                .take()
-                .unwrap()
-                .read_to_string(&mut err)
-                .unwrap();
+            while let Ok(line) = err_receiver.recv_timeout(Duration::from_millis(10)) {
+                err = err + "\n" + &line;
+            }
+
             if let Some(signal) = status.signal() {
                 tracing::error!(
                     "Download process was signaled to shutdown with signal {signal}: {err}"
@@ -836,6 +1041,10 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
 fn spawn_shards(
     num_shard: usize,
     args: &Args,
+    cuda_graphs: Vec<usize>,
+    max_total_tokens: usize,
+    max_input_tokens: usize,
+    max_log_level: LevelFilter,
     shutdown: Arc<AtomicBool>,
     shutdown_receiver: &mpsc::Receiver<()>,
     shutdown_sender: mpsc::Sender<()>,
@@ -855,21 +1064,27 @@ fn spawn_shards(
         let shutdown = shutdown.clone();
         let shutdown_sender = shutdown_sender.clone();
         let otlp_endpoint = args.otlp_endpoint.clone();
+        let otlp_service_name = args.otlp_service_name.clone();
         let quantize = args.quantize;
+        let speculate = args.speculate;
         let dtype = args.dtype;
         let trust_remote_code = args.trust_remote_code;
         let master_port = args.master_port;
         let disable_custom_kernels = args.disable_custom_kernels;
         let watermark_gamma = args.watermark_gamma;
         let watermark_delta = args.watermark_delta;
+        let cuda_graphs_clone = cuda_graphs.clone();
         let cuda_memory_fraction = args.cuda_memory_fraction;
         let rope_scaling = args.rope_scaling;
         let rope_factor = args.rope_factor;
+        let max_batch_size = args.max_batch_size;
+        let lora_adapters = args.lora_adapters.clone();
         thread::spawn(move || {
             shard_manager(
                 model_id,
                 revision,
                 quantize,
+                speculate,
                 dtype,
                 trust_remote_code,
                 uds_path,
@@ -882,10 +1097,17 @@ fn spawn_shards(
                 disable_custom_kernels,
                 watermark_gamma,
                 watermark_delta,
+                cuda_graphs_clone,
                 cuda_memory_fraction,
                 rope_scaling,
                 rope_factor,
+                max_total_tokens,
+                max_batch_size,
+                max_input_tokens,
+                lora_adapters,
                 otlp_endpoint,
+                otlp_service_name,
+                max_log_level,
                 status_sender,
                 shutdown,
                 shutdown_sender,
@@ -922,8 +1144,24 @@ fn spawn_shards(
     Ok(())
 }
 
+/// Label the deployment as `{num_shard}-{gpu-name}` using the first GPU row printed by `nvidia-smi`.
+fn compute_type(num_shard: usize) -> Option<String> {
+    let output = Command::new("nvidia-smi")
+        .args(["--query-gpu=gpu_name", "--format=csv"])
+        .output()
+        .ok()?;
+    let output = String::from_utf8(output.stdout).ok()?;
+    // Row 0 is the CSV header; a blank row 1 means no GPU name was reported.
+    let fullname = output.lines().nth(1).map(str::trim).filter(|name| !name.is_empty())?;
+    Some(format!("{num_shard}-{}", fullname.replace(' ', "-").to_lowercase()))
+}
+
 fn spawn_webserver(
+    num_shard: usize,
     args: Args,
+    max_input_tokens: usize,
+    max_total_tokens: usize,
+    max_batch_prefill_tokens: u32,
     shutdown: Arc<AtomicBool>,
     shutdown_receiver: &mpsc::Receiver<()>,
 ) -> Result<Child, LauncherError> {
@@ -931,6 +1169,8 @@ fn spawn_webserver(
     // Start webserver
     tracing::info!("Starting Webserver");
     let mut router_args = vec![
+        "--max-client-batch-size".to_string(),
+        args.max_client_batch_size.to_string(),
         "--max-concurrent-requests".to_string(),
         args.max_concurrent_requests.to_string(),
         "--max-best-of".to_string(),
@@ -939,12 +1179,12 @@ fn spawn_webserver(
         args.max_stop_sequences.to_string(),
         "--max-top-n-tokens".to_string(),
         args.max_top_n_tokens.to_string(),
-        "--max-input-length".to_string(),
-        args.max_input_length.to_string(),
+        "--max-input-tokens".to_string(),
+        max_input_tokens.to_string(),
         "--max-total-tokens".to_string(),
-        args.max_total_tokens.to_string(),
+        max_total_tokens.to_string(),
         "--max-batch-prefill-tokens".to_string(),
-        args.max_batch_prefill_tokens.to_string(),
+        max_batch_prefill_tokens.to_string(),
         "--waiting-served-ratio".to_string(),
         args.waiting_served_ratio.to_string(),
         "--max-waiting-tokens".to_string(),
@@ -961,12 +1201,29 @@ fn spawn_webserver(
         args.model_id,
     ];
 
+    // Grammar support
+    if args.disable_grammar_support {
+        router_args.push("--disable-grammar-support".to_string());
+    }
+
+    // Tokenizer config path
+    if let Some(ref tokenizer_config_path) = args.tokenizer_config_path {
+        router_args.push("--tokenizer-config-path".to_string());
+        router_args.push(tokenizer_config_path.to_string());
+    }
+
     // Model optional max batch total tokens
     if let Some(max_batch_total_tokens) = args.max_batch_total_tokens {
         router_args.push("--max-batch-total-tokens".to_string());
         router_args.push(max_batch_total_tokens.to_string());
     }
 
+    // Router optional max batch size
+    if let Some(max_batch_size) = args.max_batch_size {
+        router_args.push("--max-batch-size".to_string());
+        router_args.push(max_batch_size.to_string());
+    }
+
     // Model optional revision
     if let Some(ref revision) = args.revision {
         router_args.push("--revision".to_string());
@@ -983,6 +1240,11 @@ fn spawn_webserver(
         router_args.push(otlp_endpoint);
     }
 
+    // OpenTelemetry
+    let otlp_service_name = args.otlp_service_name;
+    router_args.push("--otlp-service-name".to_string());
+    router_args.push(otlp_service_name);
+
     // CORS origins
     for origin in args.cors_allow_origin.into_iter() {
         router_args.push("--cors-allow-origin".to_string());
@@ -1003,9 +1265,16 @@ fn spawn_webserver(
 
     // Parse Inference API token
     if let Ok(api_token) = env::var("HF_API_TOKEN") {
-        envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
+        envs.push(("HF_TOKEN".into(), api_token.into()))
     };
 
+    // Parse Compute type
+    if let Ok(compute_type) = env::var("COMPUTE_TYPE") {
+        envs.push(("COMPUTE_TYPE".into(), compute_type.into()))
+    } else if let Some(compute_type) = compute_type(num_shard) {
+        envs.push(("COMPUTE_TYPE".into(), compute_type.into()))
+    }
+
     let mut webserver = match Command::new("text-generation-router")
         .args(router_args)
         .envs(envs)
@@ -1053,7 +1322,6 @@ fn terminate(process_name: &str, mut process: Child, timeout: Duration) -> io::R
     signal::kill(Pid::from_raw(process.id() as i32), Signal::SIGTERM).unwrap();
 
     tracing::info!("Waiting for {process_name} to gracefully shutdown");
-
     while terminate_time.elapsed() < timeout {
         if let Some(status) = process.try_wait()? {
             tracing::info!("{process_name} terminated");
@@ -1061,7 +1329,6 @@ fn terminate(process_name: &str, mut process: Child, timeout: Duration) -> io::R
         }
         sleep(Duration::from_millis(100));
     }
-
     tracing::info!("Killing {process_name}");
 
     process.kill()?;
@@ -1076,8 +1343,22 @@ fn main() -> Result<(), LauncherError> {
     let args: Args = Args::parse();
 
     // Filter events with LOG_LEVEL
-    let env_filter =
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let varname = "LOG_LEVEL";
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
+        // Override so that simple log levels are not spammed with tokio-level information
+        let log_level = match &log_level[..] {
+            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
+            "info" => "text_generation_launcher=info,text_generation_router=info",
+            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
+            log_level => log_level,
+        };
+        EnvFilter::builder()
+            .with_default_directive(LevelFilter::INFO.into())
+            .parse_lossy(log_level)
+    } else {
+        EnvFilter::new("info")
+    };
+    let max_log_level = env_filter.max_level_hint().unwrap_or(LevelFilter::INFO);
 
     if args.json_output {
         tracing_subscriber::fmt()
@@ -1096,21 +1377,133 @@ fn main() -> Result<(), LauncherError> {
         tracing::info!("{}", env_runtime);
     }
 
-    tracing::info!("{:?}", args);
+    tracing::info!("{:#?}", args);
+
+    let get_max_position_embeddings = || -> Result<usize, Box<dyn std::error::Error>> {
+        let model_id = args.model_id.clone();
+        let mut path = std::path::Path::new(&args.model_id).to_path_buf();
+        let filename = if !path.exists() {
+            // Assume it's a hub id
+            let api = Api::new()?;
+            let repo = if let Some(ref revision) = args.revision {
+                api.repo(Repo::with_revision(
+                    model_id,
+                    RepoType::Model,
+                    revision.to_string(),
+                ))
+            } else {
+                api.model(model_id)
+            };
+            repo.get("config.json")?
+        } else {
+            path.push("config.json");
+            path
+        };
+
+        let content = std::fs::read_to_string(filename)?;
+        let config: RawConfig = serde_json::from_str(&content)?;
+        let config: Config = config.into();
+
+        // Quantization usually means you're even more RAM constrained.
+        let max_default = 4096;
+
+        if let Some(max_position_embeddings) = config.max_position_embeddings {
+            if max_position_embeddings > max_default {
+                let max = max_position_embeddings;
+                if args.max_input_tokens.is_none()
+                    && args.max_total_tokens.is_none()
+                    && args.max_batch_prefill_tokens.is_none()
+                {
+                    tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
+                }
+                Ok(max_default)
+            } else {
+                Ok(max_position_embeddings)
+            }
+        } else {
+            Err(Box::new(LauncherError::ArgumentValidation(
+                "no max defined".to_string(),
+            )))
+        }
+    };
+    let max_position_embeddings: usize = get_max_position_embeddings().unwrap_or(4096);
+
+    // Accept `max_input_tokens` or the deprecated `max_input_length`, but never both.
+    let max_input_tokens = match (args.max_input_tokens, args.max_input_length) {
+        (Some(max_input_tokens), Some(max_input_length)) => {
+            return Err(LauncherError::ArgumentValidation(
+                format!("Both `max_input_tokens` ({max_input_tokens}) and `max_input_length` ({max_input_length}) are set. Please define only `max_input_tokens` as `max_input_length` is deprecated for naming consistency.",
+            )));
+        }
+        (Some(max_input_tokens), None) | (None, Some(max_input_tokens)) => max_input_tokens,
+        (None, None) => {
+            // Saturate so a `max_position_embeddings` of 0 cannot underflow.
+            let value = max_position_embeddings.saturating_sub(1);
+            tracing::info!("Default `max_input_tokens` to {value}");
+            value
+        }
+    };
+    let max_total_tokens = {
+        match args.max_total_tokens {
+            Some(max_total_tokens) => max_total_tokens,
+            None => {
+                let value = max_position_embeddings;
+                tracing::info!("Default `max_total_tokens` to {value}");
+                value
+            }
+        }
+    };
+    let max_batch_prefill_tokens = {
+        match args.max_batch_prefill_tokens {
+            Some(max_batch_prefill_tokens) => max_batch_prefill_tokens,
+            None => {
+                let value: u32 = if let Some(max_batch_size) = args.max_batch_size {
+                    max_batch_size * max_input_tokens
+                } else {
+                    // Adding some margin in order to account for potential block_size alignment
+                    // issues.
+                    max_input_tokens + 50
+                } as u32;
+                tracing::info!("Default `max_batch_prefill_tokens` to {value}");
+                value
+            }
+        }
+    };
 
     // Validate args
-    if args.max_input_length >= args.max_total_tokens {
+    if max_input_tokens >= max_total_tokens {
         return Err(LauncherError::ArgumentValidation(
-            "`max_input_length` must be < `max_total_tokens`".to_string(),
+            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
         ));
     }
-    if args.max_input_length as u32 > args.max_batch_prefill_tokens {
+    if max_input_tokens as u32 > max_batch_prefill_tokens {
         return Err(LauncherError::ArgumentValidation(format!(
-            "`max_batch_prefill_tokens` must be >= `max_input_length`. Given: {} and {}",
-            args.max_batch_prefill_tokens, args.max_input_length
+            "`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {} and {}",
+            max_batch_prefill_tokens, max_input_tokens
         )));
     }
 
+    let cuda_graphs = match (&args.cuda_graphs, &args.quantize) {
+        (Some(cuda_graphs), _) => cuda_graphs.iter().cloned().filter(|&c| c > 0).collect(),
+        #[allow(deprecated)]
+        (
+            None,
+            Some(
+                Quantization::Bitsandbytes
+                | Quantization::BitsandbytesNF4
+                | Quantization::BitsandbytesFP4,
+            ),
+        ) => {
+            tracing::info!("Bitsandbytes doesn't work with cuda graphs, deactivating them");
+            vec![]
+        }
+        _ => {
+            let cuda_graphs = vec![1, 2, 4, 8, 16, 32];
+            tracing::info!("Using default cuda graphs {cuda_graphs:?}");
+            cuda_graphs
+        }
+    };
+
     if args.validation_workers == 0 {
         return Err(LauncherError::ArgumentValidation(
             "`validation_workers` must be > 0".to_string(),
@@ -1125,20 +1518,25 @@ fn main() -> Result<(), LauncherError> {
 
     let num_shard = find_num_shards(args.sharded, args.num_shard)?;
     if num_shard > 1 {
+        if matches!(args.quantize, Some(Quantization::Exl2)) {
+            return Err(LauncherError::ArgumentValidation(
+                "Sharding is currently not supported with `exl2` quantization".into(),
+            ));
+        }
         tracing::info!("Sharding model on {num_shard} processes");
     }
 
     if let Some(ref max_batch_total_tokens) = args.max_batch_total_tokens {
-        if args.max_batch_prefill_tokens > *max_batch_total_tokens {
+        if max_batch_prefill_tokens > *max_batch_total_tokens {
             return Err(LauncherError::ArgumentValidation(format!(
                 "`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {} and {}",
-                args.max_batch_prefill_tokens, max_batch_total_tokens
+                max_batch_prefill_tokens, max_batch_total_tokens
             )));
         }
-        if args.max_total_tokens as u32 > *max_batch_total_tokens {
+        if max_total_tokens as u32 > *max_batch_total_tokens {
             return Err(LauncherError::ArgumentValidation(format!(
                 "`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {} and {}",
-                args.max_total_tokens, max_batch_total_tokens
+                max_total_tokens, max_batch_total_tokens
             )));
         }
     }
@@ -1166,7 +1564,28 @@ fn main() -> Result<(), LauncherError> {
     .expect("Error setting Ctrl-C handler");
 
     // Download and convert model weights
-    download_convert_model(&args, running.clone())?;
+    download_convert_model(
+        &args.model_id,
+        args.revision.as_deref(),
+        args.trust_remote_code,
+        args.huggingface_hub_cache.as_deref(),
+        args.weights_cache_override.as_deref(),
+        running.clone(),
+    )?;
+
+    // Download and convert lora adapters if any
+    if let Some(lora_adapters) = &args.lora_adapters {
+        for adapter in lora_adapters.split(',') {
+            download_convert_model(
+                adapter,
+                None,
+                args.trust_remote_code,
+                args.huggingface_hub_cache.as_deref(),
+                args.weights_cache_override.as_deref(),
+                running.clone(),
+            )?;
+        }
+    }
 
     if !running.load(Ordering::SeqCst) {
         // Launcher was asked to stop
@@ -1185,6 +1604,10 @@ fn main() -> Result<(), LauncherError> {
     spawn_shards(
         num_shard,
         &args,
+        cuda_graphs,
+        max_total_tokens,
+        max_input_tokens,
+        max_log_level,
         shutdown.clone(),
         &shutdown_receiver,
         shutdown_sender,
@@ -1199,11 +1622,19 @@ fn main() -> Result<(), LauncherError> {
         return Ok(());
     }
 
-    let mut webserver =
-        spawn_webserver(args, shutdown.clone(), &shutdown_receiver).map_err(|err| {
-            shutdown_shards(shutdown.clone(), &shutdown_receiver);
-            err
-        })?;
+    let mut webserver = spawn_webserver(
+        num_shard,
+        args,
+        max_input_tokens,
+        max_total_tokens,
+        max_batch_prefill_tokens,
+        shutdown.clone(),
+        &shutdown_receiver,
+    )
+    .map_err(|err| {
+        shutdown_shards(shutdown.clone(), &shutdown_receiver);
+        err
+    })?;
 
     // Default exit code
     let mut exit_code = Ok(());
diff --git a/load_tests/Makefile b/load_tests/Makefile
new file mode 100644
index 00000000..9199aa3b
--- /dev/null
+++ b/load_tests/Makefile
@@ -0,0 +1,9 @@
+
+ShareGPT_V3_unfiltered_cleaned_split.json:
+	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
+	python filter.py
+
+prepare_orca:
+	python orca.py
diff --git a/load_tests/common.js b/load_tests/common.js
index be812e9b..e0a10595 100644
--- a/load_tests/common.js
+++ b/load_tests/common.js
@@ -1,64 +1,94 @@
-import { check, randomSeed } from 'k6';
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
 import http from 'k6/http';
 import { Trend, Counter } from 'k6/metrics';
-import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
 
-const seed = 0;
-
-const host = __ENV.HOST || '127.0.0.1:8000';
+const host = __ENV.HOST;
+const model_id = __ENV.MODEL_ID;
 const timePerToken = new Trend('time_per_token', true);
-const throughput = new Counter('tokens_per_s');
+const tokens = new Counter('tokens');
+const new_tokens = new Counter('new_tokens');
+const input_tokens = new Counter('input_tokens');
+const max_new_tokens = 50;
 
-randomSeed(seed);
 // const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
 const shareGPT = JSON.parse(open("small.json"))
 
 
-export function get_options(reference_latency_ms){
+export function get_options() {
     return {
         thresholds: {
             http_req_failed: ['rate==0'],
-            time_per_token: [{
-                threshold: `p(50)<${3 * reference_latency_ms}`,
-                abortOnFail: true,
-                delayAbortEval: '10s'
-            }],
+            // time_per_token: [{
+            //     threshold: `p(50)<${5 * reference_latency_ms}`,
+            //     abortOnFail: true,
+            //     delayAbortEval: '10s'
+            // }],
         },
         scenarios: {
+            // single_user: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 1,
+            //     rate: 20,
+            //     timeUnit: '1s',
+            // },
             load_test: {
                 executor: 'constant-arrival-rate',
                 duration: '60s',
                 preAllocatedVUs: 100,
-                rate: 10,
+                rate: 1,
                 timeUnit: '1s',
             },
+            // breakpoint: {
+            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
+            //     preAllocatedVUs: 300,
+            //     stages: [
+            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
+            //     ],
+            // },
+            // throughput: {
+            //     executor: 'shared-iterations',
+            //     vus: 100,
+            //     iterations: 200,
+            //     maxDuration: '40s',
+            // },
         },
     };
 }
 
+// Wrap the first turn of a ShareGPT-style record in a chat request body for `model_id`.
+function generate_payload(gpt, max_new_tokens) {
+    return { "messages": [{ "role": "user", "content": gpt["conversations"][0]["value"] }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
+}
 
-export function run(host, generate_payload, max_new_tokens) {
-    const headers = {'Content-Type': 'application/json'};
-    const query = randomItem(shareGPT);
-    const payload = JSON.stringify(generate_payload(query));
-    const res = http.post(`http://${host}/generate`, payload, {
+export const options = get_options();
+
+export default function run() {
+    const headers = { 'Content-Type': 'application/json' };
+    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
+    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
+    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
         headers,
     });
-    if(res.status >= 400 && res.status < 500){
+    if (res.status >= 400 && res.status < 500) {
         return;
     }
 
+
     check(res, {
-        'Post status is 200': (r) => res.status === 200,
+        'Post status is 200': (res) => res.status === 200,
     });
-    const n_tokens = max_new_tokens;
-    const timings = res.timings.duration;
+    const duration = res.timings.duration;
 
     if (res.status === 200) {
-        const latency_ms_per_token = timings / n_tokens;
+        const body = res.json();
+        const completion_tokens = body.usage.completion_tokens;
+        const latency_ms_per_token = duration / completion_tokens;
         timePerToken.add(latency_ms_per_token);
-        const latency_in_s = latency_ms_per_token / 1000;
-        const individual_throughput = 1 / latency_in_s;
-        throughput.add(individual_throughput);
+        const prompt_tokens = body.usage.prompt_tokens;
+        input_tokens.add(prompt_tokens);
+        new_tokens.add(completion_tokens);
+        tokens.add(completion_tokens + prompt_tokens);
     }
 }
diff --git a/load_tests/filter.py b/load_tests/filter.py
new file mode 100644
index 00000000..a00226ed
--- /dev/null
+++ b/load_tests/filter.py
@@ -0,0 +1,26 @@
+import json
+
+
+def main():
+    """Write up to 2000 ShareGPT conversations whose first turn is human to small.json."""
+    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
+        data = json.load(f)
+
+    limit = 2000
+    conversations = []
+    for conversation in data:
+        conv = conversation.get("conversations")
+        if conv and conv[0]["from"] == "human":
+            # Keep only the opening turn; the rest of the exchange is dropped.
+            conversation["conversations"] = conv[:1]
+            conversations.append(conversation)
+            # Count the collected list, not the keys of the current record.
+            if len(conversations) >= limit:
+                break
+
+    with open("./small.json", "w") as f:
+        json.dump(conversations, f, indent=4)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/load_tests/orca.py b/load_tests/orca.py
new file mode 100644
index 00000000..e607d27c
--- /dev/null
+++ b/load_tests/orca.py
@@ -0,0 +1,27 @@
+import json
+import datasets
+import tqdm
+
+
+def main():
+    """Dump the first 2k OpenOrca questions to ./small.json as one-turn chats."""
+    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
+    limit = min(2000, len(dataset))  # Fewer when the split itself is smaller.
+    conversations = []
+    for item in tqdm.tqdm(dataset, total=limit):
+        conversation = {
+            "conversations": [
+                {"from": "human", "value": item["question"]},
+            ],
+            "id": item["id"],
+        }
+        conversations.append(conversation)
+        if len(conversations) >= limit:
+            break
+
+    with open("./small.json", "w") as f:
+        json.dump(conversations, f, indent=4)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/load_tests/starcoder_load.js b/load_tests/starcoder_load.js
deleted file mode 100644
index 76316b65..00000000
--- a/load_tests/starcoder_load.js
+++ /dev/null
@@ -1,63 +0,0 @@
-import {check} from 'k6';
-import http from 'k6/http';
-import {Trend} from 'k6/metrics';
-
-const host = __ENV.HOST || '127.0.0.1:3000';
-
-const totalTime = new Trend('total_time', true);
-const validationTime = new Trend('validation_time', true);
-const queueTime = new Trend('queue_time', true);
-const inferenceTime = new Trend('inference_time', true);
-const timePerToken = new Trend('time_per_token', true);
-
-const example = {
-    payload: JSON.stringify({
-        inputs: '# This is a fibonacci function written in the Python programming language.' +
-            'def fibonacci',
-        parameters: {
-            details: true,
-            max_new_tokens: 60,
-            temperature: 0.2,
-            top_p: 0.95,
-            seed: 0,
-        },
-    }),
-    generated_tokens: 60
-};
-
-export const options = {
-    thresholds: {
-        http_req_failed: ['rate==0'],
-        time_per_token: ['p(95)<90'],
-        queue_time: ['p(95)<1500'],
-    },
-    scenarios: {
-        load_test: {
-            executor: 'constant-arrival-rate',
-            duration: '60s',
-            preAllocatedVUs: 100,
-            rate: 10,
-            timeUnit: '1s',
-        },
-    },
-};
-
-export default function () {
-    const headers = {'Content-Type': 'application/json'};
-    const res = http.post(`http://${host}/generate`, example.payload, {
-        headers,
-    });
-
-    check(res, {
-        'Post status is 200': (r) => res.status === 200,
-        'Post response generated tokens': (r) => res.status === 200 && res.json().details.generated_tokens === example.generated_tokens,
-    });
-
-    if (res.status === 200) {
-        totalTime.add(res.headers["X-Total-Time"]);
-        validationTime.add(res.headers["X-Validation-Time"]);
-        queueTime.add(res.headers["X-Queue-Time"]);
-        inferenceTime.add(res.headers["X-Inference-Time"]);
-        timePerToken.add(res.headers["X-Time-Per-Token"]);
-    }
-}
\ No newline at end of file
diff --git a/load_tests/tgi.js b/load_tests/tgi.js
deleted file mode 100644
index 93a0e278..00000000
--- a/load_tests/tgi.js
+++ /dev/null
@@ -1,17 +0,0 @@
-import { get_options, run } from "./common.js";
- 
-const reference_latency_ms = 30;
-const host = __ENV.HOST || '127.0.0.1:8000';
-const max_new_tokens = 50;
-
-
-function generate_payload(gpt){
-    const input = gpt["conversations"][0]["value"];
-    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "temperature" : 0.5}}
-}
-
-export const options = get_options(reference_latency_ms);
-
-export default function(){
-    run(host, generate_payload, max_new_tokens);
-}
diff --git a/load_tests/vllm.js b/load_tests/vllm.js
deleted file mode 100644
index fcb38262..00000000
--- a/load_tests/vllm.js
+++ /dev/null
@@ -1,17 +0,0 @@
-import { get_options, run } from "./common.js";
- 
-const reference_latency_ms = 22;
-const host = __ENV.HOST || '127.0.0.1:8000';
-const max_new_tokens = 50;
-
-
-function generate_payload(gpt){
-    const input = gpt["conversations"][0]["value"];
-    return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
-}
-
-export const options = get_options(reference_latency_ms);
-
-export default function(){
-    run(host, generate_payload, max_new_tokens);
-}
diff --git a/proto/generate.proto b/proto/generate.proto
index 3f607dc5..6351e37f 100644
--- a/proto/generate.proto
+++ b/proto/generate.proto
@@ -1,6 +1,6 @@
 syntax = "proto3";
 
-package generate.v1;
+package generate.v2;
 
 service TextGenerationService {
     /// Model Info
@@ -31,6 +31,8 @@ message InfoResponse {
     bool requires_padding = 1;
     string dtype = 2;
     string device_type = 3;
+    optional uint32 window_size = 4;
+    uint32 speculate = 5;
 }
 
 /// Empty request
@@ -49,6 +51,12 @@ message ClearCacheRequest {
 /// Empty response
 message ClearCacheResponse {}
 
+enum GrammarType {
+    GRAMMAR_TYPE_NONE = 0;
+    GRAMMAR_TYPE_JSON = 1;
+    GRAMMAR_TYPE_REGEX = 2;
+}
+
 message NextTokenChooserParameters {
     /// exponential scaling output probability distribution
     float temperature = 1;
@@ -64,8 +72,14 @@ message NextTokenChooserParameters {
     uint64 seed = 6;
     /// repetition penalty
     float repetition_penalty = 7;
+    /// frequency penalty
+    float frequency_penalty = 9;
     /// token watermarking using "A Watermark for Large Language Models"
     bool watermark = 8;
+    /// grammar (applied if not empty)
+    string grammar = 10;
+    /// grammar type
+    GrammarType grammar_type = 11;
 }
 
 message StoppingCriteriaParameters {
@@ -134,43 +148,27 @@ message GeneratedText {
     optional uint64 seed = 4;
 }
 
-message PrefillTokens {
-    /// Prefill Token IDs
+message Tokens {
+    /// Token IDs
     repeated uint32 ids = 1;
-    /// Prefill Logprobs
+    /// Logprobs
     repeated float logprobs = 2;
-    /// Prefill tokens
+    /// tokens
     repeated string texts = 3;
-}
-
-message TopTokens {
-    /// Top Token IDs
-    repeated uint32 ids = 1;
-    /// Top Logprobs
-    repeated float logprobs = 2;
-    /// Top Token Texts
-    repeated string texts = 3;
-    /// If the tokens are special
-    repeated bool is_special = 6;
+    /// special
+    repeated bool is_special = 4;
 }
 
 message Generation {
     /// Request ID
     uint64 request_id = 1;
     /// Prefill tokens (optional)
-    PrefillTokens prefill_tokens = 2;
-    /// Token ID
-    uint32 token_id = 3;
-    /// Logprob
-    float token_logprob = 4;
-    /// Text
-    string token_text = 5;
-    /// Is it a special token
-    bool token_is_special = 6;
+    Tokens prefill_tokens = 2;
+    Tokens tokens = 3;
     /// Complete generated text
-    optional GeneratedText generated_text = 7;
+    optional GeneratedText generated_text = 4;
     /// Top tokens
-    TopTokens top_tokens = 8;
+    repeated Tokens top_tokens = 5;
 }
 
 message FilterBatchRequest {
@@ -196,6 +194,12 @@ message PrefillResponse {
     repeated Generation generations = 1;
     /// Next batch (cached)
     optional CachedBatch batch = 2;
+    /// Forward elapsed time in nanoseconds
+    uint64 forward_ns = 3;
+    /// Decode elapsed time in nanoseconds
+    uint64 decode_ns = 4;
+    /// Total elapsed time in nanoseconds
+    uint64 total_ns = 5;
 }
 
 message DecodeRequest {
@@ -208,14 +212,24 @@ message DecodeResponse {
     repeated Generation generations = 1;
     /// Next batch (cached)
     optional CachedBatch batch = 2;
+    /// Forward elapsed time in nanoseconds
+    uint64 forward_ns = 3;
+    /// Decode elapsed time in nanoseconds
+    uint64 decode_ns = 4;
+    /// Total elapsed time in nanoseconds
+    uint64 total_ns = 5;
+    /// Concatenate elapsed time in nanoseconds
+    optional uint64 concat_ns = 6;
 }
 
 message WarmupRequest {
     /// Batch to warmup on
     Batch batch = 1;
+    uint32 max_input_length = 2;
+    uint32 max_prefill_tokens = 3;
+    uint32 max_total_tokens = 4;
 }
 
-/// Empty response
 message WarmupResponse {
     /// Maximum number of tokens supported by the model
     optional uint32 max_supported_total_tokens = 1;
diff --git a/proto/v3/generate.proto b/proto/v3/generate.proto
new file mode 100644
index 00000000..926c878e
--- /dev/null
+++ b/proto/v3/generate.proto
@@ -0,0 +1,267 @@
+syntax = "proto3";
+
+package generate.v3;
+
+service TextGenerationService {
+    /// Model Info
+    rpc Info (InfoRequest) returns (InfoResponse) {}
+    /// Service discovery
+    rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}
+    /// Empties batch cache
+    rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
+    /// Remove requests from a cached batch
+    rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
+    /// Warmup the model and compute max cache size
+    rpc Warmup (WarmupRequest) returns (WarmupResponse);
+    /// Prefill batch and decode first token
+    rpc Prefill (PrefillRequest) returns (PrefillResponse);
+    /// Decode token for a list of prefilled batches
+    rpc Decode (DecodeRequest) returns (DecodeResponse);
+    /// Health check
+    rpc Health (HealthRequest) returns (HealthResponse);
+}
+
+message HealthRequest {}
+message HealthResponse {}
+
+/// Empty request
+message InfoRequest {}
+
+message InfoResponse {
+    bool requires_padding = 1;
+    string dtype = 2;
+    string device_type = 3;
+    optional uint32 window_size = 4;
+    uint32 speculate = 5;
+}
+
+/// Empty request
+message ServiceDiscoveryRequest {}
+
+message ServiceDiscoveryResponse {
+    /// Other shards urls
+    repeated string urls = 1;
+}
+
+message ClearCacheRequest {
+    /// Optional batch id
+    optional uint64 id = 1;
+}
+
+/// Empty response
+message ClearCacheResponse {}
+
+message Image {
+    /// Binary image data.
+    bytes data = 1;
+
+    /// Image MIME type.
+    string mimetype = 2;
+}
+
+message InputChunk {
+    oneof chunk {
+        /// Plain text data
+        string text = 1;
+        /// Image data
+        Image image = 2;
+    }
+}
+
+message Input {
+    repeated InputChunk chunks = 1; // text and/or image chunks, see InputChunk
+}
+
+enum GrammarType {
+    GRAMMAR_TYPE_NONE = 0;
+    GRAMMAR_TYPE_JSON = 1;
+    GRAMMAR_TYPE_REGEX = 2;
+}
+
+message NextTokenChooserParameters {
+    /// exponential scaling output probability distribution
+    float temperature = 1;
+    /// restricting to the k highest probability elements
+    uint32 top_k = 2;
+    /// nucleus sampling: restricting to top tokens summing to a cumulative probability of top_p
+    float top_p = 3;
+    /// typical decoding probability mass
+    float typical_p = 4;
+    /// apply sampling on the logits
+    bool do_sample = 5;
+    /// random seed for sampling
+    uint64 seed = 6;
+    /// repetition penalty
+    float repetition_penalty = 7;
+    /// frequency penalty
+    float frequency_penalty = 9;
+    /// token watermarking using "A Watermark for Large Language Models"
+    bool watermark = 8;
+    /// grammar (applied if not empty)
+    string grammar = 10;
+    /// grammar type
+    GrammarType grammar_type = 11;
+}
+
+message StoppingCriteriaParameters {
+    /// Maximum number of generated tokens
+    uint32 max_new_tokens = 1;
+    /// Optional stopping sequences
+    repeated string stop_sequences = 2;
+    /// Ignore end of sequence token
+    /// used for benchmarking
+    bool ignore_eos_token = 3;
+}
+
+message Request {
+    /// Request ID
+    uint64 id = 1;
+    /// The generation context as chunks
+    Input input_chunks = 8;
+    /// The generation context, stringified input_chunks
+    string inputs = 2;
+    /// Context truncation
+    uint32 truncate = 3;
+    /// Next Token Chooser Parameters
+    NextTokenChooserParameters parameters = 4;
+    /// Stopping Criteria Parameters
+    StoppingCriteriaParameters stopping_parameters = 5;
+    /// Return prefill logprobs
+    bool prefill_logprobs = 6;
+    /// Return most likely n tokens
+    uint32 top_n_tokens = 7;
+    /// Paged attention blocks
+    repeated uint32 blocks = 9;
+    /// Paged attention slots
+    repeated uint32  slots = 10;
+    /// LoRA adapter id (optional string identifier)
+    optional string adapter_id = 11;
+}
+
+message Batch {
+    /// Batch ID
+    uint64 id = 1;
+    /// Individual requests
+    repeated Request requests = 2;
+    /// Batch size (==len(requests))
+    uint32 size = 3;
+    /// Maximum number of tokens this batch will grow to
+    uint32 max_tokens = 4;
+    /// Maximum number of Paged Attention blocks
+    uint32 max_blocks = 5;
+}
+
+message CachedBatch {
+    /// Batch ID
+    uint64 id = 1;
+    /// Individual requests ids
+    repeated uint64 request_ids = 2;
+    /// Batch size (==len(requests))
+    uint32 size = 3;
+    /// Maximum number of tokens this batch will grow to
+    uint32 max_tokens = 4;
+}
+
+enum FinishReason {
+    FINISH_REASON_LENGTH = 0;
+    FINISH_REASON_EOS_TOKEN = 1;
+    FINISH_REASON_STOP_SEQUENCE = 2;
+}
+
+message GeneratedText {
+    /// Output
+    string text = 1;
+    /// Number of generated tokens
+    uint32 generated_tokens = 2;
+    /// Finish reason
+    FinishReason finish_reason = 3;
+    /// Seed
+    optional uint64 seed = 4;
+}
+
+message Tokens {
+    /// Token IDs
+    repeated uint32 ids = 1;
+    /// Logprobs
+    repeated float logprobs = 2;
+    /// tokens
+    repeated string texts = 3;
+    /// special
+    repeated bool is_special = 4;
+}
+
+message Generation {
+    /// Request ID
+    uint64 request_id = 1;
+    /// Prefill tokens (optional)
+    Tokens prefill_tokens = 2;
+    Tokens tokens = 3;
+    /// Complete generated text
+    optional GeneratedText generated_text = 4;
+    /// Top tokens
+    repeated Tokens top_tokens = 5;
+}
+
+message FilterBatchRequest {
+    /// Batch ID
+    uint64 batch_id = 1;
+    /// Requests to keep
+    repeated uint64 request_ids = 2;
+}
+
+message FilterBatchResponse {
+    /// Filtered Batch (cached)
+    CachedBatch batch = 1;
+}
+
+
+message PrefillRequest {
+    /// Batch
+    Batch batch = 1;
+}
+
+message PrefillResponse {
+    /// Generation
+    repeated Generation generations = 1;
+    /// Next batch (cached)
+    optional CachedBatch batch = 2;
+    /// Forward elapsed time in nanoseconds
+    uint64 forward_ns = 3;
+    /// Decode elapsed time in nanoseconds
+    uint64 decode_ns = 4;
+    /// Total elapsed time in nanoseconds
+    uint64 total_ns = 5;
+}
+
+message DecodeRequest {
+    /// Cached batches
+    repeated CachedBatch batches = 1;
+}
+
+message DecodeResponse {
+    /// Decodes
+    repeated Generation generations = 1;
+    /// Next batch (cached)
+    optional CachedBatch batch = 2;
+    /// Forward elapsed time in nanoseconds
+    uint64 forward_ns = 3;
+    /// Decode elapsed time in nanoseconds
+    uint64 decode_ns = 4;
+    /// Total elapsed time in nanoseconds
+    uint64 total_ns = 5;
+    /// Concatenate elapsed time in nanoseconds
+    optional uint64 concat_ns = 6;
+}
+
+message WarmupRequest {
+    /// Batch to warmup on
+    Batch batch = 1;
+    uint32 max_input_length = 2;
+    uint32 max_prefill_tokens = 3;
+    uint32 max_total_tokens = 4;
+}
+
+message WarmupResponse {
+    /// Maximum number of tokens supported by the model
+    optional uint32 max_supported_total_tokens = 1;
+}
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 10396826..5855ac86 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -15,36 +15,49 @@ name = "text-generation-router"
 path = "src/main.rs"
 
 [dependencies]
-async-stream = "0.3.3"
-axum = { version = "0.6.4", features = ["json"] }
-axum-tracing-opentelemetry = "0.10.0"
+async-stream = "0.3.5"
+axum = { version = "0.7", features = ["json"] }
+axum-tracing-opentelemetry = "0.16"
 text-generation-client = { path = "client" }
-clap = { version = "4.1.4", features = ["derive", "env"] }
-flume = "0.10.14"
-futures = "0.3.26"
-metrics = "0.21.0"
-metrics-exporter-prometheus = { version = "0.12.1", features = [] }
+clap = { version = "4.4.5", features = ["derive", "env"] }
+futures = "0.3.28"
+hf-hub = { workspace = true }
+itertools = "0.10"
+jsonschema = { version = "0.17.1", features = ["draft202012"] }
+metrics = "0.21.1"
+metrics-exporter-prometheus = { version = "0.15.1", features = [] }
 nohash-hasher = "0.2.0"
-opentelemetry = { version = "0.19.0", features = ["rt-tokio"] }
-opentelemetry-otlp = "0.12.0"
+opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
+opentelemetry-otlp = "0.13.0"
 rand = "0.8.5"
-reqwest = { version = "0.11.14", features = [] }
-serde = "1.0.152"
-serde_json = "1.0.93"
-thiserror = "1.0.38"
-tokenizers = "0.13.3"
-tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
-tower-http = { version = "0.4.0", features = ["cors"] }
-tracing = "0.1.37"
-tracing-opentelemetry = "0.19.0"
-tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
-utoipa = { version = "3.0.1", features = ["axum_extras"] }
-utoipa-swagger-ui = { version = "3.0.2", features = ["axum"] }
-ngrok = { version = "0.12.3", features = ["axum"], optional = true }
+reqwest = { version = "0.11.20", features = [] }
+serde = "1.0.188"
+serde_json = "1.0.107"
+thiserror = "1.0.48"
+tokenizers = { workspace = true}
+tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tokio-stream = "0.1.14"
+tower-http = { version = "0.5.1", features = ["cors"] }
+tracing = "0.1.40"
+tracing-opentelemetry = "0.21.0"
+tracing-subscriber = { version = "0.3.18", features = ["json", "env-filter"] }
+utoipa = { version = "4.2.0", features = ["axum_extras"] }
+utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] }
+ngrok = { version = "0.13.1", features = ["axum"], optional = true }
+init-tracing-opentelemetry = { version = "0.14.1", features = ["opentelemetry-otlp"] }
+minijinja = { version = "2.0.2" }
+minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
+futures-util = "0.3.30"
+regex = "1.10.3"
+once_cell = "1.19.0"
+image = "0.25.1"
+base64 = { workspace = true }
 
 [build-dependencies]
-vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
+vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] }
 
 [features]
 default = ["ngrok"]
-ngrok = ["dep:ngrok"]
\ No newline at end of file
+ngrok = ["dep:ngrok"]
+google = []
+kserve = []
diff --git a/router/README.md b/router/README.md
index c18d4f9e..5b1f9e36 100644
--- a/router/README.md
+++ b/router/README.md
@@ -28,7 +28,7 @@ this is controlled by the client, and therefore the amount of batching is decide
 beforehand.
 
 For text-generation, and LLMs which are memory bound we can try to be much more
-efficient with the available compute, by having client sending us single queries, 
+efficient with the available compute, by having client sending us single queries,
 and let the router mix&match queries into or out of batches to make the use the
 compute the most efficiently. This is possible because for LLMs the total compute
 for running the model is much bigger than doing mix&match of the batches themselves.
@@ -89,5 +89,5 @@ most critical perceived quality of an LLM API.
 With token streaming, the server can start answering after the first `prefill` pass
 directly, without waiting for all the generation to be done. For extremely long queries
 this means clients can start to see something happening orders of magnitude before
-the work is done. Seeing something in progress allows them to cut short if it's not 
+the work is done. Seeing something in progress allows them to cut short if it's not
 what's wanted but also it "feels" better.
diff --git a/router/client/Cargo.toml b/router/client/Cargo.toml
index 43f444e6..db423c4b 100644
--- a/router/client/Cargo.toml
+++ b/router/client/Cargo.toml
@@ -6,15 +6,17 @@ authors.workspace = true
 homepage.workspace = true
 
 [dependencies]
+async-trait = "^0.1"
+base64 = { workspace = true }
 futures = "^0.3"
 grpc-metadata = { path = "../grpc-metadata" }
-prost = "^0.11"
+prost = "^0.12"
 thiserror = "^1.0"
-tokio = { version = "^1.25", features = ["sync"] }
-tonic = "^0.9"
+tokio = { version = "^1.32", features = ["sync"] }
+tonic = "^0.10"
 tower = "^0.4"
 tracing = "^0.1"
 
 [build-dependencies]
-tonic-build = "0.9.2"
-prost-build = "0.11.6"
+tonic-build = "0.10.1"
+prost-build = "0.12.1"
diff --git a/router/client/build.rs b/router/client/build.rs
index 497be545..210cd603 100644
--- a/router/client/build.rs
+++ b/router/client/build.rs
@@ -1,18 +1,34 @@
 use std::fs;
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    println!("cargo:rerun-if-changed=../../proto/generate.proto");
-    fs::create_dir("src/pb").unwrap_or(());
+    println!("cargo:rerun-if-changed=../../proto/");
 
+    fs::create_dir_all("src/v2/pb").unwrap_or(());
     let mut config = prost_build::Config::new();
     config.protoc_arg("--experimental_allow_proto3_optional");
 
     tonic_build::configure()
         .build_client(true)
         .build_server(false)
-        .out_dir("src/pb")
+        .out_dir("src/v2/pb")
         .include_file("mod.rs")
         .compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
+        .map_err(|e| match e.kind(){
+            std::io::ErrorKind::NotFound => {panic!("`protoc` not found, install libprotoc")},
+            std::io::ErrorKind::Other => {panic!("`protoc` version unsupported, upgrade protoc: https://github.com/protocolbuffers/protobuf/releases")},
+            e => {e}
+        }).unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
+
+    fs::create_dir_all("src/v3/pb").unwrap_or(());
+    let mut config = prost_build::Config::new();
+    config.protoc_arg("--experimental_allow_proto3_optional");
+
+    tonic_build::configure()
+        .build_client(true)
+        .build_server(false)
+        .out_dir("src/v3/pb")
+        .include_file("mod.rs")
+        .compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
         .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
 
     Ok(())
diff --git a/router/client/src/lib.rs b/router/client/src/lib.rs
index f334be21..45bee10c 100644
--- a/router/client/src/lib.rs
+++ b/router/client/src/lib.rs
@@ -1,22 +1,35 @@
 //! Text Generation gRPC client library
 
-mod client;
-#[allow(clippy::derive_partial_eq_without_eq)]
-mod pb;
-mod sharded_client;
+pub mod v2;
+pub mod v3;
 
-pub use client::Client;
-pub use pb::generate::v1::HealthResponse;
-pub use pb::generate::v1::InfoResponse as ShardInfo;
-pub use pb::generate::v1::{
-    Batch, CachedBatch, FinishReason, GeneratedText, Generation, NextTokenChooserParameters,
-    PrefillTokens, Request, StoppingCriteriaParameters,
-};
-pub use sharded_client::ShardedClient;
+use async_trait::async_trait;
+use base64::{engine::general_purpose::STANDARD, Engine};
 use thiserror::Error;
 use tonic::transport;
 use tonic::Status;
 
+pub use v3::{Chunk, Image, Input, InputChunk};
+
+#[async_trait]
+pub trait Health {
+    /// Check if a generate server is healthy by asking it to allocate a tensor on device
+    async fn device_health(&self) -> Result<()>;
+
+    /// Check if a generate server is healthy by doing a forward pass.
+    /// EXPENSIVE
+    async fn model_health(&self) -> Result<()>;
+}
+
+#[derive(Debug)]
+pub struct ShardInfo {
+    pub requires_padding: bool,
+    pub dtype: String,
+    pub device_type: String,
+    pub window_size: Option<u32>,
+    pub speculate: u32,
+}
+
 #[derive(Error, Debug, Clone)]
 pub enum ClientError {
     #[error("Could not connect to Text Generation server: {0}")]
@@ -43,4 +56,36 @@ impl From<transport::Error> for ClientError {
     }
 }
 
+// Small convenience re-wrapping of `Chunk`.
+impl From<Chunk> for InputChunk {
+    fn from(chunk: Chunk) -> Self {
+        InputChunk { chunk: Some(chunk) }
+    }
+}
+
+/// Convert input chunks to a stringly-typed input for backwards
+/// compat for backends that haven't implemented chunked inputs.
+pub trait ChunksToString {
+    /// Convert chunks to string.
+    fn chunks_to_string(&self) -> String;
+}
+
+impl ChunksToString for Vec<InputChunk> {
+    /// Concatenates text chunks verbatim; images become markdown base64 data URIs.
+    fn chunks_to_string(&self) -> String {
+        self.iter()
+            .map(|input_chunk| match &input_chunk.chunk {
+                Some(Chunk::Text(text)) => text.clone(),
+                Some(Chunk::Image(Image { data, mimetype })) => {
+                    format!("![](data:{};base64,{})", mimetype, STANDARD.encode(data))
+                }
+                // We don't create empty chunks, so this should be unreachable.
+                None => unreachable!("Chunks should never be empty"),
+            })
+            .collect()
+    }
+}
+
+static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
+
 pub type Result<T> = std::result::Result<T, ClientError>;
diff --git a/router/client/src/pb/.gitignore b/router/client/src/pb/.gitignore
deleted file mode 100644
index b46a4c42..00000000
--- a/router/client/src/pb/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.rs
\ No newline at end of file
diff --git a/router/client/src/client.rs b/router/client/src/v2/client.rs
similarity index 64%
rename from router/client/src/client.rs
rename to router/client/src/v2/client.rs
index d427d3a4..9a2e6ac7 100644
--- a/router/client/src/client.rs
+++ b/router/client/src/v2/client.rs
@@ -1,9 +1,13 @@
 /// Single shard Client
-use crate::pb::generate::v1::text_generation_service_client::TextGenerationServiceClient;
-use crate::pb::generate::v1::*;
-use crate::Result;
+use crate::v2::pb;
+use crate::{ClientError, Result};
+
+use crate::WARMUP_IMAGE_BASE64;
 use grpc_metadata::InjectTelemetryContext;
+use pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
+use pb::generate::v2::*;
 use std::cmp::min;
+use std::time::Duration;
 use tonic::transport::{Channel, Uri};
 use tracing::instrument;
 
@@ -41,7 +45,9 @@ impl Client {
     #[instrument(skip(self))]
     pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
         let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
-        let response = self.stub.service_discovery(request).await?;
+        let response = self.stub.service_discovery(request).await.map_err(|_| {
+            ClientError::Connection("Server does not support v2 interface".to_string())
+        })?;
         let urls = response
             .into_inner()
             .urls
@@ -103,17 +109,30 @@ impl Client {
         &mut self,
         max_input_length: u32,
         max_prefill_tokens: u32,
+        max_total_tokens: u32,
+        max_batch_size: Option<usize>,
     ) -> Result<Option<u32>> {
         let mut n_tokens = 0;
         let mut requests = Vec::new();
-
         // Create requests
         while n_tokens < max_prefill_tokens {
+            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
+
+            let mut inputs = String::new();
+            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
+            if n_tokens == 0 {
+                // 1 request is enough to test vision heads.
+                // Sending images on other queries messes up easily with truncation.
+                inputs.push_str(&format!(
+                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
+                ));
+            }
+
             requests.push(Request {
                 id: 0,
+                inputs,
                 // We truncate the input on the server side to be sure that it has the correct size
-                inputs: "_test ".to_string().repeat(max_input_length as usize),
-                truncate: min(max_input_length, max_prefill_tokens - n_tokens),
+                truncate,
                 // Set sampling parameters to also take these ops into account in the max memory
                 parameters: Some(NextTokenChooserParameters {
                     temperature: 0.9,
@@ -123,17 +142,25 @@ impl Client {
                     do_sample: false,
                     seed: 0,
                     repetition_penalty: 1.2,
+                    frequency_penalty: 0.1,
                     watermark: true,
+                    grammar: String::new(),
+                    grammar_type: GrammarType::None as i32,
                 }),
                 stopping_parameters: Some(StoppingCriteriaParameters {
-                    max_new_tokens: 2,
+                    max_new_tokens: max_total_tokens - truncate,
                     stop_sequences: vec![],
-                    ignore_eos_token: false,
+                    ignore_eos_token: true,
                 }),
                 prefill_logprobs: true,
                 top_n_tokens: 20,
             });
             n_tokens += max_input_length;
+
+            // Check max_batch_size
+            if Some(requests.len()) == max_batch_size {
+                break;
+            }
         }
 
         let batch = Batch {
@@ -143,7 +170,13 @@ impl Client {
             max_tokens: 0,
         };
 
-        let request = tonic::Request::new(WarmupRequest { batch: Some(batch) }).inject_context();
+        let request = tonic::Request::new(WarmupRequest {
+            batch: Some(batch),
+            max_input_length,
+            max_prefill_tokens,
+            max_total_tokens,
+        })
+        .inject_context();
         let response = self.stub.warmup(request).await?.into_inner();
         Ok(response.max_supported_total_tokens)
     }
@@ -156,10 +189,14 @@ impl Client {
     pub async fn prefill(
         &mut self,
         batch: Batch,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
         let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
         let response = self.stub.prefill(request).await?.into_inner();
-        Ok((response.generations, response.batch))
+        Ok((
+            response.generations,
+            response.batch,
+            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
+        ))
     }
 
     /// Generate one token for each request in the given cached batches
@@ -170,9 +207,52 @@ impl Client {
     pub async fn decode(
         &mut self,
         batches: Vec<CachedBatch>,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
         let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
         let response = self.stub.decode(request).await?.into_inner();
-        Ok((response.generations, response.batch))
+        Ok((
+            response.generations,
+            response.batch,
+            DecodeTimings::new(
+                response.concat_ns,
+                response.forward_ns,
+                response.decode_ns,
+                response.total_ns,
+            ),
+        ))
+    }
+}
+
+pub struct PrefillTimings {
+    pub forward: Duration,
+    pub decode: Duration,
+    pub total: Duration,
+}
+
+impl PrefillTimings {
+    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        Self {
+            forward: Duration::from_nanos(forward_ns),
+            decode: Duration::from_nanos(decode_ns),
+            total: Duration::from_nanos(total_ns),
+        }
+    }
+}
+
+pub struct DecodeTimings {
+    pub concat: Option<Duration>,
+    pub forward: Duration,
+    pub decode: Duration,
+    pub total: Duration,
+}
+
+impl DecodeTimings {
+    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        Self {
+            concat: concat_ns.map(Duration::from_nanos),
+            forward: Duration::from_nanos(forward_ns),
+            decode: Duration::from_nanos(decode_ns),
+            total: Duration::from_nanos(total_ns),
+        }
     }
 }
diff --git a/router/client/src/v2/mod.rs b/router/client/src/v2/mod.rs
new file mode 100644
index 00000000..6b14b9f3
--- /dev/null
+++ b/router/client/src/v2/mod.rs
@@ -0,0 +1,13 @@
+#[allow(clippy::derive_partial_eq_without_eq)]
+mod pb;
+
+mod client;
+mod sharded_client;
+
+pub use client::Client;
+pub use pb::generate::v2::HealthResponse;
+pub use pb::generate::v2::{
+    Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType, InfoResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters, Tokens,
+};
+pub use sharded_client::ShardedClient;
diff --git a/router/client/src/v2/pb/.gitignore b/router/client/src/v2/pb/.gitignore
new file mode 100644
index 00000000..72e8ffc0
--- /dev/null
+++ b/router/client/src/v2/pb/.gitignore
@@ -0,0 +1 @@
+*
diff --git a/router/client/src/sharded_client.rs b/router/client/src/v2/sharded_client.rs
similarity index 53%
rename from router/client/src/sharded_client.rs
rename to router/client/src/v2/sharded_client.rs
index 112b0035..7b24aec3 100644
--- a/router/client/src/sharded_client.rs
+++ b/router/client/src/v2/sharded_client.rs
@@ -1,9 +1,17 @@
 /// Multi shard Client
-use crate::{Batch, CachedBatch, Client, Generation, HealthResponse, ShardInfo};
+use crate::{v2, Health, ShardInfo};
 use crate::{ClientError, Result};
+
+use crate::v2::InfoResponse;
+use async_trait::async_trait;
 use futures::future::join_all;
 use tonic::transport::Uri;
 use tracing::instrument;
+use v2::client::{DecodeTimings, PrefillTimings};
+use v2::{
+    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
 
 #[derive(Debug, Clone)]
 /// Text Generation Inference gRPC multi client
@@ -46,7 +54,7 @@ impl ShardedClient {
             .iter_mut()
             .map(|client| client.info())
             .collect();
-        join_all(futures).await.pop().unwrap()
+        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
     }
 
     /// GRPC health check
@@ -95,11 +103,20 @@ impl ShardedClient {
         &mut self,
         max_input_length: u32,
         max_prefill_tokens: u32,
+        max_total_tokens: u32,
+        max_batch_size: Option<usize>,
     ) -> Result<Option<u32>> {
         let futures: Vec<_> = self
             .clients
             .iter_mut()
-            .map(|client| Box::pin(client.warmup(max_input_length, max_prefill_tokens)))
+            .map(|client| {
+                Box::pin(client.warmup(
+                    max_input_length,
+                    max_prefill_tokens,
+                    max_total_tokens,
+                    max_batch_size,
+                ))
+            })
             .collect();
         // Take the minimum value
         let results = join_all(futures)
@@ -113,49 +130,122 @@ impl ShardedClient {
     ///
     /// Returns Generation for each request in batch
     /// and the next cached batch
-    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
+    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
     pub async fn prefill(
         &mut self,
         batch: Batch,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
         let futures: Vec<_> = self
             .clients
             .iter_mut()
             .map(|client| Box::pin(client.prefill(batch.clone())))
             .collect();
-        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>)>> =
+        #[allow(clippy::type_complexity)]
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
             join_all(futures).await.into_iter().collect();
-        merge_generations(results?)
+        let mut results = results?;
+
+        let (mut generations, next_batch, mut timings) =
+            results.pop().ok_or(ClientError::EmptyResults)?;
+
+        // Merge generations from different model shards
+        for (mut shard_generations, _, shard_timings) in results.into_iter() {
+            generations.append(&mut shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
     }
 
     /// Generate one token for each request in the given cached batches
     ///
     /// Returns Generation for each request in batches
     /// and the next cached batch
-    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
+    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
     pub async fn decode(
         &mut self,
         batches: Vec<CachedBatch>,
-    ) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
         let futures: Vec<_> = self
             .clients
             .iter_mut()
             .map(|client| Box::pin(client.decode(batches.clone())))
             .collect();
-        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>)>> =
+        #[allow(clippy::type_complexity)]
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
             join_all(futures).await.into_iter().collect();
-        merge_generations(results?)
+        let mut results = results?;
+
+        let (mut generations, next_batch, mut timings) =
+            results.pop().ok_or(ClientError::EmptyResults)?;
+
+        // Merge generations from different model shards
+        for (mut shard_generations, _, shard_timings) in results.into_iter() {
+            generations.append(&mut shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
     }
 }
 
-/// Merge generations from the different model shards
-fn merge_generations(
-    mut results: Vec<(Vec<Generation>, Option<CachedBatch>)>,
-) -> Result<(Vec<Generation>, Option<CachedBatch>)> {
-    let (mut generations, next_batch) = results.pop().ok_or(ClientError::EmptyResults)?;
-
-    for (mut shard_generations, _) in results.into_iter() {
-        generations.append(&mut shard_generations);
+impl From<InfoResponse> for ShardInfo {
+    fn from(value: InfoResponse) -> Self {
+        Self {
+            requires_padding: value.requires_padding,
+            dtype: value.dtype,
+            device_type: value.device_type,
+            window_size: value.window_size,
+            speculate: value.speculate,
+        }
+    }
+}
+
+#[async_trait]
+impl Health for ShardedClient {
+    async fn device_health(&self) -> Result<()> {
+        self.clone().health().await?;
+        Ok(())
+    }
+
+    async fn model_health(&self) -> Result<()> {
+        // Dummy batch of 1 token and 1 generated token
+        let liveness_request = Request {
+            id: u64::MAX,
+            inputs: "liveness".to_string(),
+            truncate: 10,
+            prefill_logprobs: false,
+            parameters: Some(NextTokenChooserParameters {
+                temperature: 1.0,
+                top_k: 0,
+                top_p: 1.0,
+                typical_p: 1.0,
+                do_sample: false,
+                seed: 0,
+                repetition_penalty: 1.0,
+                frequency_penalty: 0.0,
+                watermark: false,
+                grammar: String::new(),
+                grammar_type: GrammarType::None as i32,
+            }),
+            stopping_parameters: Some(StoppingCriteriaParameters {
+                max_new_tokens: 1,
+                stop_sequences: vec![],
+                ignore_eos_token: false,
+            }),
+            top_n_tokens: 0,
+        };
+        let batch = Batch {
+            id: u64::MAX,
+            requests: vec![liveness_request],
+            size: 1,
+            max_tokens: 2,
+        };
+        self.clone().prefill(batch).await?;
+        Ok(())
     }
-    Ok((generations, next_batch))
 }
diff --git a/router/client/src/v3/client.rs b/router/client/src/v3/client.rs
new file mode 100644
index 00000000..a996b14f
--- /dev/null
+++ b/router/client/src/v3/client.rs
@@ -0,0 +1,283 @@
+use crate::v3::{pb, Chunk};
+use crate::{ClientError, Result, WARMUP_IMAGE_BASE64};
+/// Single shard Client
+use base64::engine::general_purpose::STANDARD;
+use base64::Engine;
+use grpc_metadata::InjectTelemetryContext;
+use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
+use pb::generate::v3::*;
+use std::cmp::min;
+use std::time::Duration;
+use tonic::transport::{Channel, Uri};
+use tracing::instrument;
+
+/// Text Generation Inference gRPC client
+#[derive(Debug, Clone)]
+pub struct Client {
+    stub: TextGenerationServiceClient<Channel>,
+}
+
+impl Client {
+    /// Returns a client connected to the given url
+    pub async fn connect(uri: Uri) -> Result<Self> {
+        let channel = Channel::builder(uri).connect().await?;
+
+        Ok(Self {
+            stub: TextGenerationServiceClient::new(channel),
+        })
+    }
+
+    /// Returns a client connected to the unix socket at `path` (the URI below is a placeholder)
+    pub async fn connect_uds(path: String) -> Result<Self> {
+        let channel = Channel::from_shared("http://[::]:50051".to_string())
+            .unwrap()
+            .connect_with_connector(tower::service_fn(move |_: Uri| {
+                tokio::net::UnixStream::connect(path.clone())
+            }))
+            .await?;
+
+        Ok(Self {
+            stub: TextGenerationServiceClient::new(channel),
+        })
+    }
+
+    /// Returns a list of uris or unix sockets of all shards
+    #[instrument(skip(self))]
+    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
+        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
+        let response = self.stub.service_discovery(request).await.map_err(|_| {
+            ClientError::Connection("Server does not support v3 interface".to_string())
+        })?;
+        let urls = response
+            .into_inner()
+            .urls
+            .into_iter()
+            // Remove unix socket prefix
+            .map(|url| match url.strip_prefix("unix://") {
+                None => url,
+                Some(stripped_url) => stripped_url.to_string(),
+            })
+            .collect();
+        Ok(urls)
+    }
+
+    /// Get model info
+    #[instrument(skip(self))]
+    pub async fn info(&mut self) -> Result<InfoResponse> {
+        let request = tonic::Request::new(InfoRequest {}).inject_context();
+        let response = self.stub.info(request).await?.into_inner();
+        Ok(response)
+    }
+
+    /// Get model health
+    #[instrument(skip(self))]
+    pub async fn health(&mut self) -> Result<HealthResponse> {
+        let request = tonic::Request::new(HealthRequest {}).inject_context();
+        let response = self.stub.health(request).await?.into_inner();
+        Ok(response)
+    }
+
+    /// Clear the past generations cache
+    #[instrument(skip(self))]
+    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
+        let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
+        self.stub.clear_cache(request).await?;
+        Ok(())
+    }
+
+    /// Filter a cached batch
+    #[instrument(skip(self))]
+    pub async fn filter_batch(
+        &mut self,
+        batch_id: u64,
+        request_ids: Vec<u64>,
+    ) -> Result<Option<CachedBatch>> {
+        let request = tonic::Request::new(FilterBatchRequest {
+            batch_id,
+            request_ids,
+        })
+        .inject_context();
+        let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
+        Ok(filtered_batch.batch)
+    }
+
+    /// Warmup on a max size batch
+    ///
+    /// Returns the maximum amount of tokens supported by the hardware
+    #[instrument(skip_all)]
+    pub async fn warmup(
+        &mut self,
+        max_input_length: u32,
+        max_prefill_tokens: u32,
+        max_total_tokens: u32,
+        max_batch_size: Option<usize>,
+    ) -> Result<Option<u32>> {
+        let mut n_tokens = 0;
+        let mut requests = Vec::new();
+        // Create requests
+        while n_tokens < max_prefill_tokens {
+            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
+
+            let mut input_chunks = Vec::new();
+            input_chunks
+                .push(Chunk::Text("_test ".to_string().repeat(max_input_length as usize)).into());
+            if n_tokens == 0 {
+                input_chunks.push(
+                    Chunk::Image(Image {
+                        // Safe unwrap, because we control the data.
+                        data: STANDARD.decode(WARMUP_IMAGE_BASE64).unwrap(),
+                        mimetype: "image/jpeg;base64".to_string(),
+                    })
+                    .into(),
+                );
+            }
+
+            // Send stringly-typed inputs for compatibility for backends that haven't
+            // been updated to support chunks.
+
+            let mut inputs = String::new();
+            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
+            if n_tokens == 0 {
+                // 1 request is enough to test vision heads.
+                // Sending images on other queries messes up easily with truncation.
+                inputs.push_str(&format!(
+                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
+                ));
+            }
+
+            requests.push(Request {
+                id: 0,
+                inputs,
+                input_chunks: Some(Input {
+                    chunks: input_chunks,
+                }),
+                // We truncate the input on the server side to be sure that it has the correct size
+                truncate,
+                // Blocks and slots will be set on the server side if we use paged attention
+                blocks: vec![],
+                slots: vec![],
+                // Set sampling parameters to also take these ops into account in the max memory
+                parameters: Some(NextTokenChooserParameters {
+                    temperature: 0.9,
+                    top_k: 10,
+                    top_p: 0.9,
+                    typical_p: 0.9,
+                    do_sample: false,
+                    seed: 0,
+                    repetition_penalty: 1.2,
+                    frequency_penalty: 0.1,
+                    watermark: true,
+                    grammar: String::new(),
+                    grammar_type: GrammarType::None as i32,
+                }),
+                stopping_parameters: Some(StoppingCriteriaParameters {
+                    max_new_tokens: max_total_tokens - truncate,
+                    stop_sequences: vec![],
+                    ignore_eos_token: true,
+                }),
+                prefill_logprobs: true,
+                top_n_tokens: 20,
+                adapter_id: None,
+            });
+            n_tokens += max_input_length;
+
+            // Check max_batch_size
+            if Some(requests.len()) == max_batch_size {
+                break;
+            }
+        }
+
+        let batch = Batch {
+            id: 0,
+            size: requests.len() as u32,
+            requests,
+            max_tokens: max_input_length,
+            max_blocks: 0,
+        };
+
+        let request = tonic::Request::new(WarmupRequest {
+            batch: Some(batch),
+            max_input_length,
+            max_prefill_tokens,
+            max_total_tokens,
+        })
+        .inject_context();
+        let response = self.stub.warmup(request).await?.into_inner();
+        Ok(response.max_supported_total_tokens)
+    }
+
+    /// Generate one token for each request in the given batch
+    ///
+    /// Returns Generation for each request in batch, the next cached batch
+    /// and the forward/decode/total timings carried by the response
+    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
+    pub async fn prefill(
+        &mut self,
+        batch: Batch,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
+        let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
+        let response = self.stub.prefill(request).await?.into_inner();
+        Ok((
+            response.generations,
+            response.batch,
+            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
+        ))
+    }
+
+    /// Generate one token for each request in the given cached batches
+    ///
+    /// Returns Generation for each request in batches, the next cached batch
+    /// and the concat/forward/decode/total timings carried by the response
+    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
+    pub async fn decode(
+        &mut self,
+        batches: Vec<CachedBatch>,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
+        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
+        let response = self.stub.decode(request).await?.into_inner();
+        Ok((
+            response.generations,
+            response.batch,
+            DecodeTimings::new(
+                response.concat_ns,
+                response.forward_ns,
+                response.decode_ns,
+                response.total_ns,
+            ),
+        ))
+    }
+}
+
+pub struct PrefillTimings {
+    pub forward: Duration,
+    pub decode: Duration,
+    pub total: Duration,
+}
+
+impl PrefillTimings {
+    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        Self {
+            forward: Duration::from_nanos(forward_ns),
+            decode: Duration::from_nanos(decode_ns),
+            total: Duration::from_nanos(total_ns),
+        }
+    }
+}
+
+pub struct DecodeTimings {
+    pub concat: Option<Duration>,
+    pub forward: Duration,
+    pub decode: Duration,
+    pub total: Duration,
+}
+
+impl DecodeTimings {
+    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        Self {
+            concat: concat_ns.map(Duration::from_nanos),
+            forward: Duration::from_nanos(forward_ns),
+            decode: Duration::from_nanos(decode_ns),
+            total: Duration::from_nanos(total_ns),
+        }
+    }
+}
diff --git a/router/client/src/v3/mod.rs b/router/client/src/v3/mod.rs
new file mode 100644
index 00000000..4a1296a2
--- /dev/null
+++ b/router/client/src/v3/mod.rs
@@ -0,0 +1,13 @@
+#[allow(clippy::derive_partial_eq_without_eq)]
+mod pb;
+
+mod client;
+mod sharded_client;
+
+pub use client::Client;
+pub use pb::generate::v3::{
+    input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
+    HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
+    StoppingCriteriaParameters, Tokens,
+};
+pub use sharded_client::ShardedClient;
diff --git a/router/client/src/v3/pb/.gitignore b/router/client/src/v3/pb/.gitignore
new file mode 100644
index 00000000..72e8ffc0
--- /dev/null
+++ b/router/client/src/v3/pb/.gitignore
@@ -0,0 +1 @@
+*
diff --git a/router/client/src/v3/sharded_client.rs b/router/client/src/v3/sharded_client.rs
new file mode 100644
index 00000000..ae8a899b
--- /dev/null
+++ b/router/client/src/v3/sharded_client.rs
@@ -0,0 +1,259 @@
+/// Multi shard Client
+use crate::{v3, Health, ShardInfo};
+use crate::{ClientError, Result};
+
+use crate::v3::{Chunk, InfoResponse, Input};
+use async_trait::async_trait;
+use futures::future::join_all;
+use tonic::transport::Uri;
+use tracing::instrument;
+use v3::client::{DecodeTimings, PrefillTimings};
+use v3::{
+    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
+
+#[derive(Debug, Clone)]
+/// Text Generation Inference gRPC multi client
+pub struct ShardedClient {
+    clients: Vec<Client>,
+}
+
+impl ShardedClient {
+    fn new(clients: Vec<Client>) -> Self {
+        Self { clients }
+    }
+
+    /// Create a new ShardedClient from a master client. The master client will communicate with
+    /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
+    async fn from_master_client(mut master_client: Client) -> Result<Self> {
+        // Get all uris/unix sockets from the master client
+        let uris = master_client.service_discovery().await?;
+        let futures = uris.into_iter().map(Client::connect_uds);
+        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
+        Ok(Self::new(clients?))
+    }
+
+    /// Returns a client connected to the given uri
+    pub async fn connect(uri: Uri) -> Result<Self> {
+        let master_client = Client::connect(uri).await?;
+        Self::from_master_client(master_client).await
+    }
+
+    /// Returns a client connected to the given unix socket
+    pub async fn connect_uds(path: String) -> Result<Self> {
+        let master_client = Client::connect_uds(path).await?;
+        Self::from_master_client(master_client).await
+    }
+
+    /// Get the model info
+    ///
+    /// Only the last shard's answer is returned; an empty shard list yields
+    /// `ClientError::EmptyResults` instead of panicking
+    #[instrument(skip(self))]
+    pub async fn info(&mut self) -> Result<ShardInfo> {
+        let futures = self.clients.iter_mut().map(|client| client.info());
+        let last = join_all(futures).await.pop();
+        last.ok_or(ClientError::EmptyResults)?.map(ShardInfo::from)
+    }
+
+    /// GRPC health check
+    ///
+    /// Reports the last shard's answer; an empty shard list yields
+    /// `ClientError::EmptyResults` instead of panicking
+    #[instrument(skip(self))]
+    pub async fn health(&mut self) -> Result<HealthResponse> {
+        let futures = self.clients.iter_mut().map(|client| client.health());
+        let last = join_all(futures).await.pop();
+        last.ok_or(ClientError::EmptyResults)?
+    }
+
+    /// Clear the past generations cache
+    #[instrument(skip(self))]
+    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.clear_cache(batch_id))
+            .collect();
+        join_all(futures).await.into_iter().collect()
+    }
+
+    /// Filter a cached batch; all shards return the same message, so the last answer is used
+    #[instrument(skip(self))]
+    pub async fn filter_batch(
+        &mut self,
+        batch_id: u64,
+        request_ids: Vec<u64>,
+    ) -> Result<Option<CachedBatch>> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
+            .collect();
+        let last = join_all(futures).await.pop();
+        last.ok_or(ClientError::EmptyResults)? // no shards: error instead of panic
+    }
+
+    /// Warmup on a max size batch
+    ///
+    /// Returns the maximum amount of tokens supported by the hardware
+    #[instrument(skip(self))]
+    pub async fn warmup(
+        &mut self,
+        max_input_length: u32,
+        max_prefill_tokens: u32,
+        max_total_tokens: u32,
+        max_batch_size: Option<usize>,
+    ) -> Result<Option<u32>> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| {
+                Box::pin(client.warmup(
+                    max_input_length,
+                    max_prefill_tokens,
+                    max_total_tokens,
+                    max_batch_size,
+                ))
+            })
+            .collect();
+        // Take the minimum value
+        let results = join_all(futures)
+            .await
+            .into_iter()
+            .collect::<Result<Vec<Option<u32>>>>()?;
+        Ok(results.into_iter().flatten().min())
+    }
+
+    /// Generate one token for each request in the given batch
+    ///
+    /// Returns the merged Generations of all shards, the next cached batch of the
+    /// last shard and the timings of the shard with the largest `total`
+    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
+    pub async fn prefill(
+        &mut self,
+        batch: Batch,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.prefill(batch.clone())))
+            .collect();
+        #[allow(clippy::type_complexity)]
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
+            join_all(futures).await.into_iter().collect();
+        let mut results = results?;
+
+        let (mut generations, next_batch, mut timings) =
+            results.pop().ok_or(ClientError::EmptyResults)?;
+
+        // Merge generations from different model shards
+        for (mut shard_generations, _, shard_timings) in results.into_iter() {
+            generations.append(&mut shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
+    }
+
+    /// Generate one token for each request in the given cached batches
+    ///
+    /// Returns the merged Generations of all shards, the next cached batch of the
+    /// last shard and the timings of the shard with the largest `total`
+    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
+    pub async fn decode(
+        &mut self,
+        batches: Vec<CachedBatch>,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.decode(batches.clone())))
+            .collect();
+        #[allow(clippy::type_complexity)]
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
+            join_all(futures).await.into_iter().collect();
+        let mut results = results?;
+
+        let (mut generations, next_batch, mut timings) =
+            results.pop().ok_or(ClientError::EmptyResults)?;
+
+        // Merge generations from different model shards
+        for (mut shard_generations, _, shard_timings) in results.into_iter() {
+            generations.append(&mut shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
+    }
+}
+
+impl From<InfoResponse> for ShardInfo {
+    fn from(value: InfoResponse) -> Self {
+        Self {
+            requires_padding: value.requires_padding,
+            dtype: value.dtype,
+            device_type: value.device_type,
+            window_size: value.window_size,
+            speculate: value.speculate,
+        }
+    }
+}
+
+#[async_trait]
+impl Health for ShardedClient {
+    async fn device_health(&self) -> Result<()> {
+        self.clone().health().await?;
+        Ok(())
+    }
+
+    async fn model_health(&self) -> Result<()> {
+        // Dummy batch of 1 token and 1 generated token
+        let liveness_request = Request {
+            id: u64::MAX,
+            inputs: "liveness".to_string(),
+            input_chunks: Some(Input {
+                chunks: vec![Chunk::Text("liveness".into()).into()],
+            }),
+            truncate: 10,
+            prefill_logprobs: false,
+            parameters: Some(NextTokenChooserParameters {
+                temperature: 1.0,
+                top_k: 0,
+                top_p: 1.0,
+                typical_p: 1.0,
+                do_sample: false,
+                seed: 0,
+                repetition_penalty: 1.0,
+                frequency_penalty: 0.0,
+                watermark: false,
+                grammar: String::new(),
+                grammar_type: GrammarType::None as i32,
+            }),
+            stopping_parameters: Some(StoppingCriteriaParameters {
+                max_new_tokens: 1,
+                stop_sequences: vec![],
+                ignore_eos_token: false,
+            }),
+            top_n_tokens: 0,
+            // Block 0 is reserved for health checks
+            blocks: vec![0],
+            slots: (0..16).collect(),
+            adapter_id: None,
+        };
+        let batch = Batch {
+            id: u64::MAX,
+            requests: vec![liveness_request],
+            size: 1,
+            max_tokens: 2,
+            max_blocks: 1,
+        };
+        self.clone().prefill(batch).await?;
+        Ok(())
+    }
+}
diff --git a/router/grpc-metadata/Cargo.toml b/router/grpc-metadata/Cargo.toml
index 9e01f527..da163ec5 100644
--- a/router/grpc-metadata/Cargo.toml
+++ b/router/grpc-metadata/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-opentelemetry = "^0.19"
-tonic = "^0.9"
+opentelemetry = "^0.20"
+tonic = "^0.10"
 tracing = "^0.1"
-tracing-opentelemetry = "^0.19"
+tracing-opentelemetry = "^0.21"
diff --git a/router/grpc-metadata/src/lib.rs b/router/grpc-metadata/src/lib.rs
index 7ba353fa..3068a61c 100644
--- a/router/grpc-metadata/src/lib.rs
+++ b/router/grpc-metadata/src/lib.rs
@@ -2,30 +2,9 @@
 //! Inspired by: https://github.com/open-telemetry/opentelemetry-rust gRPC examples
 
 use opentelemetry::global;
-use opentelemetry::propagation::{Extractor, Injector};
+use opentelemetry::propagation::Injector;
 use tracing_opentelemetry::OpenTelemetrySpanExt;
 
-/// Extract context metadata from a gRPC request's metadata
-struct MetadataExtractor<'a>(pub &'a tonic::metadata::MetadataMap);
-
-impl<'a> Extractor for MetadataExtractor<'a> {
-    /// Get a value for a key from the MetadataMap.  If the value can't be converted to &str, returns None
-    fn get(&self, key: &str) -> Option<&str> {
-        self.0.get(key).and_then(|metadata| metadata.to_str().ok())
-    }
-
-    /// Collect all the keys from the MetadataMap.
-    fn keys(&self) -> Vec<&str> {
-        self.0
-            .keys()
-            .map(|key| match key {
-                tonic::metadata::KeyRef::Ascii(v) => v.as_str(),
-                tonic::metadata::KeyRef::Binary(v) => v.as_str(),
-            })
-            .collect::<Vec<_>>()
-    }
-}
-
 /// Inject context in the metadata of a gRPC request.
 struct MetadataInjector<'a>(pub &'a mut tonic::metadata::MetadataMap);
 
diff --git a/router/src/config.rs b/router/src/config.rs
new file mode 100644
index 00000000..7737165e
--- /dev/null
+++ b/router/src/config.rs
@@ -0,0 +1,221 @@
+use serde::{Deserialize, Serialize};
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(tag = "model_type")]
+#[serde(rename_all = "snake_case")]
+pub struct LlavaNext {
+    pub(crate) text_config: TextConfig,
+    pub(crate) vision_config: VisionConfig,
+    pub(crate) image_grid_pinpoints: Vec<(usize, usize)>,
+}
+
+fn get_anyres_image_grid_shape(
+    height: usize,
+    width: usize,
+    grid_pinpoints: &[(usize, usize)],
+    patch_size: usize,
+) -> (usize, usize) {
+    let (height, width) = select_best_resolution(height, width, grid_pinpoints);
+    (height / patch_size, width / patch_size)
+}
+
+/// Selects the candidate resolution that best fits an image of the given original size.
+/// Each `(height, width)` candidate is scored by the area the aspect-preserving rescaled image
+/// covers (capped at the original area); ties go to the candidate wasting the fewest pixels.
+fn select_best_resolution(
+    original_height: usize,
+    original_width: usize,
+    possible_resolutions: &[(usize, usize)],
+) -> (usize, usize) {
+    let mut best_fit = None;
+    let mut max_effective_resolution = 0;
+    // Start at +inf so that the very first candidate can win the tie-break.
+    let mut min_wasted_resolution = f32::INFINITY;
+    for (height, width) in possible_resolutions {
+        let wscale = *width as f32 / original_width as f32;
+        let hscale = *height as f32 / original_height as f32;
+        // f32 is only PartialOrd, so `min` is spelled out by hand.
+        let scale = if wscale > hscale { hscale } else { wscale };
+        // Size of the *original* image once rescaled to fit inside the candidate.
+        let downscaled_width = (original_width as f32 * scale) as usize;
+        let downscaled_height = (original_height as f32 * scale) as usize;
+        let effective_resolution = std::cmp::min(
+            downscaled_width * downscaled_height,
+            original_width * original_height,
+        );
+        let wasted_resolution = (width * height) - effective_resolution;
+        if effective_resolution > max_effective_resolution
+            || (effective_resolution == max_effective_resolution
+                && (wasted_resolution as f32) < min_wasted_resolution)
+        {
+            max_effective_resolution = effective_resolution;
+            min_wasted_resolution = wasted_resolution as f32;
+            best_fit = Some((*height, *width));
+        }
+    }
+    // No candidate at all: keep the original size.
+    best_fit.unwrap_or((original_height, original_width))
+}
+
+/// Computes the feature counts of the patch grid once its padding has been trimmed away.
+/// Returns `(unpadded_features, newline_features)`; the second equals the trimmed grid height.
+fn get_unpadded_features(
+    height: usize,
+    width: usize,
+    npatches: usize,
+    num_patch_height: usize,
+    num_patch_width: usize,
+) -> (usize, usize) {
+    let mut grid_height = npatches * num_patch_height;
+    let mut grid_width = npatches * num_patch_width;
+
+    let image_ratio = width as f64 / height as f64;
+    let grid_ratio = grid_width as f64 / grid_height as f64;
+    if image_ratio > grid_ratio {
+        // The image is wider than the grid: shrink the grid height by twice the padding.
+        let fitted_height = (height * grid_width) / width;
+        grid_height -= 2 * ((grid_height - fitted_height) / 2);
+    } else {
+        // Otherwise shrink the grid width by twice the padding.
+        let fitted_width = (width * grid_height) / height;
+        grid_width -= 2 * ((grid_width - fitted_width) / 2);
+    }
+
+    (grid_height * grid_width, grid_height)
+}
+
+impl LlavaNext {
+    /// Total feature slots for a `height` x `width` image: the unpadded grid features, the
+    /// newline features, and the `(image_size / patch_size)^2` features of the base patch.
+    pub fn get_number_of_features(&self, height: usize, width: usize) -> usize {
+        let image_size = self.vision_config.image_size;
+        let patch_size = self.vision_config.patch_size;
+        assert!(image_size % patch_size == 0);
+        let side = image_size / patch_size;
+        let pinpoints = &self.image_grid_pinpoints;
+        // Dimensions are intentionally swapped to be bug-compatible with
+        // upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
+        let (grid_w, grid_h) = get_anyres_image_grid_shape(height, width, pinpoints, image_size);
+        let (unpadded, newlines) = get_unpadded_features(height, width, side, grid_h, grid_w);
+        // The base patch covers the entire image
+        let base = side.pow(2);
+        unpadded + newlines + base
+    }
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct ClipVisionModel {
+    image_size: usize,
+    patch_size: usize,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct Idefics2 {}
+
+impl Idefics2 {
+    pub fn get_number_of_features(&self, _height: usize, _width: usize) -> usize {
+        64
+    }
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct PaliTextConfig {
+    pub(crate) num_image_tokens: usize,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct Paligemma {
+    pub(crate) text_config: PaliTextConfig,
+}
+
+impl Paligemma {
+    pub fn get_number_of_features(&self, _height: usize, _width: usize) -> usize {
+        self.text_config.num_image_tokens
+    }
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(tag = "model_type", rename_all = "snake_case")] // variant is picked by "model_type"
+pub enum Config {
+    LlavaNext(LlavaNext),
+    ClipVisionModel(ClipVisionModel),
+    Mistral,
+    Idefics,
+    Idefics2(Idefics2),
+    Ssm,
+    GptBigcode,
+    Santacoder,
+    Bloom,
+    Mpt,
+    Gpt2,
+    GptNeox,
+    Phi,
+    #[serde(rename = "phi-msft")]
+    PhiMsft,
+    Phi3,
+    Llama,
+    Baichuan,
+    Paligemma(Paligemma),
+    Gemma,
+    Gemma2,
+    Cohere,
+    #[serde(rename = "dbrx", alias = "drbx")] // real tag is "dbrx"; old "drbx" still parses
+    Drbx,
+    Falcon,
+    Mixtral,
+    Starcoder2,
+    Qwen2,
+    Opt,
+    T5,
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct TextConfig {}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct VisionConfig {
+    pub(crate) image_size: usize,
+    pub(crate) patch_size: usize,
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_llava_next_features() {
+        let config = LlavaNext {
+            text_config: TextConfig {},
+            vision_config: VisionConfig {
+                image_size: 336,
+                patch_size: 14,
+            },
+            image_grid_pinpoints: vec![
+                (336, 672),
+                (672, 336),
+                (672, 672),
+                (1008, 336),
+                (336, 1008),
+            ],
+        };
+
+        let slots = config.get_number_of_features(20, 20);
+        assert_eq!(slots, 1176);
+        let slots = config.get_number_of_features(640, 640);
+        assert_eq!(slots, 2928);
+        let slots = config.get_number_of_features(480, 640);
+        assert_eq!(slots, 2340);
+        let slots = config.get_number_of_features(899, 1024);
+        assert_eq!(slots, 2634);
+        let slots = config.get_number_of_features(1024, 899);
+        assert_eq!(slots, 2640);
+        let slots = config.get_number_of_features(1067, 1600);
+        assert_eq!(slots, 2144);
+    }
+}
diff --git a/router/src/health.rs b/router/src/health.rs
deleted file mode 100644
index ab290fc1..00000000
--- a/router/src/health.rs
+++ /dev/null
@@ -1,68 +0,0 @@
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
-use text_generation_client::{
-    Batch, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters,
-};
-
-// Note: Request ids and batch ids cannot collide.
-const LIVENESS_ID: u64 = u64::MAX;
-const BATCH_ID: u64 = u64::MAX;
-
-#[derive(Clone, Debug)]
-pub(crate) struct Health {
-    client: ShardedClient,
-    generation_health: Arc<AtomicBool>,
-}
-
-impl Health {
-    pub(crate) fn new(client: ShardedClient, generation_health: Arc<AtomicBool>) -> Self {
-        Self {
-            client,
-            generation_health,
-        }
-    }
-
-    pub(crate) async fn check(&mut self) -> bool {
-        if self.generation_health.load(Ordering::SeqCst) {
-            // Generation is healthy, we only check that the shards are answering gRPC calls
-            self.client.health().await.is_ok()
-        } else {
-            // Generation is unhealthy or have not sent any generation request yet
-
-            // Dummy batch of 1 token and 1 generated token
-            let liveness_request = Request {
-                id: LIVENESS_ID,
-                inputs: "liveness".to_string(),
-                truncate: 10,
-                prefill_logprobs: false,
-                parameters: Some(NextTokenChooserParameters {
-                    temperature: 1.0,
-                    top_k: 0,
-                    top_p: 1.0,
-                    typical_p: 1.0,
-                    do_sample: false,
-                    seed: 0,
-                    repetition_penalty: 1.0,
-                    watermark: false,
-                }),
-                stopping_parameters: Some(StoppingCriteriaParameters {
-                    max_new_tokens: 1,
-                    stop_sequences: vec![],
-                    ignore_eos_token: false,
-                }),
-                top_n_tokens: 0,
-            };
-            let batch = Batch {
-                id: BATCH_ID,
-                requests: vec![liveness_request],
-                size: 1,
-                max_tokens: 2,
-            };
-            // Skips the queue
-            let value = self.client.prefill(batch).await.is_ok();
-            // Update generation health
-            self.generation_health.store(value, Ordering::SeqCst);
-            value
-        }
-    }
-}
diff --git a/router/src/infer.rs b/router/src/infer.rs
deleted file mode 100644
index 67b5bde2..00000000
--- a/router/src/infer.rs
+++ /dev/null
@@ -1,648 +0,0 @@
-/// Batching and inference logic
-use crate::validation::{Validation, ValidationError};
-use crate::{Entry, Queue, Token};
-use crate::{GenerateRequest, PrefillToken};
-use flume::r#async::RecvStream;
-use flume::SendTimeoutError;
-use futures::future::try_join_all;
-use futures::stream::StreamExt;
-use nohash_hasher::IntMap;
-use std::sync::{
-    atomic::{AtomicBool, Ordering},
-    Arc,
-};
-use std::time::Duration;
-use text_generation_client::{
-    Batch, CachedBatch, ClientError, GeneratedText, Generation, PrefillTokens, ShardedClient,
-};
-use thiserror::Error;
-use tokio::sync::{Notify, OwnedSemaphorePermit, Semaphore, TryAcquireError};
-use tokio::time::Instant;
-use tracing::{info_span, instrument, Instrument, Span};
-
-/// Inference struct
-#[derive(Clone)]
-pub struct Infer {
-    /// Validation
-    validation: Validation,
-    /// Request queue
-    queue: Queue,
-    /// Shared state
-    shared: Arc<Shared>,
-    /// Inference limit
-    limit_concurrent_requests: Arc<Semaphore>,
-}
-
-/// Infer shared state
-struct Shared {
-    /// Batching background Tokio task notifier
-    batching_task: Notify,
-}
-
-impl Infer {
-    #[allow(clippy::too_many_arguments)]
-    pub(crate) fn new(
-        client: ShardedClient,
-        validation: Validation,
-        waiting_served_ratio: f32,
-        max_batch_prefill_tokens: u32,
-        max_batch_total_tokens: u32,
-        max_waiting_tokens: usize,
-        max_concurrent_requests: usize,
-        requires_padding: bool,
-        generation_health: Arc<AtomicBool>,
-    ) -> Self {
-        // Infer shared state
-        let queue = Queue::new(requires_padding, 16);
-        let shared = Arc::new(Shared {
-            batching_task: Notify::new(),
-        });
-
-        // Spawn batching background task that contains all the inference logic
-        tokio::spawn(batching_task(
-            client,
-            waiting_served_ratio,
-            max_batch_prefill_tokens,
-            max_batch_total_tokens,
-            max_waiting_tokens,
-            queue.clone(),
-            shared.clone(),
-            generation_health,
-        ));
-
-        // Inference limit with a semaphore
-        let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));
-
-        Self {
-            validation,
-            queue,
-            shared,
-            limit_concurrent_requests: semaphore,
-        }
-    }
-
-    /// Add a new request to the queue and return a stream of InferStreamResponse
-    #[instrument(skip(self))]
-    pub(crate) async fn generate_stream(
-        &self,
-        request: GenerateRequest,
-    ) -> Result<
-        (
-            OwnedSemaphorePermit,
-            RecvStream<Result<InferStreamResponse, InferError>>,
-        ),
-        InferError,
-    > {
-        // Limit concurrent requests by acquiring a permit from the semaphore
-        let permit = self
-            .clone()
-            .limit_concurrent_requests
-            .try_acquire_owned()
-            .map_err(|err| {
-                metrics::increment_counter!("tgi_request_failure", "err" => "overloaded");
-                tracing::error!("{err}");
-                err
-            })?;
-
-        // Validate request
-        let valid_request = self.validation.validate(request).await.map_err(|err| {
-            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
-            tracing::error!("{err}");
-            err
-        })?;
-
-        // MPSC channel to communicate with the background batching task
-        let (response_tx, response_rx) = flume::unbounded();
-
-        // Append the request to the queue
-        self.queue.append(Entry {
-            request: valid_request,
-            response_tx,
-            span: Span::current(),
-            temp_span: None,
-            queue_time: Instant::now(),
-            batch_time: None,
-        });
-
-        // Notify the background task that we have a new entry in the queue that needs
-        // to be batched
-        self.shared.batching_task.notify_one();
-
-        // Return stream
-        Ok((permit, response_rx.into_stream()))
-    }
-
-    /// Add a new request to the queue and return a InferResponse
-    #[instrument(skip(self))]
-    pub(crate) async fn generate(
-        &self,
-        request: GenerateRequest,
-    ) -> Result<InferResponse, InferError> {
-        let use_top_tokens = request.parameters.top_n_tokens.is_some_and(|x| x > 0);
-
-        // Create stream and keep semaphore permit as long as generate lives
-        let (_permit, mut stream) = self.generate_stream(request).await?;
-
-        // Return values
-        let mut result_prefill = Vec::new();
-        let mut result_tokens = Vec::new();
-        let mut result_top_tokens = Vec::new();
-        let mut result_generated_text = None;
-        let mut result_start = None;
-        let mut result_queued = None;
-
-        // Iterate on stream
-        while let Some(response) = stream.next().await {
-            match response? {
-                // Add prefill tokens
-                InferStreamResponse::Prefill(tokens) => {
-                    // Create Token objects
-                    // We do that here instead of in the Python code as Rust for loops are faster
-                    result_prefill = tokens
-                        .ids
-                        .into_iter()
-                        .zip(tokens.logprobs.into_iter())
-                        .zip(tokens.texts.into_iter())
-                        .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
-                        .collect();
-                }
-                // Push last token
-                InferStreamResponse::Intermediate { token, top_tokens } => {
-                    result_tokens.push(token);
-                    result_top_tokens.push(top_tokens);
-                }
-                // Final message
-                // Set return values
-                InferStreamResponse::End {
-                    token,
-                    generated_text,
-                    start,
-                    queued,
-                    top_tokens,
-                } => {
-                    result_tokens.push(token);
-                    result_top_tokens.push(top_tokens);
-                    result_generated_text = Some(generated_text);
-                    result_start = Some(start);
-                    result_queued = Some(queued)
-                }
-            }
-        }
-
-        // Check that we received a `InferStreamResponse::End` message
-        if let (Some(generated_text), Some(queued), Some(start)) =
-            (result_generated_text, result_queued, result_start)
-        {
-            Ok(InferResponse {
-                prefill: result_prefill,
-                tokens: result_tokens,
-                generated_text,
-                queued,
-                start,
-                top_tokens: if use_top_tokens {
-                    result_top_tokens
-                } else {
-                    Vec::new()
-                },
-            })
-        } else {
-            let err = InferError::IncompleteGeneration;
-            metrics::increment_counter!("tgi_request_failure", "err" => "incomplete");
-            tracing::error!("{err}");
-            Err(err)
-        }
-    }
-    /// Add best_of new requests to the queue and return a InferResponse of the sequence with
-    /// the highest log probability per token
-    #[instrument(skip(self))]
-    pub(crate) async fn generate_best_of(
-        &self,
-        request: GenerateRequest,
-        best_of: usize,
-    ) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
-        // validate  best_of parameter separately
-        let best_of = self.validation.validate_best_of(best_of)?;
-
-        // create multiple generate requests
-        let mut infer_responses: Vec<InferResponse> =
-            try_join_all((0..best_of).map(|_| self.generate(request.clone()))).await?;
-
-        // get the sequence with the highest log probability per token
-        let mut max_index = 0;
-        let mut max_logprob: f32 = f32::MIN;
-
-        for (i, response) in infer_responses.iter().enumerate() {
-            // mean logprobs of the generated tokens
-            let sequence_logprob = response
-                .tokens
-                .iter()
-                .map(|token| token.logprob)
-                .sum::<f32>()
-                / response.tokens.len() as f32;
-
-            // set best sequence
-            if sequence_logprob > max_logprob {
-                max_index = i;
-                max_logprob = sequence_logprob;
-            }
-        }
-        let best_response = infer_responses.remove(max_index);
-        Ok((best_response, infer_responses))
-    }
-}
-
-/// Batching logic
-/// Will be launched in a background Tokio task
-///
-/// Batches requests and sends them to the inference server
-#[allow(clippy::too_many_arguments)]
-async fn batching_task(
-    mut client: ShardedClient,
-    waiting_served_ratio: f32,
-    max_batch_prefill_tokens: u32,
-    max_batch_total_tokens: u32,
-    max_waiting_tokens: usize,
-    queue: Queue,
-    shared: Arc<Shared>,
-    generation_health: Arc<AtomicBool>,
-) {
-    // Infinite loop
-    loop {
-        // Wait for a notification from the Infer struct
-        shared.batching_task.notified().await;
-
-        // Get the next batch from the queue
-        // This batch might be smaller than the maximum batch size if there are not enough requests
-        // waiting in the queue
-        while let Some((mut entries, batch, span)) = queue
-            .next_batch(None, max_batch_prefill_tokens, max_batch_total_tokens)
-            .await
-        {
-            let mut cached_batch = prefill(&mut client, batch, &mut entries, &generation_health)
-                .instrument(span)
-                .await;
-            let mut waiting_tokens = 1;
-
-            // We loop until we do not receive any cached batch from the inference server (== until
-            // all requests have met their stopping criteria)
-            while let Some(batch) = cached_batch {
-                // Get current batch info
-                let batch_size = batch.size;
-                let batch_max_tokens = batch.max_tokens;
-                let mut batches = vec![batch];
-                metrics::gauge!("tgi_batch_current_size", batch_size as f64);
-                metrics::gauge!("tgi_batch_current_max_tokens", batch_max_tokens as f64);
-
-                let min_size = if waiting_tokens >= max_waiting_tokens {
-                    // If we didn't onboard any new requests since >= max_waiting_tokens, we try
-                    // to add a new batch even though its size might be small
-                    None
-                } else {
-                    // Minimum batch size
-                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
-                };
-
-                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
-
-                // Try to get a new batch
-                if let Some((mut new_entries, new_batch, span)) = queue
-                    .next_batch(min_size, max_batch_prefill_tokens, token_budget)
-                    .await
-                {
-                    // Tracking metrics
-                    if min_size.is_some() {
-                        metrics::increment_counter!("tgi_batch_concat", "reason" => "backpressure");
-                    } else {
-                        metrics::increment_counter!("tgi_batch_concat", "reason" => "wait_exceeded");
-                    }
-
-                    entries.iter_mut().for_each(|(_, entry)| {
-                        // Create a new span to add the info that this entry is waiting
-                        // because a new batch is being computed
-                        let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
-                        // Add relationships
-                        span.follows_from(&entry_waiting_span);
-                        entry_waiting_span.follows_from(&span);
-                        // Update entry
-                        entry.temp_span = Some(entry_waiting_span);
-                    });
-
-                    // Generate one token for this new batch to have the attention past in cache
-                    let new_cached_batch =
-                        prefill(&mut client, new_batch, &mut new_entries, &generation_health)
-                            .instrument(span)
-                            .await;
-                    // Reset waiting counter
-                    waiting_tokens = 1;
-                    // Extend current batch with the new batch
-                    if let Some(new_cached_batch) = new_cached_batch {
-                        entries.extend(new_entries);
-                        batches.push(new_cached_batch);
-                    }
-                }
-
-                // Create span for this batch to add context to inference calls
-                let next_batch_size = entries.len();
-                let next_batch_span =
-                    info_span!(parent: None, "batch", batch_size = next_batch_size);
-                entries.iter_mut().for_each(|(_, entry)| {
-                    // Create a new span to link the batch back to this entry
-                    let entry_batch_span = info_span!(parent: &entry.span, "infer");
-                    // Add relationships
-                    next_batch_span.follows_from(&entry_batch_span);
-                    entry_batch_span.follows_from(&next_batch_span);
-                    // Update entry
-                    entry.temp_span = Some(entry_batch_span);
-                });
-
-                cached_batch = decode(&mut client, batches, &mut entries, &generation_health)
-                    .instrument(next_batch_span)
-                    .await;
-                waiting_tokens += 1;
-            }
-            metrics::gauge!("tgi_batch_current_size", 0.0);
-            metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
-        }
-    }
-}
-
-#[instrument(skip_all)]
-async fn prefill(
-    client: &mut ShardedClient,
-    batch: Batch,
-    entries: &mut IntMap<u64, Entry>,
-    generation_health: &Arc<AtomicBool>,
-) -> Option<CachedBatch> {
-    let start_time = Instant::now();
-    let batch_id = batch.id;
-    metrics::increment_counter!("tgi_batch_inference_count", "method" => "prefill");
-
-    match client.prefill(batch).await {
-        Ok((generations, next_batch)) => {
-            // Update health
-            generation_health.store(true, Ordering::SeqCst);
-            // Send generated tokens and filter stopped entries
-            filter_send_generations(generations, entries);
-
-            // Filter next batch and remove requests that were stopped
-            let next_batch = filter_batch(client, next_batch, entries).await;
-
-            metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "prefill");
-            metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill");
-            next_batch
-        }
-        // If we have an error, we discard the whole batch
-        Err(err) => {
-            // Update health
-            generation_health.store(false, Ordering::SeqCst);
-            let _ = client.clear_cache(Some(batch_id)).await;
-            send_errors(err, entries);
-            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill");
-            None
-        }
-    }
-}
-
-#[instrument(skip_all)]
-async fn decode(
-    client: &mut ShardedClient,
-    batches: Vec<CachedBatch>,
-    entries: &mut IntMap<u64, Entry>,
-    generation_health: &Arc<AtomicBool>,
-) -> Option<CachedBatch> {
-    let start_time = Instant::now();
-    let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
-    metrics::increment_counter!("tgi_batch_inference_count", "method" => "decode");
-
-    match client.decode(batches).await {
-        Ok((generations, next_batch)) => {
-            // Update health
-            generation_health.store(true, Ordering::SeqCst);
-            // Send generated tokens and filter stopped entries
-            filter_send_generations(generations, entries);
-
-            // Filter next batch and remove requests that were stopped
-            let next_batch = filter_batch(client, next_batch, entries).await;
-
-            metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "decode");
-            metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode");
-            next_batch
-        }
-        // If we have an error, we discard the whole batch
-        Err(err) => {
-            generation_health.store(false, Ordering::SeqCst);
-            for id in batch_ids {
-                let _ = client.clear_cache(Some(id)).await;
-            }
-            send_errors(err, entries);
-            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode");
-            None
-        }
-    }
-}
-
-/// Filter a `batch` and remove all requests not present in `entries`
-#[instrument(skip_all)]
-async fn filter_batch(
-    client: &mut ShardedClient,
-    next_batch: Option<CachedBatch>,
-    entries: &IntMap<u64, Entry>,
-) -> Option<CachedBatch> {
-    let mut batch = next_batch?;
-
-    // No need to filter
-    if batch.size as usize == entries.len() {
-        return Some(batch);
-    }
-
-    let id = batch.id;
-
-    // Retain only requests that are still in entries
-    batch.request_ids.retain(|id| entries.contains_key(id));
-
-    if batch.request_ids.is_empty() {
-        // All requests have been filtered out
-        // Next batch is now empty
-        // Clear it from the Python shards cache
-        // We unwrap here as we need to panic since we cannot recover if this method fails
-        client.clear_cache(Some(id)).await.unwrap();
-        None
-    } else {
-        // Filter Python shard cache
-        // We unwrap here as we need to panic since we cannot recover if this method fails
-        client.filter_batch(id, batch.request_ids).await.unwrap()
-    }
-}
-
-/// Send one or multiple `InferStreamResponse` to Infer for all `entries`
-/// and filter entries
-#[instrument(skip_all)]
-fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
-    generations.into_iter().for_each(|generation| {
-        let id = generation.request_id;
-        // Get entry
-        // We can `expect` here as the request id should always be in the entries
-        let entry = entries
-            .get(&id)
-            .expect("ID not found in entries. This is a bug.");
-
-        // Create and enter a span to link this function back to the entry
-        let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
-        // Send generation responses back to the infer task
-        // If the receive an error from the Flume channel, it means that the client dropped the
-        // request and we need to stop generating hence why we unwrap_or(true)
-        let stopped = send_responses(generation, entry).map_err(|err| {
-            if let SendTimeoutError::Timeout(_) = *err {
-                tracing::error!("Entry response channel timed out.")
-            }
-
-            metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
-            err
-        }).unwrap_or(true);
-        if stopped {
-            entries.remove(&id).expect("ID not found in entries. This is a bug.");
-        }
-    });
-}
-
-/// Send responses through the `entry` response channel
-fn send_responses(
-    generation: Generation,
-    entry: &Entry,
-) -> Result<bool, Box<SendTimeoutError<Result<InferStreamResponse, InferError>>>> {
-    // Return directly if the channel is disconnected
-    if entry.response_tx.is_disconnected() {
-        return Ok(true);
-    }
-
-    let mut stopped = false;
-
-    if let Some(prefill_tokens) = generation.prefill_tokens {
-        // Send message
-        entry.response_tx.send_timeout(
-            Ok(InferStreamResponse::Prefill(prefill_tokens)),
-            Duration::from_millis(10),
-        )?;
-    }
-
-    // Create last Token
-    let token = Token {
-        id: generation.token_id,
-        text: generation.token_text,
-        logprob: generation.token_logprob,
-        special: generation.token_is_special,
-    };
-
-    // generation.top_tokens
-
-    let mut top_tokens = Vec::new();
-    if let Some(top_tokens_) = generation.top_tokens {
-        top_tokens.extend(
-            top_tokens_
-                .ids
-                .into_iter()
-                .zip(top_tokens_.logprobs.into_iter())
-                .zip(top_tokens_.texts.into_iter())
-                .zip(top_tokens_.is_special.into_iter())
-                .map(|(((id, logprob), text), special)| Token {
-                    id,
-                    text,
-                    logprob,
-                    special,
-                }),
-        )
-    }
-
-    if let Some(generated_text) = generation.generated_text {
-        // Generation has ended
-        stopped = true;
-        // Send message
-        entry.response_tx.send_timeout(
-            Ok(InferStreamResponse::End {
-                token,
-                top_tokens,
-                generated_text,
-                queued: entry.queue_time,
-                start: entry.batch_time.unwrap(),
-            }),
-            Duration::from_millis(10),
-        )?;
-    } else {
-        // Send message
-        entry.response_tx.send_timeout(
-            Ok(InferStreamResponse::Intermediate { token, top_tokens }),
-            Duration::from_millis(10),
-        )?;
-    }
-    Ok(stopped)
-}
-
-/// Send errors to Infer for all `entries`
-#[instrument(skip_all)]
-fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
-    entries.drain().for_each(|(_, entry)| {
-        // Create and enter a span to link this function back to the entry
-        let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
-        let err = InferError::GenerationError(error.to_string());
-        metrics::increment_counter!("tgi_request_failure", "err" => "generation");
-        tracing::error!("{err}");
-
-        // unwrap_or is valid here as we don't care if the receiver is gone.
-        entry
-            .response_tx
-            .send_timeout(Err(err), Duration::from_millis(10))
-            .unwrap_or(());
-    });
-}
-
-#[derive(Debug)]
-pub(crate) enum InferStreamResponse {
-    // Optional first message
-    Prefill(PrefillTokens),
-    // Intermediate messages
-    Intermediate {
-        token: Token,
-        top_tokens: Vec<Token>,
-    },
-    // Last message
-    End {
-        token: Token,
-        top_tokens: Vec<Token>,
-        generated_text: GeneratedText,
-        start: Instant,
-        queued: Instant,
-    },
-}
-
-#[derive(Debug)]
-pub(crate) struct InferResponse {
-    pub(crate) prefill: Vec<PrefillToken>,
-    pub(crate) tokens: Vec<Token>,
-    pub(crate) generated_text: GeneratedText,
-    pub(crate) queued: Instant,
-    pub(crate) start: Instant,
-    pub(crate) top_tokens: Vec<Vec<Token>>,
-}
-
-#[derive(Debug, Error)]
-pub enum InferError {
-    #[error("Request failed during generation: {0}")]
-    GenerationError(String),
-    #[error("Model is overloaded")]
-    Overloaded(#[from] TryAcquireError),
-    #[error("Input validation error: {0}")]
-    ValidationError(#[from] ValidationError),
-    #[error("Incomplete generation")]
-    IncompleteGeneration,
-}
-
-impl InferError {
-    pub(crate) fn error_type(&self) -> &str {
-        match self {
-            InferError::GenerationError(_) => "generation",
-            InferError::Overloaded(_) => "overloaded",
-            InferError::ValidationError(_) => "validation",
-            InferError::IncompleteGeneration => "incomplete_generation",
-        }
-    }
-}
diff --git a/router/src/infer/health.rs b/router/src/infer/health.rs
new file mode 100644
index 00000000..4320c1a4
--- /dev/null
+++ b/router/src/infer/health.rs
@@ -0,0 +1,34 @@
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+use text_generation_client::Health;
+
+#[derive(Clone)]
+pub(crate) struct HealthCheck {
+    client: Arc<dyn Health + Send + Sync>, // backend handle the health probes are sent to
+    generation_health: Arc<AtomicBool>, // flag read before and overwritten after each `check`
+}
+
+impl HealthCheck {
+    /// Builds a health checker from a backend handle and the generation-health flag.
+    pub(crate) fn new(
+        client: Arc<dyn Health + Send + Sync>,
+        generation_health: Arc<AtomicBool>,
+    ) -> Self {
+        Self { client, generation_health }
+    }
+
+    /// Probes the backend, records the outcome in `generation_health` and returns it.
+    pub(crate) async fn check(&mut self) -> bool {
+        let generation_was_healthy = self.generation_health.load(Ordering::SeqCst);
+        let probe = if generation_was_healthy {
+            // Generation is healthy, we only check that the shards can allocate on device
+            self.client.device_health().await
+        } else {
+            self.client.model_health().await
+        };
+        let healthy = probe.is_ok();
+        // Remember the outcome: it selects which probe the next call runs
+        self.generation_health.store(healthy, Ordering::SeqCst);
+        healthy
+    }
+}
diff --git a/router/src/infer/mod.rs b/router/src/infer/mod.rs
new file mode 100644
index 00000000..49282eb9
--- /dev/null
+++ b/router/src/infer/mod.rs
@@ -0,0 +1,533 @@
+mod health;
+pub(crate) mod v2;
+pub(crate) mod v3;
+
+pub(crate) use health::HealthCheck;
+
+use crate::validation::{ValidGenerateRequest, Validation, ValidationError};
+use crate::{
+    ChatTemplateInputs, ChatTemplateVersions, FinishReason, GenerateRequest, HubProcessorConfig,
+    HubTokenizerConfig, Message, MessageChunk, PrefillToken, TextMessage, Token,
+};
+use crate::{
+    FunctionRef, FunctionsMap, GrammarType, Properties, TokenizerConfigToken, Tool, ToolType, Tools,
+};
+use futures::future::try_join_all;
+use minijinja::{Environment, ErrorKind, Template};
+use minijinja_contrib::pycompat;
+
+use serde_json::{json, Map, Value};
+use std::collections::HashMap;
+use std::sync::Arc;
+use thiserror::Error;
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
+use tokio::time::Instant;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tokio_stream::StreamExt;
+use tracing::instrument;
+
+pub(crate) trait Scheduler {
+    fn schedule(
+        &self,
+        request: ValidGenerateRequest, // request that already went through validation
+        permit: OwnedSemaphorePermit, // concurrency permit acquired for this request
+    ) -> Result<GenerateStreamResponse, InferError>;
+}
+
+/// Inference entry point: validates requests and hands them to the scheduler
+#[derive(Clone)]
+pub struct Infer {
+    /// Request validation (also used to tokenize inputs)
+    validation: Validation,
+    /// Request scheduler the validated requests are handed to
+    scheduler: Arc<dyn Scheduler + Send + Sync>,
+    /// Chat template, if the tokenizer or processor config provided one
+    chat_template: Option<ChatTemplate>,
+    /// Semaphore limiting the number of concurrent requests
+    limit_concurrent_requests: Arc<Semaphore>,
+}
+
+impl Infer {
+    /// Wires up validation, scheduler, optional chat template and the concurrency semaphore
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        scheduler: Arc<dyn Scheduler + Send + Sync>,
+        validation: Validation,
+        max_concurrent_requests: usize,
+        tokenizer_config: HubTokenizerConfig,
+        processor_config: HubProcessorConfig,
+    ) -> Self {
+        // The tokenizer-level template wins; the processor-level one is the fallback
+        let template_source = tokenizer_config.chat_template.or(processor_config.chat_template);
+        // With several named templates only the one called "default" is used
+        let template_text = match template_source {
+            Some(ChatTemplateVersions::Single(template)) => Some(template),
+            Some(ChatTemplateVersions::Multiple(templates)) => templates
+                .into_iter()
+                .find(|candidate| candidate.name == "default")
+                .map(|candidate| candidate.template),
+            None => None,
+        };
+        let chat_template = template_text.map(|text| {
+            ChatTemplate::new(text, tokenizer_config.bos_token, tokenizer_config.eos_token)
+        });
+        Self {
+            validation,
+            scheduler,
+            chat_template,
+            limit_concurrent_requests: Arc::new(Semaphore::new(max_concurrent_requests)),
+        }
+    }
+
+    /// Acquire a concurrency permit, validate the request and hand it to the scheduler
+    #[instrument(skip_all)]
+    pub(crate) async fn generate_stream(
+        &self,
+        request: GenerateRequest,
+    ) -> Result<GenerateStreamResponse, InferError> {
+        // Limit concurrent requests: clone only the semaphore Arc (not the whole Infer) for the permit
+        let permit = self
+            .limit_concurrent_requests
+            .clone()
+            .try_acquire_owned()
+            .map_err(|err| {
+                metrics::increment_counter!("tgi_request_failure", "err" => "overloaded");
+                tracing::error!("{err}");
+                err
+            })?;
+
+        // Validate request
+        let valid_request = self.validation.validate(request).await.map_err(|err| {
+            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+            tracing::error!("{err}");
+            err
+        })?;
+
+        self.scheduler.schedule(valid_request, permit)
+    }
+
+    /// Tokenize the request inputs without scheduling any generation
+    #[instrument(skip_all)]
+    pub(crate) async fn tokenize(
+        &self,
+        request: GenerateRequest,
+    ) -> Result<Option<tokenizers::Encoding>, InferError> {
+        // Only the raw inputs and the truncation setting are handed to validation
+        let tokenized = self
+            .validation
+            .tokenize(request.inputs, request.parameters.truncate)
+            .await;
+        match tokenized {
+            // Keep the encoding and drop the second element of the pair
+            Ok(encoding) => Ok(encoding.map(|(encoding, _)| encoding)),
+            // Log tokenization failures before handing them to the caller
+            Err(err) => {
+                tracing::error!("Tokenization {err}");
+                Err(err.into())
+            }
+        }
+    }
+
+    /// Render the chat messages through the configured chat template
+    #[instrument(skip_all)]
+    pub(crate) fn apply_chat_template(
+        &self,
+        messages: Vec<Message>,
+        grammar_with_prompt: Option<(GrammarType, String)>,
+    ) -> Result<String, InferError> {
+        // Without a configured template the call fails with `TemplateNotFound`
+        let Some(chat_template) = self.chat_template.as_ref() else {
+            return Err(InferError::TemplateError(ErrorKind::TemplateNotFound.into()));
+        };
+        chat_template.apply(messages, grammar_with_prompt).map_err(|e| {
+            metrics::increment_counter!("tgi_request_failure", "err" => "template");
+            tracing::error!("{e}");
+            e
+        })
+    }
+
+    /// Run a request to completion and collect the streamed messages into one InferResponse
+    #[instrument(skip_all)]
+    pub(crate) async fn generate(
+        &self,
+        request: GenerateRequest,
+    ) -> Result<InferResponse, InferError> {
+        let use_top_tokens = request.parameters.top_n_tokens.is_some_and(|x| x > 0);
+
+        // Create stream; `_permit` is bound to a name so the semaphore permit lives until return
+        let (_permit, _input_length, mut stream) = self.generate_stream(request).await?;
+
+        // Accumulators for the pieces of the final response
+        let mut result_prefill = Vec::new();
+        let mut result_tokens = Vec::new();
+        let mut result_top_tokens = Vec::new();
+        let mut result_generated_text = None;
+        let mut result_start = None;
+        let mut result_queued = None;
+
+        // Drain the stream; the first error message aborts the call through `?`
+        while let Some(response) = stream.next().await {
+            match response? {
+                // Keep the prefill tokens
+                InferStreamResponse::Prefill(prefill_tokens) => {
+                    result_prefill = prefill_tokens;
+                }
+                // Collect an intermediate token and its top tokens
+                InferStreamResponse::Intermediate { token, top_tokens } => {
+                    result_tokens.push(token);
+                    result_top_tokens.push(top_tokens);
+                }
+                // Final message: collect the last token
+                // and record the generated text and timings
+                InferStreamResponse::End {
+                    token,
+                    generated_text,
+                    start,
+                    queued,
+                    top_tokens,
+                } => {
+                    result_tokens.push(token);
+                    result_top_tokens.push(top_tokens);
+                    result_generated_text = Some(generated_text);
+                    result_start = Some(start);
+                    result_queued = Some(queued)
+                }
+            }
+        }
+
+        // An `End` message must have been received, otherwise the generation is incomplete
+        if let (Some(generated_text), Some(queued), Some(start)) =
+            (result_generated_text, result_queued, result_start)
+        {
+            Ok(InferResponse {
+                prefill: result_prefill,
+                _input_length,
+                tokens: result_tokens,
+                generated_text,
+                queued,
+                start,
+                top_tokens: if use_top_tokens {
+                    result_top_tokens
+                } else {
+                    Vec::new()
+                },
+            })
+        } else {
+            let err = InferError::IncompleteGeneration;
+            metrics::increment_counter!("tgi_request_failure", "err" => "incomplete");
+            tracing::error!("{err}");
+            Err(err)
+        }
+    }
+    /// Run `best_of` generations of the same request concurrently and return the sequence with
+    /// the highest mean token log probability, together with all the other sequences
+    #[instrument(skip(self, request))]
+    pub(crate) async fn generate_best_of(
+        &self,
+        request: GenerateRequest,
+        best_of: usize,
+    ) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
+        // the best_of parameter has its own validation step
+        let best_of = self.validation.validate_best_of(best_of)?;
+
+        // launch one generation per candidate and wait for all of them
+        let pending = (0..best_of).map(|_| self.generate(request.clone()));
+        let mut infer_responses: Vec<InferResponse> = try_join_all(pending).await?;
+
+        // track the candidate with the highest mean logprob (the first one wins ties)
+        let mut best_index = 0;
+        let mut best_mean_logprob: f32 = f32::MIN;
+
+        for (index, candidate) in infer_responses.iter().enumerate() {
+            // mean logprob over the generated tokens
+            let logprob_sum = candidate
+                .tokens
+                .iter()
+                .map(|token| token.logprob)
+                .sum::<f32>();
+            let mean_logprob = logprob_sum / candidate.tokens.len() as f32;
+
+            if mean_logprob > best_mean_logprob {
+                best_index = index;
+                best_mean_logprob = mean_logprob;
+            }
+        }
+        // take the winner out of the list; what is left are the other candidates
+        let best_response = infer_responses.remove(best_index);
+        Ok((best_response, infer_responses))
+    }
+}
+
+/// Custom `raise_exception` function for chat templates: always fails with `err_text`
+fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
+    Err(minijinja::Error::new(ErrorKind::SyntaxError, err_text))
+}
+
+#[derive(Clone)]
+struct ChatTemplate {
+    template: Template<'static, 'static>, // compiled from a leaked environment and source
+    bos_token: Option<String>, // passed to the template as `bos_token`
+    eos_token: Option<String>, // passed to the template as `eos_token`
+    use_default_tool_template: bool, // true when the source has no `{{tools}}` variable
+}
+
+impl ChatTemplate {
+    /// Compile `template` once; its environment and source are leaked and never freed
+    fn new(
+        template: String,
+        bos_token: Option<TokenizerConfigToken>,
+        eos_token: Option<TokenizerConfigToken>,
+    ) -> Self {
+        let mut env = Box::new(Environment::new());
+        // enable things like .strip() or .capitalize()
+        env.set_unknown_method_callback(pycompat::unknown_method_callback);
+        env.add_function("raise_exception", raise_exception);
+
+        // the default tool prompt is used unless the template mentions `{{tools}}` (spaces ignored)
+        let use_default_tool_template = !template.replace(' ', "").contains("{{tools}}");
+
+        // leaking env and the source as read-only, static resources for performance.
+        let env: &'static Environment<'static> = Box::leak(env);
+        let source: &'static str = Box::leak(template.into_boxed_str());
+        let template = env.template_from_str(source).unwrap();
+
+        Self {
+            template,
+            bos_token: bos_token.map(|token| token.as_str().to_string()),
+            eos_token: eos_token.map(|token| token.as_str().to_string()),
+            use_default_tool_template,
+        }
+    }
+
+    /// Render `messages` with the template; with the default tool template, a JSON tool grammar
+    /// and its prompt are first appended as a text chunk to the last message
+    fn apply(
+        &self,
+        mut messages: Vec<Message>,
+        grammar_with_prompt: Option<(GrammarType, String)>,
+    ) -> Result<String, InferError> {
+        if self.use_default_tool_template {
+            if let (Some(last_message), Some((GrammarType::Json(tools), tool_prompt))) =
+                (messages.last_mut(), grammar_with_prompt)
+            {
+                let text = format!("\n---\n{}\n{}", tool_prompt, tools);
+                last_message.content.push(MessageChunk::Text { text });
+            }
+        }
+
+        // The template is rendered from the `TextMessage` form of each message
+        let messages: Vec<TextMessage> = messages.into_iter().map(Into::into).collect();
+        let inputs = ChatTemplateInputs {
+            messages,
+            bos_token: self.bos_token.as_deref(),
+            eos_token: self.eos_token.as_deref(),
+            add_generation_prompt: true,
+            tools: None,
+            tools_prompt: None,
+        };
+        self.template.render(inputs).map_err(InferError::TemplateError)
+    }
+}
+
+/// Builds the JSON-schema based tool grammar from the tools and tool choice of a request
+pub struct ToolGrammar {}
+
+impl ToolGrammar {
+    /// Returns the `Tools` grammar for the selected tools, always extended with the built-in
+    /// `notify_error` function, or `Ok(None)` unless both `tools` and `tool_choice` are set.
+    /// Fails with `InferError::ToolError` instead of panicking when the chosen tool is not in
+    /// `tools`, or when a tool schema has a non-object `properties` or a non-array `required`.
+    pub fn apply(
+        tools: Option<Vec<Tool>>,
+        tool_choice: Option<ToolType>,
+    ) -> Result<Option<Tools>, InferError> {
+        let Some((req_tools, tool_choice)) = tools.zip(tool_choice) else {
+            return Ok(None);
+        };
+        // Look a tool up by name; the name comes from the request, so a miss is an error
+        let find_tool = |name: &str| {
+            req_tools
+                .iter()
+                .find(|tool| tool.function.name == name)
+                .cloned()
+                .ok_or_else(|| InferError::ToolError(format!("Tool with name {} not found", name)))
+        };
+        let tools_to_use = match &tool_choice {
+            ToolType::FunctionName(name) => vec![find_tool(name.as_str())?],
+            ToolType::Function { function } => vec![find_tool(function.name.as_str())?],
+            ToolType::OneOf => req_tools.clone(),
+        };
+
+        // adds the error notification function for LLM feedback if required
+        let mut text_response_properties = Map::new();
+        text_response_properties.insert(
+            "error".to_string(),
+            json!({
+                "type": "string",
+                "description": "The error or issue to notify"
+            }),
+        );
+        text_response_properties.insert(
+            "_name".to_string(),
+            json!({
+                "type": "string",
+                "const": "notify_error"
+            }),
+        );
+
+        let functions = tools_to_use
+            .iter()
+            .map(|tool| -> Result<(String, Value), InferError> {
+                let func = tool.function.clone();
+
+                // Clone the existing parameters, which are expected to be a JSON object
+                let mut params = match &func.arguments {
+                    Value::Object(params) => params.clone(),
+                    _ => Map::new(),
+                };
+
+                // Insert the function's description at the top level, outside of properties
+                params.insert(
+                    "description".to_string(),
+                    Value::String(func.description.clone().unwrap_or_default()),
+                );
+
+                // Ensure 'properties' exists; a user-supplied non-object value is rejected
+                let properties = params
+                    .entry("properties".to_string())
+                    .or_insert_with(|| json!({}))
+                    .as_object_mut()
+                    .ok_or_else(|| {
+                        let reason = "`properties` must be a JSON object";
+                        InferError::ToolError(format!("Tool {}: {}", func.name, reason))
+                    })?;
+
+                // Insert the constant for the function name inside 'properties'
+                properties.insert(
+                    "_name".to_string(),
+                    json!({
+                        "type": "string",
+                        "const": func.name.clone(),
+                    }),
+                );
+
+                // Ensure 'required' exists; a user-supplied non-array value is rejected
+                let required = params
+                    .entry("required".to_string())
+                    .or_insert_with(|| json!([]))
+                    .as_array_mut()
+                    .ok_or_else(|| {
+                        let reason = "`required` must be a JSON array";
+                        InferError::ToolError(format!("Tool {}: {}", func.name, reason))
+                    })?;
+
+                // Add '_name' to the 'required' array if it is not already present
+                if !required.iter().any(|r| r == "_name") {
+                    required.push(json!("_name"));
+                }
+
+                Ok((func.name, Value::Object(params)))
+            })
+            .chain([Ok((
+                "notify_error".to_string(),
+                json!({
+                    "properties": text_response_properties,
+                    "required": ["error", "_name"],
+                    "type": "object"
+                }),
+            ))])
+            .collect::<Result<HashMap<String, Value>, InferError>>()?;
+
+        let function = tools_to_use
+            .iter()
+            .map(|tool| FunctionRef {
+                ref_path: format!("#/$functions/{}", tool.function.name),
+            })
+            .chain(std::iter::once(FunctionRef {
+                ref_path: "#/$functions/notify_error".to_string(),
+            }))
+            .collect();
+
+        Ok(Some(Tools {
+            functions_map: FunctionsMap { functions },
+            properties: Properties { function },
+        }))
+    }
+}
+
+/// Result of scheduling a request: concurrency permit, input length and response stream
+pub(crate) type GenerateStreamResponse = (
+    OwnedSemaphorePermit,
+    u32, // input_length
+    UnboundedReceiverStream<Result<InferStreamResponse, InferError>>,
+);
+
+#[derive(Debug)]
+pub(crate) struct GeneratedText {
+    pub(crate) text: String, // presumably the decoded output text - TODO confirm with producer
+    pub(crate) generated_tokens: u32, // presumably the number of generated tokens - TODO confirm
+    pub(crate) finish_reason: FinishReason,
+    pub(crate) seed: Option<u64>, // presumably the sampling seed, when there is one - TODO confirm
+}
+
+#[derive(Debug)]
+pub(crate) enum InferStreamResponse {
+    /// Optional first message carrying the prefill tokens
+    Prefill(Vec<PrefillToken>),
+    /// Intermediate messages: one generated token together with its top tokens
+    Intermediate {
+        token: Token,
+        top_tokens: Vec<Token>,
+    },
+    /// Last message: the final token plus the generated text and the timing instants
+    End {
+        token: Token,
+        top_tokens: Vec<Token>,
+        generated_text: GeneratedText,
+        start: Instant,
+        queued: Instant,
+    },
+}
+
+#[derive(Debug)]
+pub(crate) struct InferResponse {
+    /// input_length is the input length as perceived by the rust tokenizer in the
+    /// validation pathway. It is redundant with prefill.len() but prefill
+    /// has data only if the user asked for it. This will always be filled.
+    pub(crate) _input_length: u32,
+    pub(crate) prefill: Vec<PrefillToken>, // from the optional `Prefill` stream message
+    pub(crate) tokens: Vec<Token>, // every streamed token, the one from `End` included
+    pub(crate) generated_text: GeneratedText, // taken from the `End` stream message
+    pub(crate) queued: Instant, // taken from the `End` stream message
+    pub(crate) start: Instant, // taken from the `End` stream message
+    pub(crate) top_tokens: Vec<Vec<Token>>, // one list per token; empty unless top_n_tokens > 0
+}
+
+#[derive(Debug, Error)]
+pub enum InferError {
+    #[error("Request failed during generation: {0}")]
+    GenerationError(String),
+    #[error("Model is overloaded")]
+    Overloaded(#[from] TryAcquireError), // no concurrency permit could be acquired
+    #[error("Input validation error: {0}")]
+    ValidationError(#[from] ValidationError),
+    #[error("Incomplete generation")]
+    IncompleteGeneration, // the response stream ended without an `End` message
+    #[error("Template error: {0}")]
+    TemplateError(#[from] minijinja::Error), // chat template missing or failed to render
+    #[error("Tool error: {0}")]
+    ToolError(String),
+}
+
+impl InferError {
+    pub(crate) fn error_type(&self) -> &str {
+        match self { // one fixed label per variant
+            Self::GenerationError(_) => "generation",
+            Self::Overloaded(_) => "overloaded",
+            Self::ValidationError(_) => "validation",
+            Self::IncompleteGeneration => "incomplete_generation",
+            Self::TemplateError(_) => "template_error",
+            Self::ToolError(_) => "tool_error",
+        }
+    }
+}
diff --git a/router/src/infer/v2/mod.rs b/router/src/infer/v2/mod.rs
new file mode 100644
index 00000000..8b4f6bab
--- /dev/null
+++ b/router/src/infer/v2/mod.rs
@@ -0,0 +1,4 @@
+mod queue;
+mod scheduler;
+
+pub(crate) use scheduler::SchedulerV2;
diff --git a/router/src/queue.rs b/router/src/infer/v2/queue.rs
similarity index 61%
rename from router/src/queue.rs
rename to router/src/infer/v2/queue.rs
index e97a168e..93cf9469 100644
--- a/router/src/queue.rs
+++ b/router/src/infer/v2/queue.rs
@@ -1,10 +1,15 @@
-use crate::infer::InferError;
-use crate::infer::InferStreamResponse;
-use crate::validation::ValidGenerateRequest;
+use crate::infer::{InferError, InferStreamResponse};
+use crate::validation::{
+    ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters,
+};
 use nohash_hasher::{BuildNoHashHasher, IntMap};
+use std::cmp::min;
 use std::collections::VecDeque;
-use text_generation_client::{Batch, Request};
-use tokio::sync::oneshot;
+use text_generation_client::v2::{
+    Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
+use text_generation_client::ChunksToString;
+use tokio::sync::{mpsc, oneshot};
 use tokio::time::Instant;
 use tracing::{info_span, instrument, Span};
 
@@ -14,7 +19,7 @@ pub(crate) struct Entry {
     /// Request
     pub request: ValidGenerateRequest,
     /// Response sender to communicate between the Infer struct and the batching_task
-    pub response_tx: flume::Sender<Result<InferStreamResponse, InferError>>,
+    pub response_tx: mpsc::UnboundedSender<Result<InferStreamResponse, InferError>>,
     /// Span that will live as long as entry
     pub span: Span,
     /// Temporary span used as a guard when logging inference, wait times...
@@ -29,21 +34,31 @@ pub(crate) struct Entry {
 #[derive(Debug, Clone)]
 pub(crate) struct Queue {
     /// Channel to communicate with the background queue task
-    queue_sender: flume::Sender<QueueCommand>,
+    queue_sender: mpsc::UnboundedSender<QueueCommand>,
 }
 
 impl Queue {
-    pub(crate) fn new(requires_padding: bool, block_size: u32) -> Self {
+    pub(crate) fn new(
+        requires_padding: bool,
+        block_size: u32,
+        window_size: Option<u32>,
+        speculate: u32,
+    ) -> Self {
         // Create channel
-        let (queue_sender, queue_receiver) = flume::unbounded();
+        let (queue_sender, queue_receiver) = mpsc::unbounded_channel();
 
         // Launch background queue task
-        tokio::spawn(queue_task(requires_padding, block_size, queue_receiver));
+        tokio::spawn(queue_task(
+            requires_padding,
+            block_size,
+            window_size,
+            speculate,
+            queue_receiver,
+        ));
 
         Self { queue_sender }
     }
 
-    /// Append an entry to the queue
     #[instrument(skip_all)]
     pub(crate) fn append(&self, entry: Entry) {
         // Send append command to the background task managing the state
@@ -58,6 +73,7 @@ impl Queue {
     pub(crate) async fn next_batch(
         &self,
         min_size: Option<usize>,
+        max_size: Option<usize>,
         prefill_token_budget: u32,
         token_budget: u32,
     ) -> Option<NextBatch> {
@@ -68,6 +84,7 @@ impl Queue {
         self.queue_sender
             .send(QueueCommand::NextBatch {
                 min_size,
+                max_size,
                 prefill_token_budget,
                 token_budget,
                 response_sender,
@@ -84,11 +101,13 @@ impl Queue {
 async fn queue_task(
     requires_padding: bool,
     block_size: u32,
-    receiver: flume::Receiver<QueueCommand>,
+    window_size: Option<u32>,
+    speculate: u32,
+    mut receiver: mpsc::UnboundedReceiver<QueueCommand>,
 ) {
-    let mut state = State::new(requires_padding, block_size);
+    let mut state = State::new(requires_padding, block_size, window_size, speculate);
 
-    while let Ok(cmd) = receiver.recv_async().await {
+    while let Some(cmd) = receiver.recv().await {
         match cmd {
             QueueCommand::Append(entry, span) => {
                 span.in_scope(|| state.append(*entry));
@@ -96,12 +115,14 @@ async fn queue_task(
             }
             QueueCommand::NextBatch {
                 min_size,
+                max_size,
                 prefill_token_budget,
                 token_budget,
                 response_sender,
                 span,
             } => span.in_scope(|| {
-                let next_batch = state.next_batch(min_size, prefill_token_budget, token_budget);
+                let next_batch =
+                    state.next_batch(min_size, max_size, prefill_token_budget, token_budget);
                 response_sender.send(next_batch).unwrap();
                 metrics::gauge!("tgi_queue_size", state.entries.len() as f64);
             }),
@@ -126,16 +147,29 @@ struct State {
 
     /// Paged Attention block size
     block_size: u32,
+
+    /// Sliding window
+    window_size: Option<u32>,
+
+    /// Speculation amount
+    speculate: u32,
 }
 
 impl State {
-    fn new(requires_padding: bool, block_size: u32) -> Self {
+    fn new(
+        requires_padding: bool,
+        block_size: u32,
+        window_size: Option<u32>,
+        speculate: u32,
+    ) -> Self {
         Self {
             entries: VecDeque::with_capacity(128),
             next_id: 0,
             next_batch_id: 0,
             requires_padding,
             block_size,
+            window_size,
+            speculate,
         }
     }
 
@@ -154,20 +188,27 @@ impl State {
     fn next_batch(
         &mut self,
         min_size: Option<usize>,
+        max_size: Option<usize>,
         prefill_token_budget: u32,
         token_budget: u32,
     ) -> Option<NextBatch> {
         if self.entries.is_empty() {
+            tracing::debug!("No queue");
             return None;
         }
 
         // Check if we have enough entries
         if let Some(min_size) = min_size {
             if self.entries.len() < min_size {
+                tracing::debug!("Not enough entries");
                 return None;
             }
         }
 
+        // Pad prefill_token_budget to be a multiple of block size
+        let prefill_token_budget =
+            ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size;
+
         // Create span for this batch to add context to inference calls
         let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty);
         next_batch_span.follows_from(&Span::current());
@@ -184,8 +225,9 @@ impl State {
         while let Some((id, mut entry)) = self.entries.pop_front() {
             // Filter entries where the response receiver was dropped (== entries where the request
             // was dropped by the client)
-            if entry.response_tx.is_disconnected() {
+            if entry.response_tx.is_closed() {
                 metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+                tracing::debug!("Dropping entry");
                 continue;
             }
 
@@ -204,22 +246,30 @@ impl State {
             if self.requires_padding {
                 decode_tokens += entry.request.stopping_parameters.max_new_tokens;
             } else {
+                let max_new_tokens = match self.window_size {
+                    None => entry.request.stopping_parameters.max_new_tokens,
+                    Some(window_size) => min(
+                        window_size.saturating_sub(entry.request.input_length),
+                        entry.request.stopping_parameters.max_new_tokens,
+                    ),
+                };
+
                 // pad to block size
                 decode_tokens +=
-                    ((entry.request.stopping_parameters.max_new_tokens + self.block_size - 1)
-                        / self.block_size)
-                        * self.block_size;
+                    ((max_new_tokens + self.block_size - 1) / self.block_size) * self.block_size;
             }
 
             if prefill_tokens > prefill_token_budget
-                || (prefill_tokens + decode_tokens) > token_budget
+                || (prefill_tokens + decode_tokens + self.speculate) > token_budget
             {
                 // Entry is over budget
                 // Add it back to the front
+                tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
                 self.entries.push_front((id, entry));
                 break;
             }
 
+            tracing::debug!("Accepting entry");
             // Create a new span to link the batch back to this entry
             let entry_batch_span = info_span!(parent: &entry.span, "infer");
             // Add relationships
@@ -231,20 +281,30 @@ impl State {
             batch_requests.push(Request {
                 id,
                 prefill_logprobs: entry.request.decoder_input_details,
-                inputs: entry.request.inputs.clone(),
+                inputs: entry.request.inputs.chunks_to_string(),
                 truncate: entry.request.truncate,
-                parameters: Some(entry.request.parameters.clone()),
-                stopping_parameters: Some(entry.request.stopping_parameters.clone()),
+                parameters: Some(NextTokenChooserParameters::from(
+                    entry.request.parameters.clone(),
+                )),
+                stopping_parameters: Some(StoppingCriteriaParameters::from(
+                    entry.request.stopping_parameters.clone(),
+                )),
                 top_n_tokens: entry.request.top_n_tokens,
             });
             // Set batch_time
             entry.batch_time = Some(Instant::now());
             // Insert in batch_entries IntMap
             batch_entries.insert(id, entry);
+
+            // Check if max_size
+            if Some(batch_requests.len()) == max_size {
+                break;
+            }
         }
 
         // Empty batch
         if batch_requests.is_empty() {
+            tracing::debug!("Filtered out all entries");
             return None;
         }
 
@@ -289,6 +349,7 @@ enum QueueCommand {
     Append(Box<Entry>, Span),
     NextBatch {
         min_size: Option<usize>,
+        max_size: Option<usize>,
         prefill_token_budget: u32,
         token_budget: u32,
         response_sender: oneshot::Sender<Option<NextBatch>>,
@@ -296,25 +357,61 @@ enum QueueCommand {
     },
 }
 
+impl From<ValidParameters> for NextTokenChooserParameters {
+    /// Build the `NextTokenChooserParameters` of a batch request from validated parameters.
+    fn from(value: ValidParameters) -> Self {
+        // The optional grammar is split into its text and a type tag; when no grammar was
+        // requested the text is empty and the tag is `GrammarType::None`
+        let (grammar, grammar_type) = match value.grammar {
+            Some(ValidGrammar::Json(json_grammar)) => (json_grammar, GrammarType::Json),
+            Some(ValidGrammar::Regex(regex_grammar)) => (regex_grammar, GrammarType::Regex),
+            None => (String::new(), GrammarType::None),
+        };
+
+        Self {
+            temperature: value.temperature,
+            top_k: value.top_k,
+            top_p: value.top_p,
+            typical_p: value.typical_p,
+            do_sample: value.do_sample,
+            seed: value.seed,
+            repetition_penalty: value.repetition_penalty,
+            frequency_penalty: value.frequency_penalty,
+            watermark: value.watermark,
+            grammar,
+            grammar_type: grammar_type.into(),
+        }
+    }
+}
+
+impl From<ValidStoppingParameters> for StoppingCriteriaParameters {
+    fn from(valid: ValidStoppingParameters) -> Self {
+        Self {
+            ignore_eos_token: valid.ignore_eos_token,
+            stop_sequences: valid.stop_sequences,
+            max_new_tokens: valid.max_new_tokens,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
-    use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters};
     use tracing::info_span;
 
     fn default_entry() -> (
         Entry,
-        flume::Receiver<Result<InferStreamResponse, InferError>>,
+        mpsc::UnboundedReceiver<Result<InferStreamResponse, InferError>>,
     ) {
-        let (response_tx, receiver_tx) = flume::unbounded();
+        let (response_tx, receiver_tx) = mpsc::unbounded_channel();
 
         let entry = Entry {
             request: ValidGenerateRequest {
-                inputs: "".to_string(),
+                inputs: vec![],
                 input_length: 0,
                 truncate: 0,
                 decoder_input_details: false,
-                parameters: NextTokenChooserParameters {
+                parameters: ValidParameters {
                     temperature: 0.0,
                     top_k: 0,
                     top_p: 0.0,
@@ -322,14 +419,17 @@ mod tests {
                     do_sample: false,
                     seed: 0,
                     repetition_penalty: 0.0,
+                    frequency_penalty: 0.0,
                     watermark: false,
+                    grammar: None,
                 },
-                stopping_parameters: StoppingCriteriaParameters {
+                stopping_parameters: ValidStoppingParameters {
                     ignore_eos_token: false,
                     max_new_tokens: 1,
                     stop_sequences: vec![],
                 },
                 top_n_tokens: 0,
+                adapter_id: None,
             },
             response_tx,
             span: info_span!("entry"),
@@ -342,7 +442,7 @@ mod tests {
 
     #[test]
     fn test_append() {
-        let mut state = State::new(false, 1);
+        let mut state = State::new(false, 1, None, 0);
         let (entry, _guard) = default_entry();
 
         assert_eq!(state.next_id, 0);
@@ -358,21 +458,21 @@ mod tests {
 
     #[test]
     fn test_next_batch_empty() {
-        let mut state = State::new(false, 1);
+        let mut state = State::new(false, 1, None, 0);
 
-        assert!(state.next_batch(None, 1, 1).is_none());
-        assert!(state.next_batch(Some(1), 1, 1).is_none());
+        assert!(state.next_batch(None, None, 1, 1).is_none());
+        assert!(state.next_batch(Some(1), None, 1, 1).is_none());
     }
 
     #[test]
     fn test_next_batch_min_size() {
-        let mut state = State::new(false, 1);
+        let mut state = State::new(false, 1, None, 0);
         let (entry1, _guard1) = default_entry();
         let (entry2, _guard2) = default_entry();
         state.append(entry1);
         state.append(entry2);
 
-        let (entries, batch, _) = state.next_batch(None, 2, 2).unwrap();
+        let (entries, batch, _) = state.next_batch(None, None, 2, 2).unwrap();
         assert_eq!(entries.len(), 2);
         assert!(entries.contains_key(&0));
         assert!(entries.contains_key(&1));
@@ -388,7 +488,7 @@ mod tests {
         let (entry3, _guard3) = default_entry();
         state.append(entry3);
 
-        assert!(state.next_batch(Some(2), 2, 2).is_none());
+        assert!(state.next_batch(Some(2), None, 2, 2).is_none());
 
         assert_eq!(state.next_id, 3);
         assert_eq!(state.entries.len(), 1);
@@ -397,14 +497,34 @@ mod tests {
     }
 
     #[test]
-    fn test_next_batch_token_budget() {
-        let mut state = State::new(false, 1);
+    fn test_next_batch_max_size() {
+        let mut state = State::new(false, 1, None, 0);
         let (entry1, _guard1) = default_entry();
         let (entry2, _guard2) = default_entry();
         state.append(entry1);
         state.append(entry2);
 
-        let (entries, batch, _) = state.next_batch(None, 1, 1).unwrap();
+        let (entries, batch, _) = state.next_batch(None, Some(1), 2, 2).unwrap();
+        assert_eq!(entries.len(), 1);
+        assert!(entries.contains_key(&0));
+        assert!(entries.get(&0).unwrap().batch_time.is_some());
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 1);
+
+        assert_eq!(state.next_id, 2);
+        assert_eq!(state.entries.len(), 1);
+        assert_eq!(state.next_batch_id, 1);
+    }
+
+    #[test]
+    fn test_next_batch_token_budget() {
+        let mut state = State::new(false, 1, None, 0);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        state.append(entry1);
+        state.append(entry2);
+
+        let (entries, batch, _) = state.next_batch(None, None, 1, 1).unwrap();
         assert_eq!(entries.len(), 1);
         assert!(entries.contains_key(&0));
         assert_eq!(batch.id, 0);
@@ -417,7 +537,7 @@ mod tests {
         let (entry3, _guard3) = default_entry();
         state.append(entry3);
 
-        let (entries, batch, _) = state.next_batch(None, 3, 3).unwrap();
+        let (entries, batch, _) = state.next_batch(None, None, 3, 3).unwrap();
         assert_eq!(entries.len(), 2);
         assert!(entries.contains_key(&1));
         assert!(entries.contains_key(&2));
@@ -431,28 +551,28 @@ mod tests {
 
     #[tokio::test]
     async fn test_queue_append() {
-        let queue = Queue::new(false, 1);
+        let queue = Queue::new(false, 1, None, 0);
         let (entry, _guard) = default_entry();
         queue.append(entry);
     }
 
     #[tokio::test]
     async fn test_queue_next_batch_empty() {
-        let queue = Queue::new(false, 1);
+        let queue = Queue::new(false, 1, None, 0);
 
-        assert!(queue.next_batch(None, 1, 1).await.is_none());
-        assert!(queue.next_batch(Some(1), 1, 1).await.is_none());
+        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
+        assert!(queue.next_batch(Some(1), None, 1, 1).await.is_none());
     }
 
     #[tokio::test]
     async fn test_queue_next_batch_min_size() {
-        let queue = Queue::new(false, 1);
+        let queue = Queue::new(false, 1, None, 0);
         let (entry1, _guard1) = default_entry();
         let (entry2, _guard2) = default_entry();
         queue.append(entry1);
         queue.append(entry2);
 
-        let (entries, batch, _) = queue.next_batch(None, 2, 2).await.unwrap();
+        let (entries, batch, _) = queue.next_batch(None, None, 2, 2).await.unwrap();
         assert_eq!(entries.len(), 2);
         assert!(entries.contains_key(&0));
         assert!(entries.contains_key(&1));
@@ -465,11 +585,11 @@ mod tests {
         queue.append(entry3);
 
         // Not enough requests pending
-        assert!(queue.next_batch(Some(2), 2, 2).await.is_none());
+        assert!(queue.next_batch(Some(2), None, 2, 2).await.is_none());
         // Not enough token budget
-        assert!(queue.next_batch(Some(1), 0, 0).await.is_none());
+        assert!(queue.next_batch(Some(1), None, 0, 0).await.is_none());
         // Ok
-        let (entries2, batch2, _) = queue.next_batch(Some(1), 2, 2).await.unwrap();
+        let (entries2, batch2, _) = queue.next_batch(Some(1), None, 2, 2).await.unwrap();
         assert_eq!(entries2.len(), 1);
         assert!(entries2.contains_key(&2));
         assert!(entries2.get(&2).unwrap().batch_time.is_some());
@@ -478,14 +598,30 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_queue_next_batch_token_budget() {
-        let queue = Queue::new(false, 1);
+    async fn test_queue_next_batch_max_size() {
+        let queue = Queue::new(false, 1, None, 0);
         let (entry1, _guard1) = default_entry();
         let (entry2, _guard2) = default_entry();
         queue.append(entry1);
         queue.append(entry2);
 
-        let (entries, batch, _) = queue.next_batch(None, 1, 1).await.unwrap();
+        let (entries, batch, _) = queue.next_batch(None, Some(1), 2, 2).await.unwrap();
+        assert_eq!(entries.len(), 1);
+        assert!(entries.contains_key(&0));
+        assert!(entries.get(&0).unwrap().batch_time.is_some());
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 1);
+    }
+
+    #[tokio::test]
+    async fn test_queue_next_batch_token_budget() {
+        let queue = Queue::new(false, 1, None, 0);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        queue.append(entry1);
+        queue.append(entry2);
+
+        let (entries, batch, _) = queue.next_batch(None, None, 1, 1).await.unwrap();
         assert_eq!(entries.len(), 1);
         assert!(entries.contains_key(&0));
         assert_eq!(batch.id, 0);
@@ -494,7 +630,7 @@ mod tests {
         let (entry3, _guard3) = default_entry();
         queue.append(entry3);
 
-        let (entries, batch, _) = queue.next_batch(None, 3, 3).await.unwrap();
+        let (entries, batch, _) = queue.next_batch(None, None, 3, 3).await.unwrap();
         assert_eq!(entries.len(), 2);
         assert!(entries.contains_key(&1));
         assert!(entries.contains_key(&2));
@@ -502,12 +638,31 @@ mod tests {
         assert_eq!(batch.size, 2);
     }
 
+    #[tokio::test]
+    async fn test_queue_next_batch_token_speculate() {
+        let queue = Queue::new(false, 1, None, 2);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        queue.append(entry1);
+        queue.append(entry2);
+
+        // Budget of 1 is not enough
+        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
+
+        let (entries, batch, _) = queue.next_batch(None, None, 6, 6).await.unwrap();
+        assert_eq!(entries.len(), 2);
+        assert!(entries.contains_key(&0));
+        assert!(entries.contains_key(&1));
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 2);
+    }
+
     #[tokio::test]
     async fn test_queue_next_batch_dropped_receiver() {
-        let queue = Queue::new(false, 1);
+        let queue = Queue::new(false, 1, None, 0);
         let (entry, _) = default_entry();
         queue.append(entry);
 
-        assert!(queue.next_batch(None, 1, 1).await.is_none());
+        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
     }
 }
diff --git a/router/src/infer/v2/scheduler.rs b/router/src/infer/v2/scheduler.rs
new file mode 100644
index 00000000..e4c3de26
--- /dev/null
+++ b/router/src/infer/v2/scheduler.rs
@@ -0,0 +1,1184 @@
+//! Batching and inference logic
+use crate::infer::v2::queue::{Entry, Queue};
+use crate::infer::{
+    GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse, Scheduler,
+};
+use crate::validation::ValidGenerateRequest;
+use crate::{FinishReason, PrefillToken, Token};
+use nohash_hasher::IntMap;
+use std::sync::{
+    atomic::{AtomicBool, Ordering},
+    Arc,
+};
+use text_generation_client::v2::{Batch, CachedBatch, Generation, ShardedClient};
+use text_generation_client::ClientError;
+use tokio::sync::mpsc::error::SendError;
+use tokio::sync::{mpsc, Notify, OwnedSemaphorePermit};
+use tokio::time::Instant;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::{info_span, instrument, Instrument, Span};
+
+pub(crate) struct SchedulerV2 {
+    /// Queue of pending requests; a clone of it is given to the background batching task
+    queue: Queue,
+    /// Notified after every append to the queue so that the batching task wakes up
+    batching_task_notifier: Arc<Notify>,
+}
+
+impl SchedulerV2 {
+    /// Create the request queue and spawn the background batching task that consumes it.
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        client: ShardedClient,
+        waiting_served_ratio: f32,
+        max_batch_prefill_tokens: u32,
+        max_batch_total_tokens: u32,
+        max_waiting_tokens: usize,
+        max_batch_size: Option<usize>,
+        requires_padding: bool,
+        window_size: Option<u32>,
+        speculate: u32,
+        generation_health: Arc<AtomicBool>,
+    ) -> Self {
+        // `FLASH_DECODING` set to "1" or "true" (any casing) selects a block size of 256;
+        // any other value, or an unreadable variable, keeps the default of 16
+        let flashdecoding = std::env::var("FLASH_DECODING")
+            .map(|raw| matches!(raw.to_lowercase().as_str(), "1" | "true"))
+            .unwrap_or(false);
+        let block_size = if flashdecoding { 256 } else { 16 };
+        let queue = Queue::new(requires_padding, block_size, window_size, speculate);
+        let batching_task_notifier = Arc::new(Notify::new());
+
+        // The background task owns the client and runs all of the inference logic
+        tokio::spawn(batching_task(
+            client,
+            waiting_served_ratio,
+            max_batch_prefill_tokens,
+            max_batch_total_tokens,
+            max_waiting_tokens,
+            max_batch_size,
+            queue.clone(),
+            batching_task_notifier.clone(),
+            generation_health,
+        ));
+
+        Self {
+            queue,
+            batching_task_notifier,
+        }
+    }
+}
+
+impl Scheduler for SchedulerV2 {
+    /// Queue `request` for batching and return the stream its responses will arrive on.
+    #[instrument(skip_all)]
+    fn schedule(
+        &self,
+        request: ValidGenerateRequest,
+        permit: OwnedSemaphorePermit,
+    ) -> Result<GenerateStreamResponse, InferError> {
+        // Copied out first because `request` is moved into the queue entry below
+        let input_length = request.input_length;
+
+        // Unbounded MPSC channel: the entry keeps the sender, the caller gets the receiver
+        let (sender, receiver) = mpsc::unbounded_channel();
+        let stream = UnboundedReceiverStream::new(receiver);
+
+        // The entry records the current tracing span and the time it entered the queue
+        let entry = Entry {
+            request,
+            response_tx: sender,
+            span: Span::current(),
+            temp_span: None,
+            queue_time: Instant::now(),
+            batch_time: None,
+        };
+        self.queue.append(entry);
+
+        // Wake the background batching task so that the new entry gets picked up
+        self.batching_task_notifier.notify_one();
+
+        // The permit and the input length travel back to the caller with the stream
+        Ok((permit, input_length, stream))
+    }
+}
+
+/// Batching logic
+/// Will be launched in a background Tokio task
+///
+/// Batches requests and sends them to the inference server
+#[allow(clippy::too_many_arguments)]
+pub(crate) async fn batching_task(
+    mut client: ShardedClient,
+    waiting_served_ratio: f32,
+    max_batch_prefill_tokens: u32,
+    max_batch_total_tokens: u32,
+    max_waiting_tokens: usize,
+    max_batch_size: Option<usize>,
+    queue: Queue,
+    notifier: Arc<Notify>,
+    generation_health: Arc<AtomicBool>,
+) {
+    loop {
+        // Wait for a notification from the Infer struct
+        notifier.notified().await;
+
+        // Get the next batch from the queue
+        // This batch might be smaller than the maximum batch size if there are not enough requests
+        // waiting in the queue
+        while let Some((mut entries, batch, span)) = queue
+            .next_batch(
+                None,
+                max_batch_size,
+                max_batch_prefill_tokens,
+                max_batch_total_tokens,
+            )
+            .await
+        {
+            let mut cached_batch = prefill(&mut client, batch, &mut entries, &generation_health)
+                .instrument(span)
+                .await;
+            let mut waiting_tokens = 1;
+
+            // We loop until we do not receive any cached batch from the inference server (== until
+            // all requests have met their stopping criteria)
+            while let Some(batch) = cached_batch {
+                let batch_size = batch.size;
+                let batch_max_tokens = batch.max_tokens;
+                let mut batches = vec![batch];
+                metrics::gauge!("tgi_batch_current_size", batch_size as f64);
+                metrics::gauge!("tgi_batch_current_max_tokens", batch_max_tokens as f64);
+
+                let min_size = if waiting_tokens >= max_waiting_tokens {
+                    // If we didn't onboard any new requests since >= max_waiting_tokens, we try
+                    // to add a new batch even though its size might be small
+                    None
+                } else {
+                    // Otherwise only onboard a batch that is large enough relative to this one
+                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                };
+
+                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
+                // Free slots under `max_batch_size`; saturating so an over-full batch gives 0
+                let max_size = max_batch_size.map(|max| max.saturating_sub(batch_size as usize));
+
+                // Try to get a new batch, but never ask the queue when no slot is left
+                let onboarded = if max_size == Some(0) {
+                    None
+                } else {
+                    queue
+                        .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)
+                        .await
+                };
+                if let Some((mut new_entries, new_batch, span)) = onboarded {
+                    if min_size.is_some() {
+                        metrics::increment_counter!("tgi_batch_concat", "reason" => "backpressure");
+                    } else {
+                        metrics::increment_counter!("tgi_batch_concat", "reason" => "wait_exceeded");
+                    }
+
+                    entries.iter_mut().for_each(|(_, entry)| {
+                        // Create a new span to add the info that this entry is waiting
+                        // because a new batch is being computed
+                        let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
+                        span.follows_from(&entry_waiting_span);
+                        entry_waiting_span.follows_from(&span);
+                        entry.temp_span = Some(entry_waiting_span);
+                    });
+
+                    // Generate one token for this new batch to have the attention past in cache
+                    let new_cached_batch =
+                        prefill(&mut client, new_batch, &mut new_entries, &generation_health)
+                            .instrument(span)
+                            .await;
+                    // Reset waiting counter
+                    waiting_tokens = 1;
+                    // Extend current batch with the new batch
+                    if let Some(new_cached_batch) = new_cached_batch {
+                        entries.extend(new_entries);
+                        batches.push(new_cached_batch);
+                    }
+                }
+
+                // Create span for this batch to add context to inference calls
+                let next_batch_size = entries.len();
+                let next_batch_span =
+                    info_span!(parent: None, "batch", batch_size = next_batch_size);
+                entries.iter_mut().for_each(|(_, entry)| {
+                    // Create a new span to link the batch back to this entry
+                    let entry_batch_span = info_span!(parent: &entry.span, "infer");
+                    // Add relationships
+                    next_batch_span.follows_from(&entry_batch_span);
+                    entry_batch_span.follows_from(&next_batch_span);
+                    // Update entry
+                    entry.temp_span = Some(entry_batch_span);
+                });
+
+                cached_batch = decode(&mut client, batches, &mut entries, &generation_health)
+                    .instrument(next_batch_span)
+                    .await;
+                waiting_tokens += 1;
+            }
+            metrics::gauge!("tgi_batch_current_size", 0.0);
+            metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
+        }
+    }
+}
+
+/// Run the prefill step for `batch` and send the resulting generations to their entries.
+///
+/// Returns the cached batch to decode next, or `None` when the call failed or nothing is
+/// left to decode.
+#[instrument(skip_all)]
+async fn prefill(
+    client: &mut ShardedClient,
+    batch: Batch,
+    entries: &mut IntMap<u64, Entry>,
+    generation_health: &Arc<AtomicBool>,
+) -> Option<CachedBatch> {
+    let started = Instant::now();
+    let batch_id = batch.id;
+    metrics::increment_counter!("tgi_batch_inference_count", "method" => "prefill");
+
+    let (generations, cached, timings) = match client.prefill(batch).await {
+        Ok(output) => output,
+        Err(err) => {
+            // Discard the whole batch: flag unhealthy, best-effort cache clear, fail entries
+            generation_health.store(false, Ordering::SeqCst);
+            let _ = client.clear_cache(Some(batch_id)).await;
+            send_errors(err, entries);
+            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill");
+            return None;
+        }
+    };
+    generation_health.store(true, Ordering::SeqCst);
+
+    // Send generated tokens and drop the entries that stopped, then filter the cached
+    // batch so that it only keeps the requests still present in `entries`
+    let filter_started = Instant::now();
+    filter_send_generations(generations, entries);
+    let cached = filter_batch(client, cached, entries).await;
+
+    metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "prefill");
+    metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "prefill");
+    metrics::histogram!("tgi_batch_filter_duration", filter_started.elapsed().as_secs_f64(), "method" => "prefill");
+    metrics::histogram!("tgi_batch_inference_duration", started.elapsed().as_secs_f64(), "method" => "prefill");
+    metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill");
+    cached
+}
+
+/// Run one decode step over `batches` and send the resulting generations to their entries.
+/// Returns the cached batch to decode next, or `None` on failure or when nothing is left.
+#[instrument(skip_all)]
+async fn decode(
+    client: &mut ShardedClient,
+    batches: Vec<CachedBatch>,
+    entries: &mut IntMap<u64, Entry>,
+    generation_health: &Arc<AtomicBool>,
+) -> Option<CachedBatch> {
+    let started = Instant::now();
+    // Collect the ids up front: `batches` is moved into the client call below
+    let batch_ids: Vec<u64> = batches.iter().map(|cached| cached.id).collect();
+    metrics::increment_counter!("tgi_batch_inference_count", "method" => "decode");
+
+    let (generations, cached, timings) = match client.decode(batches).await {
+        Ok(output) => output,
+        Err(err) => {
+            // Discard every batch: flag unhealthy, best-effort cache clears, fail entries
+            generation_health.store(false, Ordering::SeqCst);
+            for id in batch_ids {
+                let _ = client.clear_cache(Some(id)).await;
+            }
+            send_errors(err, entries);
+            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode");
+            return None;
+        }
+    };
+    generation_health.store(true, Ordering::SeqCst);
+
+    // Send generated tokens and drop the entries that stopped, then filter the cached
+    // batch so that it only keeps the requests still present in `entries`
+    let filter_started = Instant::now();
+    filter_send_generations(generations, entries);
+    let cached = filter_batch(client, cached, entries).await;
+
+    if let Some(concat_duration) = timings.concat {
+        metrics::histogram!("tgi_batch_concat_duration", concat_duration.as_secs_f64(), "method" => "decode");
+    }
+    metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "decode");
+    metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "decode");
+    metrics::histogram!("tgi_batch_filter_duration", filter_started.elapsed().as_secs_f64(), "method" => "decode");
+    metrics::histogram!("tgi_batch_inference_duration", started.elapsed().as_secs_f64(), "method" => "decode");
+    metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode");
+    cached
+}
+
+/// Prune `next_batch` down to the requests that are still present in `entries`.
+///
+/// Returns `None` when there is no batch or when none of its requests is left.
+#[instrument(skip_all)]
+async fn filter_batch(
+    client: &mut ShardedClient,
+    next_batch: Option<CachedBatch>,
+    entries: &IntMap<u64, Entry>,
+) -> Option<CachedBatch> {
+    let mut batch = next_batch?;
+
+    // The batch holds exactly as many requests as `entries`: no need to filter, it is
+    // returned untouched
+    if batch.size as usize == entries.len() {
+        return Some(batch);
+    }
+
+    // Keep only the ids of requests that are still in `entries`
+    let batch_id = batch.id;
+    batch.request_ids.retain(|request_id| entries.contains_key(request_id));
+
+    // Both shard calls below are unwrapped on purpose: we need to panic since we cannot
+    // recover if either of them fails
+    if !batch.request_ids.is_empty() {
+        // Filter the Python shard cache down to the remaining requests
+        return client.filter_batch(batch_id, batch.request_ids).await.unwrap();
+    }
+
+    // Every request was filtered out: clear the now empty batch from the shards cache
+    client.clear_cache(Some(batch_id)).await.unwrap();
+    None
+}
+
+/// Forward every `Generation` to the entry that requested it and remove from `entries`
+/// the ones that are reported as stopped or whose response channel failed
+#[instrument(skip_all)]
+fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
+    for generation in generations {
+        let id = generation.request_id;
+        // The request id should always be in the entries, hence the `expect`
+        let entry = entries
+            .get(&id)
+            .expect("ID not found in entries. This is a bug.");
+        // Create and enter a span to link this function back to the entry
+        let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
+        // An error from the response channel means that the client dropped the request,
+        // so generation must stop for this entry: the error maps to `stopped == true`
+        let stopped = match send_responses(generation, entry) {
+            Ok(stopped) => stopped,
+            Err(_) => {
+                tracing::error!("Entry response channel error.");
+                metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+                true
+            }
+        };
+        if stopped {
+            entries.remove(&id).expect("ID not found in entries. This is a bug.");
+        }
+    }
+}
+
+/// Send responses through the `entry` response channel
+///
+/// Returns `Ok(true)` once the entry is done (receiver already closed, or final token of a
+/// finished generation sent), `Ok(false)` otherwise; a failed send is returned as `Err`.
+fn send_responses(
+    generation: Generation,
+    entry: &Entry,
+) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
+    // Return directly if the channel is disconnected
+    if entry.response_tx.is_closed() {
+        metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+        return Ok(true);
+    }
+
+    let mut stopped = false;
+
+    if let Some(prefill_tokens) = generation.prefill_tokens {
+        // Create Token objects
+        // We do that here instead of in the Python code as Rust for loops are faster
+        let prefill_tokens = prefill_tokens
+            .ids
+            .into_iter()
+            .zip(prefill_tokens.logprobs)
+            .zip(prefill_tokens.texts)
+            .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
+            .collect();
+
+        entry
+            .response_tx
+            .send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
+    }
+
+    // Tokens beyond the first count as skipped; saturate so an empty list cannot underflow
+    let tokens_ = generation.tokens.expect("Non empty tokens in generation");
+    let n = tokens_.ids.len();
+    metrics::histogram!("tgi_request_skipped_tokens", n.saturating_sub(1) as f64);
+    let mut iterator = tokens_
+        .ids
+        .into_iter()
+        .zip(tokens_.logprobs)
+        .zip(tokens_.texts)
+        .zip(tokens_.is_special)
+        .enumerate()
+        .peekable();
+    while let Some((i, (((id, logprob), text), special))) = iterator.next() {
+        let token = Token {
+            id,
+            text,
+            logprob,
+            special,
+        };
+        let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
+            top_tokens_
+                .ids
+                .iter()
+                .zip(top_tokens_.logprobs.iter())
+                .zip(top_tokens_.texts.iter())
+                .zip(top_tokens_.is_special.iter())
+                .map(|(((&id, &logprob), text), &special)| Token {
+                    id,
+                    text: text.to_string(),
+                    logprob,
+                    special,
+                })
+                .collect()
+        } else {
+            vec![]
+        };
+        match (&generation.generated_text, iterator.peek()) {
+            (Some(generated_text), None) => {
+                // Last token of a finished generation: it is sent as the `End` message
+                stopped = true;
+                entry.response_tx.send(Ok(InferStreamResponse::End {
+                    token,
+                    top_tokens,
+                    generated_text: GeneratedText::from(generated_text.clone()),
+                    queued: entry.queue_time,
+                    start: entry.batch_time.unwrap(),
+                }))?;
+            }
+            _ => {
+                // Any other token is sent as an `Intermediate` message
+                entry
+                    .response_tx
+                    .send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
+            }
+        }
+    }
+
+    Ok(stopped)
+}
+
+/// Send errors to Infer for all `entries`
+#[instrument(skip_all)]
+fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
+    entries.drain().for_each(|(_, entry)| {
+        // Create and enter a span to link this function back to the entry
+        let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
+        let err = InferError::GenerationError(error.to_string());
+        metrics::increment_counter!("tgi_request_failure", "err" => "generation");
+        tracing::error!("{err}");
+
+        // unwrap_or is valid here as we don't care if the receiver is gone.
+        entry
+            .response_tx
+            .send(Err(err))
+            .unwrap_or(());
+    });
+}
+
+impl From<text_generation_client::v2::GeneratedText> for GeneratedText {
+    /// Convert the v2 client's generated text into this crate's `GeneratedText`.
+    /// Panics when `finish_reason` cannot be converted to a v2 `FinishReason`.
+    fn from(value: text_generation_client::v2::GeneratedText) -> Self {
+        use text_generation_client::v2::FinishReason as V2FinishReason;
+        let finish_reason = match V2FinishReason::try_from(value.finish_reason).unwrap() {
+            V2FinishReason::Length => FinishReason::Length,
+            V2FinishReason::EosToken => FinishReason::EndOfSequenceToken,
+            V2FinishReason::StopSequence => FinishReason::StopSequence,
+        };
+        Self {
+            text: value.text,
+            generated_tokens: value.generated_tokens,
+            finish_reason,
+            seed: value.seed,
+        }
+    }
+}
+
+// tests
+#[cfg(test)]
+mod tests {
+    use crate::infer::raise_exception;
+    use crate::{ChatTemplateInputs, TextMessage};
+    use minijinja::Environment;
+
+    /// Renders a "### Role:" style template over a four-message conversation with
+    /// `add_generation_prompt` enabled and checks the exact rendered text.
+    #[test]
+    fn test_chat_template() {
+        let environment = Environment::new();
+
+        let source = r#"
+        {% for message in messages %}
+            {% if message['role'] == 'system' %}
+                {% if message['content']%}
+                    {{'### System:\n' + message['content']+'\n\n'}}
+                {% endif %}
+            {% elif message['role'] == 'user' %}
+                {{'### User:\n' + message['content']+'\n\n'}}
+            {% elif message['role'] == 'assistant' %}
+                {{'### Assistant:\n'  + message['content']}}
+            {% endif %}
+            {% if loop.last and add_generation_prompt %}
+                {{ '### Assistant:\n' }}
+            {% endif %}
+        {% endfor %}"#;
+
+        // Strip the layout whitespace: trim each line, then concatenate the pieces
+        // with no separator so indentation and line breaks never reach the output.
+        let source: String = source.lines().map(str::trim).collect();
+
+        let template = environment.template_from_str(&source);
+
+        let inputs = ChatTemplateInputs {
+            messages: vec![
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("Hi!"),
+                },
+                TextMessage {
+                    role: String::from("assistant"),
+                    content: String::from("Hello how can I help?"),
+                },
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("What is Deep Learning?"),
+                },
+                TextMessage {
+                    role: String::from("assistant"),
+                    content: String::from("magic!"),
+                },
+            ],
+            bos_token: Some("[BOS]"),
+            eos_token: Some("[EOS]"),
+            add_generation_prompt: true,
+            ..Default::default()
+        };
+
+        let rendered = template.unwrap().render(inputs).unwrap();
+
+        // The trailing "### Assistant:\n" comes from the `loop.last and add_generation_prompt` branch.
+        assert_eq!(
+            rendered,
+            "### User:\nHi!\n\n### Assistant:\nHello how can I help?### User:\nWhat is Deep Learning?\n\n### Assistant:\nmagic!### Assistant:\n"
+        );
+    }
+
+    /// Feeds two consecutive `user` messages to a template that enforces role
+    /// alternation and expects rendering to fail with the `raise_exception` message.
+    #[test]
+    fn test_chat_template_invalid_with_raise() {
+        let mut environment = Environment::new();
+        environment.add_function("raise_exception", raise_exception);
+
+        let source = r#"
+        {{ bos_token }}
+        {% for message in messages %}
+        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+        {% endif %}
+        {% if message['role'] == 'user' %}
+        {{ '[INST] ' + message['content'] + ' [/INST]' }}
+        {% elif message['role'] == 'assistant' %}
+        {{ message['content'] + eos_token}}
+        {% else %}
+        {{ raise_exception('Only user and assistant roles are supported!') }}
+        {% endif %}
+        {% endfor %}"#;
+
+        // Strip the layout whitespace: trim each line, then concatenate the pieces
+        // with no separator so indentation and line breaks never reach the output.
+        let source: String = source.lines().map(str::trim).collect();
+
+        let template = environment.template_from_str(&source);
+
+        // The first two messages both come from `user`, breaking the alternation.
+        let inputs = ChatTemplateInputs {
+            messages: vec![
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("Hi!"),
+                },
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("Hi again!"),
+                },
+                TextMessage {
+                    role: String::from("assistant"),
+                    content: String::from("Hello how can I help?"),
+                },
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("What is Deep Learning?"),
+                },
+                TextMessage {
+                    role: String::from("assistant"),
+                    content: String::from("magic!"),
+                },
+            ],
+            bos_token: Some("[BOS]"),
+            eos_token: Some("[EOS]"),
+            add_generation_prompt: true,
+            ..Default::default()
+        };
+
+        let outcome = template.unwrap().render(inputs);
+
+        // Rendering must fail, and the error detail must carry the template's message.
+        let error = match outcome {
+            Err(error) => error,
+            Ok(_) => panic!("Should have failed"),
+        };
+        assert_eq!(
+            error.detail().unwrap(),
+            "Conversation roles must alternate user/assistant/user/assistant/..."
+        );
+    }
+
+    /// Renders the alternation-enforcing template over a correctly alternating
+    /// conversation and checks the exact output; no `raise_exception` call fires.
+    #[test]
+    fn test_chat_template_valid_with_raise() {
+        let mut environment = Environment::new();
+        environment.add_function("raise_exception", raise_exception);
+
+        let source = r#"
+        {{ bos_token }}
+        {% for message in messages %}
+        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+        {% endif %}
+        {% if message['role'] == 'user' %}
+        {{ '[INST] ' + message['content'] + ' [/INST]' }}
+        {% elif message['role'] == 'assistant' %}
+        {{ message['content'] + eos_token}}
+        {% else %}
+        {{ raise_exception('Only user and assistant roles are supported!') }}
+        {% endif %}
+        {% endfor %}"#;
+
+        // Strip the layout whitespace: trim each line, then concatenate the pieces
+        // with no separator so indentation and line breaks never reach the output.
+        let source: String = source.lines().map(str::trim).collect();
+
+        let template = environment.template_from_str(&source);
+
+        let inputs = ChatTemplateInputs {
+            messages: vec![
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("Hi!"),
+                },
+                TextMessage {
+                    role: String::from("assistant"),
+                    content: String::from("Hello how can I help?"),
+                },
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("What is Deep Learning?"),
+                },
+                TextMessage {
+                    role: String::from("assistant"),
+                    content: String::from("magic!"),
+                },
+            ],
+            bos_token: Some("[BOS]"),
+            eos_token: Some("[EOS]"),
+            add_generation_prompt: true,
+            ..Default::default()
+        };
+
+        let rendered = template.unwrap().render(inputs).unwrap();
+        // `[BOS]` leads the output and `[EOS]` follows each assistant message.
+        assert_eq!(rendered, "[BOS][INST] Hi! [/INST]Hello how can I help?[EOS][INST] What is Deep Learning? [/INST]magic![EOS]");
+    }
+
+    /// Renders an `<|im_start|>`/`<|im_end|>` template with `add_generation_prompt`
+    /// enabled and checks that the assistant header is appended after the messages.
+    #[test]
+    fn test_chat_template_valid_with_add_generation_prompt() {
+        let mut environment = Environment::new();
+        environment.add_function("raise_exception", raise_exception);
+
+        let source = r#"
+        {% for message in messages %}
+        {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
+        {% endfor %}
+        {% if add_generation_prompt %}
+            {{ '<|im_start|>assistant\n' }}
+        {% endif %}"#;
+
+        // Strip the layout whitespace: trim each line, then concatenate the pieces
+        // with no separator so indentation and line breaks never reach the output.
+        let source: String = source.lines().map(str::trim).collect();
+
+        let template = environment.template_from_str(&source);
+
+        let inputs = ChatTemplateInputs {
+            messages: vec![
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("Hi!"),
+                },
+                TextMessage {
+                    role: String::from("assistant"),
+                    content: String::from("Hello how can I help?"),
+                },
+                TextMessage {
+                    role: String::from("user"),
+                    content: String::from("What is Deep Learning?"),
+                },
+                TextMessage {
+                    role: String::from("assistant"),
+                    content: String::from("magic!"),
+                },
+            ],
+            bos_token: Some("[BOS]"),
+            eos_token: Some("[EOS]"),
+            add_generation_prompt: true,
+            ..Default::default()
+        };
+
+        let rendered = template.unwrap().render(inputs).unwrap();
+        // The final `<|im_start|>assistant\n` is emitted because `add_generation_prompt` is true.
+        assert_eq!(rendered, "<|im_start|>user\nHi!<|im_end|>\n<|im_start|>assistant\nHello how can I help?<|im_end|>\n<|im_start|>user\nWhat is Deep Learning?<|im_end|>\n<|im_start|>assistant\nmagic!<|im_end|>\n<|im_start|>assistant\n");
+    }
+
+    struct ChatTemplateTestItem { // one table-driven case for `test_many_chat_templates`
+        name: &'static str, // label for the case; destructured by the test but otherwise unused
+        chat_template: &'static str, // template source handed to `template_from_str`
+        input: ChatTemplateInputs<'static>, // inputs the parsed template is rendered with
+        target: &'static str, // exact text the rendered output is compared against
+    }
+
+    #[test]
+    fn test_many_chat_templates() {
+        let example_chat = vec![
+            TextMessage {
+                role: "user".to_string(),
+                content: "Hello, how are you?".to_string(),
+            },
+            TextMessage {
+                role: "assistant".to_string(),
+                content: "I'm doing great. How can I help you today?".to_string(),
+            },
+            TextMessage {
+                role: "user".to_string(),
+                content: "I'd like to show off how chat templating works!".to_string(),
+            },
+        ];
+
+        let example_chat_with_system = [TextMessage {
+            role: "system".to_string(),
+            content: "You are a friendly chatbot who always responds in the style of a pirate"
+                .to_string(),
+        }]
+        .iter()
+        .chain(&example_chat)
+        .cloned()
+        .collect::<Vec<_>>();
+
+        let test_default_templates = vec![
+            ChatTemplateTestItem {
+                name: "_base",
+                chat_template: "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some(""),
+                    ..Default::default()
+                },
+                target: "<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
+            },
+            ChatTemplateTestItem {
+                name: "blenderbot",
+                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: " Hello, how are you?  I'm doing great. How can I help you today?   I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "blenderbot_small",
+                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: " Hello, how are you?  I'm doing great. How can I help you today?   I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "bloom",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?</s>I'm doing great. How can I help you today?</s>I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "gpt_neox",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("<|endoftext|>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
+            },
+            ChatTemplateTestItem {
+                name: "gpt2",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("<|endoftext|>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
+            },
+            ChatTemplateTestItem {
+                name: "llama",
+                // NOTE: the `.strip()` has been replaced with `| trim` in the following template
+                chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token +'[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content | trim + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat_with_system.clone(),
+                    add_generation_prompt: true,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>[INST] <<SYS>>\nYou are a friendly chatbot who always responds in the style of a pirate\n<</SYS>>\n\nHello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]",
+            },
+            ChatTemplateTestItem {
+                name: "whisper",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: true,
+                    bos_token: Some(""),
+                    eos_token: Some("<|endoftext|>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
+            },
+        ];
+
+        #[allow(unused_variables)] // name is unused
+        for ChatTemplateTestItem {
+            name,
+            chat_template,
+            input,
+            target,
+        } in test_default_templates
+        {
+            let mut env = Environment::new();
+            env.add_function("raise_exception", raise_exception);
+            let tmpl = env.template_from_str(chat_template);
+            let result = tmpl.unwrap().render(input).unwrap();
+            assert_eq!(result, target);
+        }
+
+        let test_custom_templates = vec![
+            ChatTemplateTestItem {
+                name: "HuggingFaceH4/zephyr-7b-beta (add_generation_prompt=false)",
+                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat_with_system.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s><|user|>\nHello, how are you?</s><|assistant|>\nI'm doing great. How can I help you today?</s><|user|>\nI'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "HuggingFaceH4/zephyr-7b-beta (add_generation_prompt=true)",
+                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: vec![
+                        TextMessage {
+                            role: "system".to_string(),
+                            content: "You are a friendly chatbot who always responds in the style of a pirate".to_string(),
+                        },
+                        TextMessage {
+                            role: "user".to_string(),
+                            content: "How many helicopters can a human eat in one sitting?".to_string(),
+                        },
+                    ],
+                    add_generation_prompt: true,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s><|user|>\nHow many helicopters can a human eat in one sitting?</s><|assistant|>",
+            },
+            ChatTemplateTestItem {
+                name: "HuggingFaceH4/zephyr-7b-gemma-v0.1",
+                chat_template: "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<bos>"),
+                    eos_token: Some("<eos>"),
+                    ..Default::default()
+                },
+                target: "<bos><|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
+            },
+            ChatTemplateTestItem {
+                name: "mistralai/Mistral-7B-Instruct-v0.1",
+                chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]",
+            },
+            ChatTemplateTestItem {
+                name: "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s>[INST] I'd like to show off how chat templating works! [/INST]",
+            },
+            ChatTemplateTestItem {
+                name: "cognitivecomputations/dolphin-2.5-mixtral-8x7b",
+                chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
+            },
+            ChatTemplateTestItem {
+                name: "openchat/openchat-3.5-0106",
+                // `.title()` has been replaced with `| upper` in the following template
+                chat_template: "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + (message['role'] | title) + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>",
+            },
+            ChatTemplateTestItem {
+                name: "upstage/SOLAR-10.7B-Instruct-v1.0",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?</s>I'm doing great. How can I help you today?</s>I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "codellama/CodeLlama-70b-Instruct-hf",
+                // NOTE: `.strip()` has been replaced with `| trim` in the following template
+                chat_template: "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\\n\\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\\nDestination: user\\n\\n '}}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>Source: user\n\n Hello, how are you? <step> Source: assistant\n\n I'm doing great. How can I help you today? <step> Source: user\n\n I'd like to show off how chat templating works! <step> Source: assistant\nDestination: user\n\n ",
+            },
+            ChatTemplateTestItem {
+                name: "Deci/DeciLM-7B-instruct",
+                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\\n'  + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "### User:\nHello, how are you?### Assistant:\nI'm doing great. How can I help you today?### User:\nI'd like to show off how chat templating works!",
+            },
+            ChatTemplateTestItem {
+                name: "Qwen/Qwen1.5-72B-Chat",
+                chat_template: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!",
+            },
+            ChatTemplateTestItem {
+                name: "deepseek-ai/deepseek-llm-7b-chat",
+                chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\\n\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<｜begin▁of▁sentence｜>"),
+                    eos_token: Some("<｜end▁of▁sentence｜>"),
+                    ..Default::default()
+                },
+                target: "<｜begin▁of▁sentence｜>User: Hello, how are you?\n\nAssistant: I'm doing great. How can I help you today?<｜end▁of▁sentence｜>User: I'd like to show off how chat templating works!\n\n",
+            },
+            ChatTemplateTestItem {
+                name: "h2oai/h2o-danube-1.8b-chat",
+                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|prompt|>Hello, how are you?</s><|answer|>I'm doing great. How can I help you today?</s><|prompt|>I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "internlm/internlm2-chat-7b",
+                chat_template: "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s><|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
+            },
+            ChatTemplateTestItem {
+                name: "TheBloke/deepseek-coder-33B-instruct-AWQ",
+                chat_template: "{%- set found_item = false -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set found_item = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<｜begin▁of▁sentence｜>"),
+                    eos_token: Some("<|EOT|>"),
+                    ..Default::default()
+                },
+                target: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n### Instruction:\nHello, how are you?\n### Response:\nI'm doing great. How can I help you today?\n<|EOT|>\n### Instruction:\nI'd like to show off how chat templating works!\n### Response:\n",
+            },
+            ChatTemplateTestItem {
+                name: "ericzzz/falcon-rw-1b-chat",
+                // `.strip()` has been replaced with `| trim` in the following template
+                chat_template: "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'] | trim }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] | trim }}{% elif message['role'] == 'assistant' %}{{ '[RESP] '  + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<|endoftext|>"),
+                    eos_token: Some("<|endoftext|>"),
+                    ..Default::default()
+                },
+                target: "[INST] Hello, how are you? [RESP] I'm doing great. How can I help you today?<|endoftext|>[INST] I'd like to show off how chat templating works!",
+            },
+            ChatTemplateTestItem {
+                name: "abacusai/Smaug-34B-v0.1",
+                chat_template: "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]",
+            },
+            ChatTemplateTestItem {
+                name: "maywell/Synatra-Mixtral-8x7B",
+                chat_template: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "Below is an instruction that describes a task. Write a response that appropriately completes the request.### Instruction:Hello, how are you?### Response:I'm doing great. How can I help you today?### Instruction:I'd like to show off how chat templating works!",
+            },
+            ChatTemplateTestItem {
+                name: "deepseek-ai/deepseek-coder-33b-instruct",
+                chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<｜begin▁of▁sentence｜>"),
+                    eos_token: Some("</EOT>"),
+                    ..Default::default()
+                },
+                target: "<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\nHello, how are you?\n### Response:\nI'm doing great. How can I help you today?\n<|EOT|>\n### Instruction:\nI'd like to show off how chat templating works!\n",
+            },
+            // NOT INCLUDED
+            // - meetkai/functionary-medium-v2.2
+            // - fireworks-ai/firefunction-v1
+            // https://github
+            ChatTemplateTestItem {
+                name: "maywell/PiVoT-MoE",
+                chat_template: "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content']|trim }}{% elif message['role'] == 'user' %}### Instruction: {{ message['content']|trim }}{% elif message['role'] == 'assistant' %}### Response: {{ message['content']|trim }}{% elif message['role'] == 'user_context' %}### Input: {{ message['content']|trim }}{% endif %}{% if not loop.last %}\n{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}### Response:{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat_with_system.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "You are a friendly chatbot who always responds in the style of a pirateYou are a friendly chatbot who always responds in the style of a pirate### Instruction: Hello, how are you?### Response: I'm doing great. How can I help you today?### Instruction: I'd like to show off how chat templating works!",
+            },
+        ];
+
+        #[allow(unused_variables)] // name is unused
+        for ChatTemplateTestItem {
+            name,
+            chat_template,
+            input,
+            target,
+        } in test_custom_templates
+        {
+            let mut env = Environment::new();
+            env.add_function("raise_exception", raise_exception);
+            // trim all the whitespace
+            let chat_template = chat_template
+                .lines()
+                .map(|line| line.trim())
+                .collect::<Vec<&str>>()
+                .join("");
+
+            let tmpl = env.template_from_str(&chat_template);
+            let result = tmpl.unwrap().render(input).unwrap();
+            assert_eq!(result, target);
+        }
+    }
+}
diff --git a/router/src/infer/v3/block_allocator.rs b/router/src/infer/v3/block_allocator.rs
new file mode 100644
index 00000000..7467fd85
--- /dev/null
+++ b/router/src/infer/v3/block_allocator.rs
@@ -0,0 +1,136 @@
+use std::cmp::min;
+use tokio::sync::{mpsc, oneshot};
+
+/// Handle over a set of blocks handed out by a [`BlockAllocator`].
+///
+/// Dropping the handle sends its blocks back to the allocator.
+///
+/// NOTE(review): the type derives `Clone`, and every clone would free the same blocks again
+/// when dropped. Nothing visible in this module clones an allocation; confirm before relying
+/// on `Clone`.
+#[derive(Debug, Clone)]
+pub(crate) struct BlockAllocation {
+    /// Ids of the blocks reserved by this allocation
+    pub blocks: Vec<u32>,
+    /// Slot ids derived from `blocks`, one per requested token
+    pub slots: Vec<u32>,
+    /// Allocator the blocks are returned to on drop
+    block_allocator: BlockAllocator,
+}
+
+impl Drop for BlockAllocation {
+    fn drop(&mut self) {
+        // The value is going away: move the ids out instead of cloning the whole vector
+        self.block_allocator.free(std::mem::take(&mut self.blocks))
+    }
+}
+
+/// Clonable handle to the background task that owns the list of free blocks.
+#[derive(Debug, Clone)]
+pub(crate) struct BlockAllocator {
+    /// Channel to communicate with the background task
+    block_allocator: mpsc::UnboundedSender<BlockAllocatorCommand>,
+}
+
+impl BlockAllocator {
+    /// Spawn the allocator task and return a handle to it.
+    ///
+    /// The task is given `max_batch_total_tokens / block_size` blocks (integer division).
+    /// Uses `tokio::spawn`, so it must be called from within a Tokio runtime.
+    pub(crate) fn new(
+        max_batch_total_tokens: u32,
+        block_size: u32,
+        window_size: Option<u32>,
+    ) -> Self {
+        // Create channel
+        let (sender, receiver) = mpsc::unbounded_channel();
+
+        // Launch background queue task
+        tokio::spawn(block_allocator_task(
+            max_batch_total_tokens / block_size,
+            block_size,
+            window_size,
+            receiver,
+        ));
+
+        Self {
+            block_allocator: sender,
+        }
+    }
+
+    /// Reserve enough blocks to hold `tokens` tokens.
+    ///
+    /// Returns `None` when the allocator does not have enough free blocks.
+    ///
+    /// # Panics
+    /// Panics if the allocator task is no longer running.
+    pub(crate) async fn allocate(&self, tokens: u32) -> Option<BlockAllocation> {
+        let (response_sender, response_receiver) = oneshot::channel();
+        self.block_allocator
+            .send(BlockAllocatorCommand::Allocate {
+                tokens,
+                response_sender,
+            })
+            .unwrap();
+
+        response_receiver
+            .await
+            .unwrap()
+            .map(|(blocks, slots)| BlockAllocation {
+                blocks,
+                slots,
+                block_allocator: self.clone(),
+            })
+    }
+
+    /// Hand `blocks` back to the allocator task.
+    ///
+    /// This is called from `BlockAllocation::drop`, so it must not panic: a panic raised in a
+    /// destructor while another panic is unwinding aborts the process.
+    pub(crate) fn free(&self, blocks: Vec<u32>) {
+        // The send only fails when the allocator task has stopped; the free list is gone with
+        // it, so there is nothing left to return the blocks to.
+        let _ = self
+            .block_allocator
+            .send(BlockAllocatorCommand::Free { blocks });
+    }
+}
+
+/// Background task owning the list of free blocks.
+///
+/// Serves `Allocate` and `Free` commands until every sender has been dropped.
+/// `blocks` is the total number of blocks; block 0 is never handed out.
+/// When `window_size` is set, at most one window worth of blocks is reserved and the slots
+/// of those blocks are reused cyclically to cover the requested tokens.
+async fn block_allocator_task(
+    blocks: u32,
+    block_size: u32,
+    window_size: Option<u32>,
+    mut receiver: mpsc::UnboundedReceiver<BlockAllocatorCommand>,
+) {
+    // Block 0 is reserved for health checks
+    let mut free_blocks: Vec<u32> = (1..blocks).collect();
+    while let Some(cmd) = receiver.recv().await {
+        match cmd {
+            BlockAllocatorCommand::Free { blocks } => free_blocks.extend(blocks),
+            BlockAllocatorCommand::Allocate {
+                tokens,
+                response_sender,
+            } => {
+                // Apply window size
+                let (required_blocks, repeats) = {
+                    let (tokens, repeats) = match window_size {
+                        None => (tokens, 1),
+                        Some(window_size) => {
+                            let repeats = (tokens + window_size - 1) / window_size;
+                            let tokens = min(tokens, window_size);
+                            (tokens, repeats as usize)
+                        }
+                    };
+                    // Pad to a multiple of block size
+                    let required_blocks = (tokens + block_size - 1) / block_size;
+                    (required_blocks, repeats)
+                };
+
+                let tokens = tokens as usize;
+                let allocation = if required_blocks > free_blocks.len() as u32 {
+                    None
+                } else {
+                    // Take the blocks from the tail of the free list
+                    let blocks =
+                        free_blocks.split_off(free_blocks.len() - required_blocks as usize);
+                    // The loop below stops once exactly `tokens` slots were produced
+                    let mut slots = Vec::with_capacity(tokens);
+
+                    'slots: for block_id in blocks.repeat(repeats).iter() {
+                        for s in (block_id * block_size)..((block_id + 1) * block_size) {
+                            slots.push(s);
+                            if slots.len() == tokens {
+                                break 'slots;
+                            }
+                        }
+                    }
+                    Some((blocks, slots))
+                };
+                // `send` gives the value back when the requester is gone. Return the blocks
+                // to the free list instead of panicking and losing them.
+                if let Err(Some((blocks, _))) = response_sender.send(allocation) {
+                    free_blocks.extend(blocks);
+                }
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+enum BlockAllocatorCommand { // messages handled by `block_allocator_task`
+    Free { // put `blocks` back on the free list
+        blocks: Vec<u32>,
+    },
+    Allocate { // reserve blocks able to hold `tokens` tokens
+        tokens: u32,
+        response_sender: oneshot::Sender<Option<(Vec<u32>, Vec<u32>)>>, // receives `(blocks, slots)`, or `None` when too few blocks are free
+    },
+}
diff --git a/router/src/infer/v3/mod.rs b/router/src/infer/v3/mod.rs
new file mode 100644
index 00000000..f9effab8
--- /dev/null
+++ b/router/src/infer/v3/mod.rs
@@ -0,0 +1,5 @@
+mod block_allocator;
+mod queue;
+mod scheduler;
+
+pub(crate) use scheduler::SchedulerV3;
diff --git a/router/src/infer/v3/queue.rs b/router/src/infer/v3/queue.rs
new file mode 100644
index 00000000..ba65b9b6
--- /dev/null
+++ b/router/src/infer/v3/queue.rs
@@ -0,0 +1,732 @@
+use crate::infer::v3::block_allocator::{BlockAllocation, BlockAllocator};
+use crate::infer::InferError;
+use crate::infer::InferStreamResponse;
+use crate::validation::{
+    ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters,
+};
+use nohash_hasher::{BuildNoHashHasher, IntMap};
+use std::cmp::{max, min};
+use std::collections::VecDeque;
+use text_generation_client::v3::{
+    Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
+use text_generation_client::ChunksToString;
+use text_generation_client::Input;
+use tokio::sync::{mpsc, oneshot};
+use tokio::time::Instant;
+use tracing::{info_span, instrument, Instrument, Span};
+
+/// Queue entry: one validated request together with its bookkeeping
+#[derive(Debug)]
+pub(crate) struct Entry {
+    /// Validated request
+    pub request: ValidGenerateRequest,
+    /// Response sender to communicate between the Infer struct and the batching_task; entries whose channel is closed are dropped from the queue
+    pub response_tx: mpsc::UnboundedSender<Result<InferStreamResponse, InferError>>,
+    /// Span that will live as long as the entry; parent of the "queued" and "infer" spans
+    pub span: Span,
+    /// Temporary span used as a guard when logging inference, wait times... ("queued" while waiting, "infer" once batched)
+    pub temp_span: Option<Span>,
+    /// Instant when this entry was queued
+    pub queue_time: Instant,
+    /// Instant when this entry was added to a batch (set by `State::next_batch`)
+    pub batch_time: Option<Instant>,
+    /// Blocks and slots reserved for this entry by `State::next_batch`; `None` when no block allocator is in use
+    pub block_allocation: Option<BlockAllocation>,
+}
+
+/// Handle to the request queue
+///
+/// The queue state lives in a background task; this type only forwards commands to it.
+#[derive(Debug, Clone)]
+pub(crate) struct Queue {
+    /// Sending half of the command channel read by the background queue task
+    queue_sender: mpsc::UnboundedSender<QueueCommand>,
+}
+
+impl Queue {
+    /// Spawn the background queue task and return a handle to it
+    pub(crate) fn new(
+        requires_padding: bool,
+        block_size: u32,
+        window_size: Option<u32>,
+        speculate: u32,
+        max_batch_total_tokens: u32,
+    ) -> Self {
+        let (queue_sender, commands) = mpsc::unbounded_channel();
+
+        // The task owns the queue state and reads commands from the receiving half
+        let background = queue_task(
+            requires_padding,
+            block_size,
+            window_size,
+            speculate,
+            max_batch_total_tokens,
+            commands,
+        );
+        tokio::spawn(background);
+
+        Self { queue_sender }
+    }
+
+    /// Push an entry at the back of the queue
+    #[instrument(skip_all)]
+    pub(crate) fn append(&self, entry: Entry) {
+        // The current span travels with the entry so the task can record the append under it
+        let command = QueueCommand::Append(Box::new(entry), Span::current());
+        // Panics only if the background task is gone
+        self.queue_sender.send(command).unwrap();
+    }
+
+    /// Ask the background task for the next batch
+    ///
+    /// Resolves to `None` when the task could not build a batch.
+    #[instrument(skip(self))]
+    pub(crate) async fn next_batch(
+        &self,
+        min_size: Option<usize>,
+        max_size: Option<usize>,
+        prefill_token_budget: u32,
+        token_budget: u32,
+    ) -> Option<NextBatch> {
+        // One-shot channel on which the task sends its answer
+        let (response_sender, reply) = oneshot::channel();
+        let command = QueueCommand::NextBatch {
+            min_size,
+            max_size,
+            prefill_token_budget,
+            token_budget,
+            response_sender,
+            span: Span::current(),
+        };
+        // Panics only if the background task is gone
+        self.queue_sender.send(command).unwrap();
+        // Wait for the task to answer
+        reply.await.unwrap()
+    }
+}
+
+/// Background task that owns the queue state and executes the commands sent through `Queue`
+async fn queue_task(
+    requires_padding: bool,
+    block_size: u32,
+    window_size: Option<u32>,
+    speculate: u32,
+    max_batch_total_tokens: u32,
+    mut receiver: mpsc::UnboundedReceiver<QueueCommand>,
+) {
+    let mut state = State::new(
+        requires_padding,
+        block_size,
+        window_size,
+        speculate,
+        max_batch_total_tokens,
+    );
+
+    // Runs until the command channel is closed
+    loop {
+        match receiver.recv().await {
+            None => break,
+            Some(QueueCommand::Append(boxed_entry, span)) => {
+                let entry = *boxed_entry;
+                // Record the append under the span captured by the caller
+                span.in_scope(|| state.append(entry));
+                metrics::increment_gauge!("tgi_queue_size", 1.0);
+            }
+            Some(QueueCommand::NextBatch {
+                min_size,
+                max_size,
+                prefill_token_budget,
+                token_budget,
+                response_sender,
+                span,
+            }) => {
+                let batch_future = state
+                    .next_batch(min_size, max_size, prefill_token_budget, token_budget)
+                    .instrument(span);
+                response_sender.send(batch_future.await).unwrap();
+                // Re-sync the gauge with what is actually left in the queue
+                metrics::gauge!("tgi_queue_size", state.entries.len() as f64);
+            }
+        }
+    }
+}
+
+/// Queue state, owned by the background queue task
+#[derive(Debug)]
+struct State {
+    /// Queue entries stored as (id, entry) pairs in a VecDeque; appended at the back, popped from the front
+    entries: VecDeque<(u64, Entry)>,
+
+    /// Id of the next entry
+    next_id: u64,
+
+    /// Id of the next batch
+    next_batch_id: u64,
+
+    /// Paged Attention block size
+    block_size: u32,
+
+    /// Sliding window
+    window_size: Option<u32>,
+
+    /// Speculation amount
+    speculate: u32,
+
+    /// Paged Attention block allocator; `None` when `requires_padding` was set at construction
+    block_allocator: Option<BlockAllocator>,
+}
+
+impl State {
+    /// Build an empty queue state
+    ///
+    /// A block allocator is only created when `requires_padding` is false.
+    fn new(
+        requires_padding: bool,
+        block_size: u32,
+        window_size: Option<u32>,
+        speculate: u32,
+        max_batch_total_tokens: u32,
+    ) -> Self {
+        let block_allocator = if requires_padding {
+            None
+        } else {
+            Some(BlockAllocator::new(
+                max_batch_total_tokens,
+                block_size,
+                window_size,
+            ))
+        };
+
+        Self {
+            block_allocator,
+            block_size,
+            window_size,
+            speculate,
+            next_id: 0,
+            next_batch_id: 0,
+            entries: VecDeque::with_capacity(128),
+        }
+    }
+
+    /// Add an entry at the back of the queue and give it the next free id
+    fn append(&mut self, mut entry: Entry) {
+        // The "queued" span stays on the entry for as long as it waits to be batched
+        entry.temp_span = Some(info_span!(parent: &entry.span, "queued"));
+
+        let id = self.next_id;
+        self.next_id += 1;
+        self.entries.push_back((id, entry));
+    }
+
+    /// Build the next batch from the entries at the front of the queue
+    ///
+    /// Entries are popped in order until `max_size` is reached, a token budget would be
+    /// exceeded or, when a block allocator is in use, not enough blocks are free. The entry
+    /// that did not fit is put back at the front. Entries whose response channel is closed
+    /// are discarded.
+    ///
+    /// Returns `None` when the queue is empty, when fewer than `min_size` entries are queued
+    /// or could be batched, or when no entry could be accepted. When the batch is smaller than
+    /// `min_size`, its entries go back to the queue in their original order and the blocks
+    /// reserved for them are released.
+    async fn next_batch(
+        &mut self,
+        min_size: Option<usize>,
+        max_size: Option<usize>,
+        prefill_token_budget: u32,
+        token_budget: u32,
+    ) -> Option<NextBatch> {
+        if self.entries.is_empty() {
+            tracing::debug!("No queue");
+            return None;
+        }
+
+        // Check if we have enough entries
+        if let Some(min_size) = min_size {
+            if self.entries.len() < min_size {
+                tracing::debug!("Not enough entries");
+                return None;
+            }
+        }
+
+        // Pad prefill_token_budget to be a multiple of block size
+        let prefill_token_budget =
+            ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size;
+
+        // Create span for this batch to add context to inference calls
+        let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty);
+        next_batch_span.follows_from(&Span::current());
+
+        let mut batch_requests = Vec::with_capacity(self.entries.len());
+        let mut batch_entries =
+            IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default());
+
+        let mut max_input_length = 0;
+        let mut prefill_tokens: u32 = 0;
+        let mut decode_tokens: u32 = 0;
+        let mut max_blocks = 0;
+
+        // Pop entries starting from the front of the queue
+        'entry_loop: while let Some((id, mut entry)) = self.entries.pop_front() {
+            // Filter entries where the response receiver was dropped (== entries where the request
+            // was dropped by the client)
+            if entry.response_tx.is_closed() {
+                metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+                tracing::debug!("Dropping entry");
+                continue;
+            }
+
+            let block_allocation = match &self.block_allocator {
+                None => {
+                    // We pad to max input length in the Python shards
+                    // We need to take these padding tokens into the equation
+                    max_input_length = max_input_length.max(entry.request.input_length);
+                    prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length;
+
+                    decode_tokens += entry.request.stopping_parameters.max_new_tokens;
+                    let total_tokens = prefill_tokens + decode_tokens + self.speculate;
+
+                    if prefill_tokens > prefill_token_budget || total_tokens > token_budget {
+                        // Entry is over budget
+                        // Add it back to the front
+                        tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
+                        self.entries.push_front((id, entry));
+                        break 'entry_loop;
+                    }
+                    None
+                }
+                Some(block_allocator) => {
+                    prefill_tokens += entry.request.input_length;
+                    // With a sliding window, the budget only counts the new tokens that still
+                    // fit in the window after the input
+                    let max_new_tokens = match self.window_size {
+                        None => entry.request.stopping_parameters.max_new_tokens,
+                        Some(window_size) => min(
+                            window_size.saturating_sub(entry.request.input_length),
+                            entry.request.stopping_parameters.max_new_tokens,
+                        ),
+                    };
+                    decode_tokens += max_new_tokens;
+
+                    if prefill_tokens > prefill_token_budget
+                        || (prefill_tokens + decode_tokens + self.speculate) > token_budget
+                    {
+                        // Entry is over budget
+                        // Add it back to the front
+                        tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
+                        self.entries.push_front((id, entry));
+                        break;
+                    }
+
+                    // Number of tokens to reserve slots for; the allocator applies the window
+                    let tokens = entry.request.input_length
+                        + entry.request.stopping_parameters.max_new_tokens
+                        + self.speculate
+                        - 1;
+
+                    match block_allocator.allocate(tokens).await {
+                        None => {
+                            // Entry is over budget
+                            // Add it back to the front
+                            tracing::debug!("Over budget: not enough free blocks");
+                            self.entries.push_front((id, entry));
+                            break 'entry_loop;
+                        }
+                        Some(block_allocation) => {
+                            tracing::debug!("Allocation: {block_allocation:?}");
+                            max_blocks = max(max_blocks, block_allocation.blocks.len() as u32);
+                            Some(block_allocation)
+                        }
+                    }
+                }
+            };
+
+            tracing::debug!("Accepting entry");
+            // Create a new span to link the batch back to this entry
+            let entry_batch_span = info_span!(parent: &entry.span, "infer");
+            // Add relationships
+            next_batch_span.follows_from(&entry_batch_span);
+            entry_batch_span.follows_from(&next_batch_span);
+            // Update entry
+            entry.temp_span = Some(entry_batch_span);
+
+            // The request carries copies of the ids; the entry keeps the allocation itself
+            let (blocks, slots) = match &block_allocation {
+                None => (Vec::new(), Vec::new()),
+                Some(block_allocation) => (
+                    block_allocation.blocks.clone(),
+                    block_allocation.slots.clone(),
+                ),
+            };
+
+            entry.block_allocation = block_allocation;
+
+            batch_requests.push(Request {
+                id,
+                prefill_logprobs: entry.request.decoder_input_details,
+                input_chunks: Some(Input {
+                    chunks: entry.request.inputs.clone(),
+                }),
+                inputs: entry.request.inputs.chunks_to_string(),
+                truncate: entry.request.truncate,
+                parameters: Some(NextTokenChooserParameters::from(
+                    entry.request.parameters.clone(),
+                )),
+                stopping_parameters: Some(StoppingCriteriaParameters::from(
+                    entry.request.stopping_parameters.clone(),
+                )),
+                top_n_tokens: entry.request.top_n_tokens,
+                blocks,
+                slots,
+                adapter_id: entry.request.adapter_id.clone(),
+            });
+            // Set batch_time
+            entry.batch_time = Some(Instant::now());
+            // Insert in batch_entries IntMap
+            batch_entries.insert(id, entry);
+
+            // Check if max_size
+            if Some(batch_requests.len()) == max_size {
+                break;
+            }
+        }
+
+        // Empty batch
+        if batch_requests.is_empty() {
+            tracing::debug!("Filterered out all entries");
+            return None;
+        }
+
+        // Check if our batch is big enough
+        if let Some(min_size) = min_size {
+            // Batch is too small
+            if batch_requests.len() < min_size {
+                // Add back entries to the queue in the correct order
+                for r in batch_requests.into_iter().rev() {
+                    let id = r.id;
+                    let mut entry = batch_entries.remove(&id).unwrap();
+                    // Release the blocks reserved above: the entry is not being batched, and
+                    // the next attempt allocates again. Keeping the allocation would hold
+                    // blocks while the entry waits and reserve them twice on retry.
+                    entry.block_allocation = None;
+                    self.entries.push_front((id, entry));
+                }
+
+                return None;
+            }
+        }
+
+        // Final batch size
+        let size = batch_requests.len() as u32;
+        next_batch_span.record("batch_size", size);
+
+        let batch = Batch {
+            id: self.next_batch_id,
+            requests: batch_requests,
+            size,
+            max_tokens: (prefill_tokens + decode_tokens),
+            max_blocks,
+        };
+        // Increment batch id
+        self.next_batch_id += 1;
+
+        metrics::histogram!("tgi_batch_next_size", batch.size as f64);
+
+        Some((batch_entries, batch, next_batch_span))
+    }
+}
+
+type NextBatch = (IntMap<u64, Entry>, Batch, Span); // (batched entries keyed by request id, batch, batch span)
+
+#[derive(Debug)]
+enum QueueCommand { // commands handled by `queue_task`
+    Append(Box<Entry>, Span), // entry to enqueue, and the span the append is recorded in
+    NextBatch { // build the next batch and send the result on `response_sender`
+        min_size: Option<usize>,
+        max_size: Option<usize>,
+        prefill_token_budget: u32,
+        token_budget: u32,
+        response_sender: oneshot::Sender<Option<NextBatch>>, // receives `None` when no batch could be built
+        span: Span, // span the batch construction is instrumented with
+    },
+}
+
+impl From<ValidParameters> for NextTokenChooserParameters {
+    /// Convert validated sampling parameters into their client counterpart
+    ///
+    /// The optional grammar is flattened into a grammar string and a grammar type; a missing
+    /// grammar becomes an empty string with `GrammarType::None`.
+    fn from(value: ValidParameters) -> Self {
+        let (grammar_type, grammar) = match value.grammar {
+            Some(ValidGrammar::Json(schema)) => (GrammarType::Json, schema),
+            Some(ValidGrammar::Regex(pattern)) => (GrammarType::Regex, pattern),
+            None => (GrammarType::None, String::new()),
+        };
+
+        Self {
+            grammar_type: grammar_type.into(),
+            grammar,
+            watermark: value.watermark,
+            frequency_penalty: value.frequency_penalty,
+            repetition_penalty: value.repetition_penalty,
+            seed: value.seed,
+            do_sample: value.do_sample,
+            typical_p: value.typical_p,
+            top_p: value.top_p,
+            top_k: value.top_k,
+            temperature: value.temperature,
+        }
+    }
+}
+
+impl From<ValidStoppingParameters> for StoppingCriteriaParameters {
+    /// Convert validated stopping parameters into `StoppingCriteriaParameters`,
+    /// moving each field across unchanged.
+    fn from(value: ValidStoppingParameters) -> Self {
+        let ValidStoppingParameters {
+            max_new_tokens,
+            stop_sequences,
+            ignore_eos_token,
+        } = value;
+
+        Self {
+            max_new_tokens,
+            stop_sequences,
+            ignore_eos_token,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tracing::info_span;
+
+    /// Build a minimal `Entry` (empty input, `max_new_tokens` of 1) together with the
+    /// receiving half of its response channel.
+    ///
+    /// Tests keep the returned receiver bound while the entry should stay connected.
+    fn default_entry() -> (
+        Entry,
+        mpsc::UnboundedReceiver<Result<InferStreamResponse, InferError>>,
+    ) {
+        let (response_tx, response_rx) = mpsc::unbounded_channel();
+
+        // Zeroed sampling parameters, no grammar
+        let parameters = ValidParameters {
+            temperature: 0.0,
+            top_k: 0,
+            top_p: 0.0,
+            typical_p: 0.0,
+            do_sample: false,
+            seed: 0,
+            repetition_penalty: 0.0,
+            frequency_penalty: 0.0,
+            watermark: false,
+            grammar: None,
+        };
+
+        // Stop after a single new token
+        let stopping_parameters = ValidStoppingParameters {
+            ignore_eos_token: false,
+            max_new_tokens: 1,
+            stop_sequences: vec![],
+        };
+
+        let request = ValidGenerateRequest {
+            inputs: vec![],
+            input_length: 0,
+            truncate: 0,
+            decoder_input_details: false,
+            parameters,
+            stopping_parameters,
+            top_n_tokens: 0,
+            adapter_id: None,
+        };
+
+        let entry = Entry {
+            request,
+            response_tx,
+            span: info_span!("entry"),
+            temp_span: None,
+            queue_time: Instant::now(),
+            batch_time: None,
+            block_allocation: None,
+        };
+
+        (entry, response_rx)
+    }
+
+    /// Appending to `State` assigns id 0 to the first entry and bumps `next_id`.
+    #[tokio::test]
+    async fn test_append() {
+        let mut state = State::new(false, 1, None, 0, 16);
+        let (entry, _response_rx) = default_entry();
+
+        // Fresh state: no ids handed out, nothing queued
+        assert_eq!(state.next_id, 0);
+        assert!(state.entries.is_empty());
+
+        state.append(entry);
+
+        // One entry queued under id 0
+        assert_eq!(state.next_id, 1);
+        assert_eq!(state.entries.len(), 1);
+        let (queued_id, _) = state.entries.remove(0).unwrap();
+        assert_eq!(queued_id, 0);
+    }
+
+    /// An empty `State` yields no batch, with or without a minimum size.
+    #[tokio::test]
+    async fn test_next_batch_empty() {
+        let mut state = State::new(false, 1, None, 0, 16);
+
+        let without_min_size = state.next_batch(None, None, 1, 1).await;
+        assert!(without_min_size.is_none());
+
+        let with_min_size = state.next_batch(Some(1), None, 1, 1).await;
+        assert!(with_min_size.is_none());
+    }
+
+    /// `State::next_batch` batches everything when no minimum is set, and returns
+    /// nothing (leaving the queue untouched) when the minimum cannot be met.
+    #[tokio::test]
+    async fn test_next_batch_min_size() {
+        let mut state = State::new(false, 1, None, 0, 16);
+        let (first, _rx_first) = default_entry();
+        let (second, _rx_second) = default_entry();
+        state.append(first);
+        state.append(second);
+
+        // No minimum size: both queued entries end up in batch 0
+        let (batched, batch, _) = state.next_batch(None, None, 2, 2).await.unwrap();
+        assert_eq!(batched.len(), 2);
+        for id in 0..2u64 {
+            assert!(batched.contains_key(&id));
+            assert!(batched.get(&id).unwrap().batch_time.is_some());
+        }
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 2);
+
+        assert_eq!(state.next_id, 2);
+        assert!(state.entries.is_empty());
+        assert_eq!(state.next_batch_id, 1);
+
+        let (third, _rx_third) = default_entry();
+        state.append(third);
+
+        // Minimum of 2 with a single queued entry: no batch
+        let too_small = state.next_batch(Some(2), None, 2, 2).await;
+        assert!(too_small.is_none());
+
+        // The entry is still queued under id 2
+        assert_eq!(state.next_id, 3);
+        assert_eq!(state.entries.len(), 1);
+        let (queued_id, _) = state.entries.remove(0).unwrap();
+        assert_eq!(queued_id, 2);
+    }
+
+    /// A maximum size of 1 caps the batch at one entry and leaves the other queued.
+    #[tokio::test]
+    async fn test_next_batch_max_size() {
+        let mut state = State::new(false, 1, None, 0, 16);
+        let (first, _rx_first) = default_entry();
+        let (second, _rx_second) = default_entry();
+        state.append(first);
+        state.append(second);
+
+        let (batched, batch, _) = state.next_batch(None, Some(1), 2, 2).await.unwrap();
+
+        // Only the oldest entry was taken
+        assert_eq!(batched.len(), 1);
+        assert!(batched.contains_key(&0));
+        assert!(batched.get(&0).unwrap().batch_time.is_some());
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 1);
+
+        // The second entry is still waiting
+        assert_eq!(state.next_id, 2);
+        assert_eq!(state.entries.len(), 1);
+        assert_eq!(state.next_batch_id, 1);
+    }
+
+    /// The token budget limits how many queued entries `State::next_batch` takes.
+    #[tokio::test]
+    async fn test_next_batch_token_budget() {
+        let mut state = State::new(false, 1, None, 0, 2);
+        let (first, _rx_first) = default_entry();
+        let (second, _rx_second) = default_entry();
+        state.append(first);
+        state.append(second);
+
+        // Budgets of 1: only the first entry is batched
+        let (small_entries, small_batch, _) = state.next_batch(None, None, 1, 1).await.unwrap();
+        assert_eq!(small_entries.len(), 1);
+        assert!(small_entries.contains_key(&0));
+        assert_eq!(small_batch.id, 0);
+        assert_eq!(small_batch.size, 1);
+
+        assert_eq!(state.next_id, 2);
+        assert_eq!(state.entries.len(), 1);
+        assert_eq!(state.next_batch_id, 1);
+
+        let (third, _rx_third) = default_entry();
+        state.append(third);
+
+        // Budgets of 3: the two remaining entries are batched together
+        let (large_entries, large_batch, _) = state.next_batch(None, None, 3, 3).await.unwrap();
+        assert_eq!(large_entries.len(), 2);
+        for id in 1..3u64 {
+            assert!(large_entries.contains_key(&id));
+        }
+        assert_eq!(large_batch.id, 1);
+        assert_eq!(large_batch.size, 2);
+
+        assert_eq!(state.next_id, 3);
+        assert!(state.entries.is_empty());
+        assert_eq!(state.next_batch_id, 2);
+    }
+
+    /// Smoke test: appending to a `Queue` does not panic.
+    #[tokio::test]
+    async fn test_queue_append() {
+        let (entry, _response_rx) = default_entry();
+        let queue = Queue::new(false, 1, None, 0, 16);
+
+        queue.append(entry);
+    }
+
+    /// An empty `Queue` yields no batch, with or without a minimum size.
+    #[tokio::test]
+    async fn test_queue_next_batch_empty() {
+        let queue = Queue::new(false, 1, None, 0, 16);
+
+        let without_min_size = queue.next_batch(None, None, 1, 1).await;
+        assert!(without_min_size.is_none());
+
+        let with_min_size = queue.next_batch(Some(1), None, 1, 1).await;
+        assert!(with_min_size.is_none());
+    }
+
+    /// `Queue::next_batch` honours the minimum size and the token budgets.
+    #[tokio::test]
+    async fn test_queue_next_batch_min_size() {
+        let queue = Queue::new(false, 1, None, 0, 16);
+        let (first, _rx_first) = default_entry();
+        let (second, _rx_second) = default_entry();
+        queue.append(first);
+        queue.append(second);
+
+        // No minimum size: both entries land in batch 0
+        let (batched, batch, _) = queue.next_batch(None, None, 2, 2).await.unwrap();
+        assert_eq!(batched.len(), 2);
+        for id in 0..2u64 {
+            assert!(batched.contains_key(&id));
+            assert!(batched.get(&id).unwrap().batch_time.is_some());
+        }
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 2);
+
+        let (third, _rx_third) = default_entry();
+        queue.append(third);
+
+        // Not enough requests pending
+        let too_few_requests = queue.next_batch(Some(2), None, 2, 2).await;
+        assert!(too_few_requests.is_none());
+        // Not enough token budget
+        let no_budget = queue.next_batch(Some(1), None, 0, 0).await;
+        assert!(no_budget.is_none());
+        // Ok
+        let (batched_again, second_batch, _) =
+            queue.next_batch(Some(1), None, 2, 2).await.unwrap();
+        assert_eq!(batched_again.len(), 1);
+        assert!(batched_again.contains_key(&2));
+        assert!(batched_again.get(&2).unwrap().batch_time.is_some());
+        assert_eq!(second_batch.id, 1);
+        assert_eq!(second_batch.size, 1);
+    }
+
+    /// A maximum size of 1 caps the batch returned by `Queue::next_batch` at one entry.
+    #[tokio::test]
+    async fn test_queue_next_batch_max_size() {
+        let queue = Queue::new(false, 1, None, 0, 16);
+        let (first, _rx_first) = default_entry();
+        let (second, _rx_second) = default_entry();
+        queue.append(first);
+        queue.append(second);
+
+        let (batched, batch, _) = queue.next_batch(None, Some(1), 2, 2).await.unwrap();
+
+        // Only the oldest entry was taken
+        assert_eq!(batched.len(), 1);
+        assert!(batched.contains_key(&0));
+        assert!(batched.get(&0).unwrap().batch_time.is_some());
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 1);
+    }
+
+    /// The token budget limits how many queued entries `Queue::next_batch` takes.
+    #[tokio::test]
+    async fn test_queue_next_batch_token_budget() {
+        let queue = Queue::new(false, 1, None, 0, 16);
+        let (first, _rx_first) = default_entry();
+        let (second, _rx_second) = default_entry();
+        queue.append(first);
+        queue.append(second);
+
+        // Budgets of 1: only the first entry is batched
+        let (small_entries, small_batch, _) = queue.next_batch(None, None, 1, 1).await.unwrap();
+        assert_eq!(small_entries.len(), 1);
+        assert!(small_entries.contains_key(&0));
+        assert_eq!(small_batch.id, 0);
+        assert_eq!(small_batch.size, 1);
+
+        let (third, _rx_third) = default_entry();
+        queue.append(third);
+
+        // Budgets of 3: the two remaining entries are batched together
+        let (large_entries, large_batch, _) = queue.next_batch(None, None, 3, 3).await.unwrap();
+        assert_eq!(large_entries.len(), 2);
+        for id in 1..3u64 {
+            assert!(large_entries.contains_key(&id));
+        }
+        assert_eq!(large_batch.id, 1);
+        assert_eq!(large_batch.size, 2);
+    }
+
+    /// With the fourth `Queue::new` argument (speculate) set to 2, budgets of 1 produce
+    /// no batch while budgets of 6 admit both entries.
+    #[tokio::test]
+    async fn test_queue_next_batch_token_speculate() {
+        let queue = Queue::new(false, 1, None, 2, 16);
+        let (first, _rx_first) = default_entry();
+        let (second, _rx_second) = default_entry();
+        queue.append(first);
+        queue.append(second);
+
+        // Budget of 1 is not enough
+        let starved = queue.next_batch(None, None, 1, 1).await;
+        assert!(starved.is_none());
+
+        // The failed attempt did not consume a batch id: this one is still batch 0
+        let (batched, batch, _) = queue.next_batch(None, None, 6, 6).await.unwrap();
+        assert_eq!(batched.len(), 2);
+        for id in 0..2u64 {
+            assert!(batched.contains_key(&id));
+        }
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 2);
+    }
+
+    /// An entry whose response receiver is already gone is not batched.
+    #[tokio::test]
+    async fn test_queue_next_batch_dropped_receiver() {
+        let queue = Queue::new(false, 1, None, 0, 16);
+        let (entry, response_rx) = default_entry();
+        // Disconnect the client side before the entry is even queued
+        drop(response_rx);
+        queue.append(entry);
+
+        let batch = queue.next_batch(None, None, 1, 1).await;
+        assert!(batch.is_none());
+    }
+}
diff --git a/router/src/infer/v3/scheduler.rs b/router/src/infer/v3/scheduler.rs
new file mode 100644
index 00000000..543ce89f
--- /dev/null
+++ b/router/src/infer/v3/scheduler.rs
@@ -0,0 +1,1190 @@
+/// Batching and inference logic
+use crate::infer::v3::queue::{Entry, Queue};
+use crate::infer::{
+    GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse, Scheduler,
+};
+use crate::validation::ValidGenerateRequest;
+use crate::{FinishReason, PrefillToken, Token};
+use nohash_hasher::IntMap;
+use std::sync::{
+    atomic::{AtomicBool, Ordering},
+    Arc,
+};
+use text_generation_client::v3::{Batch, CachedBatch, Generation, ShardedClient};
+use text_generation_client::ClientError;
+use tokio::sync::mpsc::error::SendError;
+use tokio::sync::{mpsc, Notify, OwnedSemaphorePermit};
+use tokio::time::Instant;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::{info_span, instrument, Instrument, Span};
+
+/// Scheduler made of a request `Queue` and a background batching task.
+///
+/// Scheduling a request appends it to `queue` and then wakes the batching task
+/// through `batching_task_notifier`.
+pub(crate) struct SchedulerV3 {
+    /// Request queue
+    queue: Queue,
+    /// Notify batcher on queue appends
+    batching_task_notifier: Arc<Notify>,
+}
+
+impl SchedulerV3 {
+    /// Create the scheduler and spawn its background batching task.
+    ///
+    /// The queue block size is 256 when the `FLASH_DECODING` environment variable is
+    /// set to `1` or `true` (case-insensitive) and 16 otherwise. The remaining
+    /// arguments are forwarded to `Queue::new` and `batching_task`.
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        client: ShardedClient,
+        waiting_served_ratio: f32,
+        max_batch_prefill_tokens: u32,
+        max_batch_total_tokens: u32,
+        max_waiting_tokens: usize,
+        max_batch_size: Option<usize>,
+        requires_padding: bool,
+        window_size: Option<u32>,
+        speculate: u32,
+        generation_health: Arc<AtomicBool>,
+    ) -> Self {
+        // An unset or unreadable variable counts as "disabled"
+        let flashdecoding = std::env::var("FLASH_DECODING")
+            .map(|raw| matches!(raw.to_lowercase().as_str(), "1" | "true"))
+            .unwrap_or(false);
+        let block_size = if flashdecoding { 256 } else { 16 };
+
+        let queue = Queue::new(
+            requires_padding,
+            block_size,
+            window_size,
+            speculate,
+            max_batch_total_tokens,
+        );
+        let notifier = Arc::new(Notify::new());
+
+        // The background task gets its own handles to the queue and the notifier;
+        // it holds all the inference logic
+        let background_task = batching_task(
+            client,
+            waiting_served_ratio,
+            max_batch_prefill_tokens,
+            max_batch_total_tokens,
+            max_waiting_tokens,
+            max_batch_size,
+            queue.clone(),
+            Arc::clone(&notifier),
+            generation_health,
+        );
+        tokio::spawn(background_task);
+
+        Self {
+            queue,
+            batching_task_notifier: notifier,
+        }
+    }
+}
+
+impl Scheduler for SchedulerV3 {
+    /// Queue `request` for batching and hand back the stream its responses arrive on.
+    ///
+    /// Returns the caller's `permit`, the request's input length and an unbounded
+    /// receiver stream fed by the background batching task.
+    #[instrument(skip_all)]
+    fn schedule(
+        &self,
+        request: ValidGenerateRequest,
+        permit: OwnedSemaphorePermit,
+    ) -> Result<GenerateStreamResponse, InferError> {
+        // Read this before `request` is moved into the entry
+        let input_length = request.input_length;
+
+        // Channel over which the batching task streams results for this request
+        let (response_tx, response_rx) = mpsc::unbounded_channel();
+
+        let entry = Entry {
+            request,
+            response_tx,
+            span: Span::current(),
+            temp_span: None,
+            queue_time: Instant::now(),
+            batch_time: None,
+            block_allocation: None,
+        };
+        self.queue.append(entry);
+
+        // Wake the batching task: there is something new to batch
+        self.batching_task_notifier.notify_one();
+
+        let stream = UnboundedReceiverStream::new(response_rx);
+        Ok((permit, input_length, stream))
+    }
+}
+
+/// Batching logic
+/// Will be launched in a background Tokio task
+///
+/// Batches requests and sends them to the inference server.
+///
+/// Each time `notifier` fires, the task drains the queue: it prefills a first batch and
+/// then keeps decoding it, opportunistically prefilling and concatenating further
+/// batches from the queue, until the shards stop returning a cached batch.
+///
+/// * `waiting_served_ratio` - scales the running batch size into the minimum size a
+///   new batch must reach before it may be concatenated
+/// * `max_batch_prefill_tokens` - prefill token budget handed to `Queue::next_batch`
+/// * `max_batch_total_tokens` - total token budget; the running batch's `max_tokens`
+///   is subtracted from it when looking for a batch to concatenate
+/// * `max_waiting_tokens` - once this many decode iterations have passed without
+///   onboarding a new batch, the minimum-size requirement is dropped
+/// * `max_batch_size` - optional cap on the number of requests per batch
+/// * `generation_health` - flag updated by `prefill` / `decode` after every call
+#[allow(clippy::too_many_arguments)]
+pub(crate) async fn batching_task(
+    mut client: ShardedClient,
+    waiting_served_ratio: f32,
+    max_batch_prefill_tokens: u32,
+    max_batch_total_tokens: u32,
+    max_waiting_tokens: usize,
+    max_batch_size: Option<usize>,
+    queue: Queue,
+    notifier: Arc<Notify>,
+    generation_health: Arc<AtomicBool>,
+) {
+    // Infinite loop
+    loop {
+        // Wait until the holder of the other `notifier` handle signals a queue append
+        notifier.notified().await;
+
+        // Get the next batch from the queue
+        // This batch might be smaller than the maximum batch size if there are not enough requests
+        // waiting in the queue
+        while let Some((mut entries, batch, span)) = queue
+            .next_batch(
+                None,
+                max_batch_size,
+                max_batch_prefill_tokens,
+                max_batch_total_tokens,
+            )
+            .await
+        {
+            let mut cached_batch = prefill(&mut client, batch, &mut entries, &generation_health)
+                .instrument(span)
+                .await;
+            let mut waiting_tokens = 1;
+
+            // We loop until we do not receive any cached batch from the inference server (== until
+            // all requests have met their stopping criteria)
+            while let Some(batch) = cached_batch {
+                // Get current batch info
+                let batch_size = batch.size;
+                let batch_max_tokens = batch.max_tokens;
+                let mut batches = vec![batch];
+                metrics::gauge!("tgi_batch_current_size", batch_size as f64);
+                metrics::gauge!("tgi_batch_current_max_tokens", batch_max_tokens as f64);
+
+                let min_size = if waiting_tokens >= max_waiting_tokens {
+                    // If we didn't onboard any new requests since >= max_waiting_tokens, we try
+                    // to add a new batch even though its size might be small
+                    None
+                } else {
+                    // Minimum batch size
+                    Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
+                };
+
+                let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
+                // Remaining request slots. Saturate rather than subtract directly: nothing in
+                // this function guarantees `batch_size <= max_size`, and a plain `-` would
+                // panic in debug builds or wrap to a huge limit in release builds.
+                let max_size =
+                    max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize));
+
+                // Try to get a new batch
+                if let Some((mut new_entries, new_batch, span)) = queue
+                    .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)
+                    .await
+                {
+                    // Tracking metrics
+                    if min_size.is_some() {
+                        metrics::increment_counter!("tgi_batch_concat", "reason" => "backpressure");
+                    } else {
+                        metrics::increment_counter!("tgi_batch_concat", "reason" => "wait_exceeded");
+                    }
+
+                    entries.iter_mut().for_each(|(_, entry)| {
+                        // Create a new span to add the info that this entry is waiting
+                        // because a new batch is being computed
+                        let entry_waiting_span = info_span!(parent: &entry.span, "waiting");
+                        // Add relationships
+                        span.follows_from(&entry_waiting_span);
+                        entry_waiting_span.follows_from(&span);
+                        // Update entry
+                        entry.temp_span = Some(entry_waiting_span);
+                    });
+
+                    // Generate one token for this new batch to have the attention past in cache
+                    let new_cached_batch =
+                        prefill(&mut client, new_batch, &mut new_entries, &generation_health)
+                            .instrument(span)
+                            .await;
+                    // Reset waiting counter
+                    waiting_tokens = 1;
+                    // Extend current batch with the new batch
+                    // (a `None` here means the new batch produced nothing left to decode)
+                    if let Some(new_cached_batch) = new_cached_batch {
+                        entries.extend(new_entries);
+                        batches.push(new_cached_batch);
+                    }
+                }
+
+                // Create span for this batch to add context to inference calls
+                let next_batch_size = entries.len();
+                let next_batch_span =
+                    info_span!(parent: None, "batch", batch_size = next_batch_size);
+                entries.iter_mut().for_each(|(_, entry)| {
+                    // Create a new span to link the batch back to this entry
+                    let entry_batch_span = info_span!(parent: &entry.span, "infer");
+                    // Add relationships
+                    next_batch_span.follows_from(&entry_batch_span);
+                    entry_batch_span.follows_from(&next_batch_span);
+                    // Update entry
+                    entry.temp_span = Some(entry_batch_span);
+                });
+
+                cached_batch = decode(&mut client, batches, &mut entries, &generation_health)
+                    .instrument(next_batch_span)
+                    .await;
+                waiting_tokens += 1;
+            }
+            // Nothing is running any more: reset the gauges
+            metrics::gauge!("tgi_batch_current_size", 0.0);
+            metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
+        }
+    }
+}
+
+/// Run the prefill step for `batch` and forward the resulting generations to `entries`.
+///
+/// On success the health flag is set, finished or dropped entries are removed and the
+/// (filtered) cached batch is returned. On failure the health flag is cleared, the
+/// batch is cleared from the shard cache (best effort), every entry receives the error
+/// and `None` is returned.
+#[instrument(skip_all)]
+async fn prefill(
+    client: &mut ShardedClient,
+    batch: Batch,
+    entries: &mut IntMap<u64, Entry>,
+    generation_health: &Arc<AtomicBool>,
+) -> Option<CachedBatch> {
+    let inference_start = Instant::now();
+    let batch_id = batch.id;
+    metrics::increment_counter!("tgi_batch_inference_count", "method" => "prefill");
+
+    let (generations, next_batch, timings) = match client.prefill(batch).await {
+        Ok(output) => output,
+        // If we have an error, we discard the whole batch
+        Err(err) => {
+            generation_health.store(false, Ordering::SeqCst);
+            // Best effort: a failed cache clear is deliberately ignored
+            let _ = client.clear_cache(Some(batch_id)).await;
+            send_errors(err, entries);
+            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill");
+            return None;
+        }
+    };
+
+    // The shards answered: mark generation as healthy
+    generation_health.store(true, Ordering::SeqCst);
+
+    let filtering_start = Instant::now();
+    // Stream the generated tokens and drop the entries that are done
+    filter_send_generations(generations, entries);
+
+    // Keep only the requests that are still alive in the cached batch
+    let filtered_batch = filter_batch(client, next_batch, entries).await;
+
+    metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "prefill");
+    metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "prefill");
+    metrics::histogram!("tgi_batch_filter_duration", filtering_start.elapsed().as_secs_f64(), "method" => "prefill");
+    metrics::histogram!("tgi_batch_inference_duration", inference_start.elapsed().as_secs_f64(), "method" => "prefill");
+    metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill");
+
+    filtered_batch
+}
+
+/// Run one decode step over `batches` and forward the resulting generations to `entries`.
+///
+/// On success the health flag is set, finished or dropped entries are removed and the
+/// (filtered) cached batch is returned. On failure the health flag is cleared, every
+/// input batch is cleared from the shard cache (best effort), every entry receives the
+/// error and `None` is returned.
+#[instrument(skip_all)]
+async fn decode(
+    client: &mut ShardedClient,
+    batches: Vec<CachedBatch>,
+    entries: &mut IntMap<u64, Entry>,
+    generation_health: &Arc<AtomicBool>,
+) -> Option<CachedBatch> {
+    let inference_start = Instant::now();
+    // Remember the ids now: `batches` is consumed by the call below
+    let batch_ids: Vec<u64> = batches.iter().map(|cached| cached.id).collect();
+    metrics::increment_counter!("tgi_batch_inference_count", "method" => "decode");
+
+    let (generations, next_batch, timings) = match client.decode(batches).await {
+        Ok(output) => output,
+        // If we have an error, we discard the whole batch
+        Err(err) => {
+            generation_health.store(false, Ordering::SeqCst);
+            for id in batch_ids {
+                // Best effort: a failed cache clear is deliberately ignored
+                let _ = client.clear_cache(Some(id)).await;
+            }
+            send_errors(err, entries);
+            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode");
+            return None;
+        }
+    };
+
+    // The shards answered: mark generation as healthy
+    generation_health.store(true, Ordering::SeqCst);
+
+    let filtering_start = Instant::now();
+    // Stream the generated tokens and drop the entries that are done
+    filter_send_generations(generations, entries);
+
+    // Keep only the requests that are still alive in the cached batch
+    let filtered_batch = filter_batch(client, next_batch, entries).await;
+
+    // The concat timing is optional in the reported timings
+    if let Some(concat_duration) = timings.concat {
+        metrics::histogram!("tgi_batch_concat_duration", concat_duration.as_secs_f64(), "method" => "decode");
+    }
+    metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "decode");
+    metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "decode");
+    metrics::histogram!("tgi_batch_filter_duration", filtering_start.elapsed().as_secs_f64(), "method" => "decode");
+    metrics::histogram!("tgi_batch_inference_duration", inference_start.elapsed().as_secs_f64(), "method" => "decode");
+    metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode");
+
+    filtered_batch
+}
+
+/// Filter a `batch` and remove all requests not present in `entries`
+///
+/// Returns `None` when there is no batch or when no request survives (in which case
+/// the batch is also cleared from the shard cache).
+///
+/// # Panics
+/// Panics if the shard call (`clear_cache` or `filter_batch`) fails, as there is no
+/// way to recover from that.
+#[instrument(skip_all)]
+async fn filter_batch(
+    client: &mut ShardedClient,
+    next_batch: Option<CachedBatch>,
+    entries: &IntMap<u64, Entry>,
+) -> Option<CachedBatch> {
+    let mut batch = next_batch?;
+
+    // Same number of requests on both sides: nothing to filter
+    if batch.size as usize == entries.len() {
+        return Some(batch);
+    }
+
+    let batch_id = batch.id;
+
+    // Drop the ids of the requests that are no longer tracked
+    batch
+        .request_ids
+        .retain(|request_id| entries.contains_key(request_id));
+
+    if !batch.request_ids.is_empty() {
+        // Some requests survive: filter the Python shard cache down to them
+        // We unwrap here as we need to panic since we cannot recover if this method fails
+        return client
+            .filter_batch(batch_id, batch.request_ids)
+            .await
+            .unwrap();
+    }
+
+    // Every request was filtered out: the batch is empty, clear it from the Python
+    // shards cache
+    // We unwrap here as we need to panic since we cannot recover if this method fails
+    client.clear_cache(Some(batch_id)).await.unwrap();
+    None
+}
+
+/// Send one or multiple `InferStreamResponse` to Infer for all `entries`
+/// and filter entries
+///
+/// An entry is removed from `entries` once its generation has stopped or its response
+/// channel can no longer be written to.
+///
+/// # Panics
+/// Panics if a generation refers to an id missing from `entries`, or if the entry has
+/// no `temp_span`.
+#[instrument(skip_all)]
+fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entry>) {
+    for generation in generations {
+        let id = generation.request_id;
+        // The request id should always be in the entries, hence the `expect`
+        let entry = entries
+            .get(&id)
+            .expect("ID not found in entries. This is a bug.");
+
+        // Create and enter a span to link this function back to the entry
+        let _span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_generation", generation = ?generation).entered();
+
+        // Forward the generation to the infer task. A send error means the client
+        // dropped the request, so the entry is treated as stopped.
+        let stopped = match send_responses(generation, entry) {
+            Ok(stopped) => stopped,
+            Err(_) => {
+                tracing::error!("Entry response channel error.");
+                metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+                true
+            }
+        };
+
+        if stopped {
+            entries.remove(&id).expect("ID not found in entries. This is a bug.");
+        }
+    }
+}
+
+/// Send responses through the `entry` response channel
+///
+/// Emits, in order: the prefill tokens (when present), then one message per generated
+/// token. The last token is sent as `InferStreamResponse::End` when the generation
+/// carries a `generated_text`; every other token is sent as `Intermediate`.
+///
+/// Returns `Ok(true)` when the entry is finished (generation ended, or the receiver is
+/// already closed), `Ok(false)` when more tokens are expected, and `Err` when a send
+/// fails.
+///
+/// # Panics
+/// Panics if `generation.tokens` is `None`, or if the generation ends while
+/// `entry.batch_time` is `None`.
+fn send_responses(
+    generation: Generation,
+    entry: &Entry,
+) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
+    // Return directly if the channel is disconnected
+    if entry.response_tx.is_closed() {
+        metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+        return Ok(true);
+    }
+
+    let mut stopped = false;
+
+    if let Some(prefill_tokens) = generation.prefill_tokens {
+        // Create Token objects
+        // We do that here instead of in the Python code as Rust for loops are faster
+        let prefill_tokens = prefill_tokens
+            .ids
+            .into_iter()
+            .zip(prefill_tokens.logprobs)
+            .zip(prefill_tokens.texts)
+            .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
+            .collect();
+
+        // Send message
+        entry
+            .response_tx
+            .send(Ok(InferStreamResponse::Prefill(prefill_tokens)))?;
+    }
+
+    // Generated tokens for this step (there can be more than one)
+    let tokens_ = generation.tokens.expect("Non empty tokens in generation");
+    let n = tokens_.ids.len();
+    // Tokens beyond the first count as "skipped". `saturating_sub` keeps an empty token
+    // list from underflowing (debug panic / huge bogus sample in release).
+    metrics::histogram!("tgi_request_skipped_tokens", n.saturating_sub(1) as f64);
+    let mut iterator = tokens_
+        .ids
+        .into_iter()
+        .zip(tokens_.logprobs)
+        .zip(tokens_.texts)
+        .zip(tokens_.is_special)
+        .enumerate()
+        .peekable();
+    while let Some((i, (((id, logprob), text), special))) = iterator.next() {
+        let token = Token {
+            id,
+            text,
+            logprob,
+            special,
+        };
+        // Top tokens are looked up by token position; a missing position yields an
+        // empty list
+        let top_tokens = if let Some(top_tokens_) = generation.top_tokens.get(i) {
+            top_tokens_
+                .ids
+                .iter()
+                .zip(top_tokens_.logprobs.iter())
+                .zip(top_tokens_.texts.iter())
+                .zip(top_tokens_.is_special.iter())
+                .map(|(((&id, &logprob), text), &special)| Token {
+                    id,
+                    text: text.to_string(),
+                    logprob,
+                    special,
+                })
+                .collect()
+        } else {
+            vec![]
+        };
+        // Only the last token (nothing left to peek) of a finished generation ends the
+        // stream
+        match (&generation.generated_text, iterator.peek()) {
+            (Some(generated_text), None) => {
+                // Generation has ended
+                stopped = true;
+                // Send message
+                entry.response_tx.send(Ok(InferStreamResponse::End {
+                    token,
+                    top_tokens,
+                    generated_text: GeneratedText::from(generated_text.clone()),
+                    queued: entry.queue_time,
+                    start: entry.batch_time.unwrap(),
+                }))?;
+            }
+            _ => {
+                // Send message
+                entry
+                    .response_tx
+                    .send(Ok(InferStreamResponse::Intermediate { token, top_tokens }))?;
+            }
+        }
+    }
+
+    Ok(stopped)
+}
+
+/// Send errors to Infer for all `entries`
+///
+/// Drains `entries`: each one receives an `InferError::GenerationError` built from
+/// `error`, and the map is empty afterwards.
+///
+/// # Panics
+/// Panics if an entry has no `temp_span`.
+#[instrument(skip_all)]
+fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
+    for (_, entry) in entries.drain() {
+        // Create and enter a span to link this function back to the entry
+        let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
+        let err = InferError::GenerationError(error.to_string());
+        metrics::increment_counter!("tgi_request_failure", "err" => "generation");
+        tracing::error!("{err}");
+
+        // A failed send only means the receiver is gone, which we don't care about
+        let _ = entry.response_tx.send(Err(err));
+    }
+}
+
+impl From<text_generation_client::v3::GeneratedText> for GeneratedText {
+    /// Convert the client-side v3 `GeneratedText` into the router's `GeneratedText`.
+    ///
+    /// # Panics
+    /// Panics if `finish_reason` is not a valid v3 `FinishReason` value.
+    fn from(value: text_generation_client::v3::GeneratedText) -> Self {
+        use text_generation_client::v3::FinishReason as V3FinishReason;
+
+        // Decode the raw finish reason, then map it onto the router's enum
+        let finish_reason = match V3FinishReason::try_from(value.finish_reason).unwrap() {
+            V3FinishReason::Length => FinishReason::Length,
+            V3FinishReason::EosToken => FinishReason::EndOfSequenceToken,
+            V3FinishReason::StopSequence => FinishReason::StopSequence,
+        };
+
+        Self {
+            finish_reason,
+            text: value.text,
+            generated_tokens: value.generated_tokens,
+            seed: value.seed,
+        }
+    }
+}
+
+// tests
+#[cfg(test)]
+mod tests {
+    use crate::infer::raise_exception;
+    use crate::{ChatTemplateInputs, TextMessage};
+    use minijinja::Environment;
+
+    /// Renders a four-message conversation through a whitespace-stripped chat template
+    /// and checks the exact output, including the trailing generation prompt.
+    #[test]
+    fn test_chat_template() {
+        let env = Environment::new();
+
+        let raw_template = r#"
+        {% for message in messages %}
+            {% if message['role'] == 'system' %}
+                {% if message['content']%}
+                    {{'### System:\n' + message['content']+'\n\n'}}
+                {% endif %}
+            {% elif message['role'] == 'user' %}
+                {{'### User:\n' + message['content']+'\n\n'}}
+            {% elif message['role'] == 'assistant' %}
+                {{'### Assistant:\n'  + message['content']}}
+            {% endif %}
+            {% if loop.last and add_generation_prompt %}
+                {{ '### Assistant:\n' }}
+            {% endif %}
+        {% endfor %}"#;
+
+        // Trim every line and glue them together so layout whitespace never reaches
+        // the rendered output
+        let source = raw_template.lines().map(str::trim).collect::<String>();
+
+        let template = env.template_from_str(&source).unwrap();
+
+        // Alternating user / assistant turns
+        let conversation = [
+            ("user", "Hi!"),
+            ("assistant", "Hello how can I help?"),
+            ("user", "What is Deep Learning?"),
+            ("assistant", "magic!"),
+        ];
+        let messages = conversation
+            .iter()
+            .map(|&(role, content)| TextMessage {
+                role: role.to_string(),
+                content: content.to_string(),
+            })
+            .collect();
+
+        let chat_template_inputs = ChatTemplateInputs {
+            messages,
+            bos_token: Some("[BOS]"),
+            eos_token: Some("[EOS]"),
+            add_generation_prompt: true,
+            ..Default::default()
+        };
+
+        let rendered = template.render(chat_template_inputs).unwrap();
+
+        assert_eq!(
+            rendered,
+            "### User:\nHi!\n\n### Assistant:\nHello how can I help?### User:\nWhat is Deep Learning?\n\n### Assistant:\nmagic!### Assistant:\n"
+        );
+    }
+
+    /// Checks that a template can reject a conversation through `raise_exception`.
+    ///
+    /// The template requires messages at even positions (`loop.index0 % 2 == 0`)
+    /// to have the "user" role and all others not to. The conversation below has
+    /// "user" at positions 0 and 1, so rendering is expected to fail with the
+    /// alternation message instead of producing output.
+    #[test]
+    fn test_chat_template_invalid_with_raise() {
+        // The template calls `raise_exception`, so register it before compiling.
+        let mut env = Environment::new();
+        env.add_function("raise_exception", raise_exception);
+
+        // Written across several lines for readability; each line is trimmed below.
+        let raw_template = r#"
+        {{ bos_token }}
+        {% for message in messages %}
+        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+        {% endif %}
+        {% if message['role'] == 'user' %}
+        {{ '[INST] ' + message['content'] + ' [/INST]' }}
+        {% elif message['role'] == 'assistant' %}
+        {{ message['content'] + eos_token}}
+        {% else %}
+        {{ raise_exception('Only user and assistant roles are supported!') }}
+        {% endif %}
+        {% endfor %}"#;
+
+        // Strip the whitespace around every line and concatenate with no separator.
+        let template_source: String = raw_template.lines().map(str::trim).collect();
+
+        let tmpl = env.template_from_str(&template_source);
+
+        // (role, content) pairs, in conversation order.
+        let turns = [
+            ("user", "Hi!"),
+            // Second "user" turn in a row: this is what breaks the alternation.
+            ("user", "Hi again!"),
+            ("assistant", "Hello how can I help?"),
+            ("user", "What is Deep Learning?"),
+            ("assistant", "magic!"),
+        ];
+
+        let inputs = ChatTemplateInputs {
+            messages: turns
+                .iter()
+                .map(|&(role, content)| TextMessage {
+                    role: role.to_string(),
+                    content: content.to_string(),
+                })
+                .collect(),
+            bos_token: Some("[BOS]"),
+            eos_token: Some("[EOS]"),
+            add_generation_prompt: true,
+            ..Default::default()
+        };
+
+        // A successful render means the alternation check did not fire.
+        let error = match tmpl.unwrap().render(inputs) {
+            Ok(_) => panic!("Should have failed"),
+            Err(e) => e,
+        };
+
+        // The detail is expected to be the exact string the template handed to
+        // `raise_exception`.
+        assert_eq!(
+            error.detail().unwrap(),
+            "Conversation roles must alternate user/assistant/user/assistant/..."
+        );
+    }
+
+    /// Renders an `[INST]`-style template that guards role alternation with
+    /// `raise_exception`, using a strictly alternating user/assistant conversation.
+    ///
+    /// Neither guard should fire, and the output must start with `bos_token` and
+    /// carry `eos_token` after every assistant message.
+    #[test]
+    fn test_chat_template_valid_with_raise() {
+        // The template references `raise_exception`, so register it before compiling.
+        let mut env = Environment::new();
+        env.add_function("raise_exception", raise_exception);
+
+        // Written across several lines for readability; each line is trimmed below.
+        let raw_template = r#"
+        {{ bos_token }}
+        {% for message in messages %}
+        {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+        {% endif %}
+        {% if message['role'] == 'user' %}
+        {{ '[INST] ' + message['content'] + ' [/INST]' }}
+        {% elif message['role'] == 'assistant' %}
+        {{ message['content'] + eos_token}}
+        {% else %}
+        {{ raise_exception('Only user and assistant roles are supported!') }}
+        {% endif %}
+        {% endfor %}"#;
+
+        // Strip the whitespace around every line and concatenate with no separator.
+        let template_source: String = raw_template.lines().map(str::trim).collect();
+
+        let tmpl = env.template_from_str(&template_source);
+
+        // Local shorthand for building an owned message from string slices.
+        let message = |role: &str, content: &str| TextMessage {
+            role: role.to_string(),
+            content: content.to_string(),
+        };
+
+        let inputs = ChatTemplateInputs {
+            messages: vec![
+                message("user", "Hi!"),
+                message("assistant", "Hello how can I help?"),
+                message("user", "What is Deep Learning?"),
+                message("assistant", "magic!"),
+            ],
+            bos_token: Some("[BOS]"),
+            eos_token: Some("[EOS]"),
+            // Set, but this template has no branch that reads it.
+            add_generation_prompt: true,
+            ..Default::default()
+        };
+
+        let rendered = tmpl.unwrap().render(inputs).unwrap();
+
+        let expected = "[BOS][INST] Hi! [/INST]Hello how can I help?[EOS][INST] What is Deep Learning? [/INST]magic![EOS]";
+        assert_eq!(rendered, expected);
+    }
+
+    /// Renders an `<|im_start|>` / `<|im_end|>` template with
+    /// `add_generation_prompt` enabled and checks that the output ends with an
+    /// opened assistant turn (`<|im_start|>assistant\n`).
+    #[test]
+    fn test_chat_template_valid_with_add_generation_prompt() {
+        // `raise_exception` is registered even though this template never calls it.
+        let mut env = Environment::new();
+        env.add_function("raise_exception", raise_exception);
+
+        // Written across several lines for readability; each line is trimmed below.
+        let raw_template = r#"
+        {% for message in messages %}
+        {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
+        {% endfor %}
+        {% if add_generation_prompt %}
+            {{ '<|im_start|>assistant\n' }}
+        {% endif %}"#;
+
+        // Strip the whitespace around every line and concatenate with no separator.
+        let template_source: String = raw_template.lines().map(str::trim).collect();
+        let tmpl = env.template_from_str(&template_source);
+
+        // (role, content) pairs, in conversation order.
+        let turns = [
+            ("user", "Hi!"),
+            ("assistant", "Hello how can I help?"),
+            ("user", "What is Deep Learning?"),
+            ("assistant", "magic!"),
+        ];
+
+        let inputs = ChatTemplateInputs {
+            messages: turns
+                .iter()
+                .map(|&(role, content)| TextMessage {
+                    role: role.to_string(),
+                    content: content.to_string(),
+                })
+                .collect(),
+            bos_token: Some("[BOS]"),
+            eos_token: Some("[EOS]"),
+            // Drives the trailing `{% if add_generation_prompt %}` block.
+            add_generation_prompt: true,
+            ..Default::default()
+        };
+
+        let rendered = tmpl.unwrap().render(inputs).unwrap();
+
+        let expected = "<|im_start|>user\nHi!<|im_end|>\n<|im_start|>assistant\nHello how can I help?<|im_end|>\n<|im_start|>user\nWhat is Deep Learning?<|im_end|>\n<|im_start|>assistant\nmagic!<|im_end|>\n<|im_start|>assistant\n";
+        assert_eq!(rendered, expected);
+    }
+
+    struct ChatTemplateTestItem {
+        name: &'static str, // label for the case; the default-templates loop leaves it unused
+        chat_template: &'static str, // template source handed to `template_from_str`
+        input: ChatTemplateInputs<'static>, // values the template is rendered with
+        target: &'static str, // exact output the render is compared against
+    }
+
+    #[test]
+    fn test_many_chat_templates() {
+        let example_chat = vec![
+            TextMessage {
+                role: "user".to_string(),
+                content: "Hello, how are you?".to_string(),
+            },
+            TextMessage {
+                role: "assistant".to_string(),
+                content: "I'm doing great. How can I help you today?".to_string(),
+            },
+            TextMessage {
+                role: "user".to_string(),
+                content: "I'd like to show off how chat templating works!".to_string(),
+            },
+        ];
+
+        let example_chat_with_system = [TextMessage {
+            role: "system".to_string(),
+            content: "You are a friendly chatbot who always responds in the style of a pirate"
+                .to_string(),
+        }]
+        .iter()
+        .chain(&example_chat)
+        .cloned()
+        .collect::<Vec<_>>();
+
+        let test_default_templates = vec![
+            ChatTemplateTestItem {
+                name: "_base",
+                chat_template: "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some(""),
+                    ..Default::default()
+                },
+                target: "<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
+            },
+            ChatTemplateTestItem {
+                name: "blenderbot",
+                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: " Hello, how are you?  I'm doing great. How can I help you today?   I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "blenderbot_small",
+                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: " Hello, how are you?  I'm doing great. How can I help you today?   I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "bloom",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?</s>I'm doing great. How can I help you today?</s>I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "gpt_neox",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("<|endoftext|>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
+            },
+            ChatTemplateTestItem {
+                name: "gpt2",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("<|endoftext|>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
+            },
+            ChatTemplateTestItem {
+                name: "llama",
+                // NOTE: the `.strip()` has been replaced with `| trim` in the following template
+                chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token +'[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content | trim + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat_with_system.clone(),
+                    add_generation_prompt: true,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>[INST] <<SYS>>\nYou are a friendly chatbot who always responds in the style of a pirate\n<</SYS>>\n\nHello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]",
+            },
+            ChatTemplateTestItem {
+                name: "whisper",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: true,
+                    bos_token: Some(""),
+                    eos_token: Some("<|endoftext|>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?<|endoftext|>I'm doing great. How can I help you today?<|endoftext|>I'd like to show off how chat templating works!<|endoftext|>",
+            },
+        ];
+
+        #[allow(unused_variables)] // name is unused
+        for ChatTemplateTestItem {
+            name,
+            chat_template,
+            input,
+            target,
+        } in test_default_templates
+        {
+            let mut env = Environment::new();
+            env.add_function("raise_exception", raise_exception);
+            let tmpl = env.template_from_str(chat_template);
+            let result = tmpl.unwrap().render(input).unwrap();
+            assert_eq!(result, target);
+        }
+
+        let test_custom_templates = vec![
+            ChatTemplateTestItem {
+                name: "HuggingFaceH4/zephyr-7b-beta (add_generation_prompt=false)",
+                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat_with_system.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s><|user|>\nHello, how are you?</s><|assistant|>\nI'm doing great. How can I help you today?</s><|user|>\nI'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "HuggingFaceH4/zephyr-7b-beta (add_generation_prompt=true)",
+                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: vec![
+                        TextMessage {
+                            role: "system".to_string(),
+                            content: "You are a friendly chatbot who always responds in the style of a pirate".to_string(),
+                        },
+                        TextMessage {
+                            role: "user".to_string(),
+                            content: "How many helicopters can a human eat in one sitting?".to_string(),
+                        },
+                    ],
+                    add_generation_prompt: true,
+                    bos_token: Some(""),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate</s><|user|>\nHow many helicopters can a human eat in one sitting?</s><|assistant|>",
+            },
+            ChatTemplateTestItem {
+                name: "HuggingFaceH4/zephyr-7b-gemma-v0.1",
+                chat_template: "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<bos>"),
+                    eos_token: Some("<eos>"),
+                    ..Default::default()
+                },
+                target: "<bos><|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
+            },
+            ChatTemplateTestItem {
+                name: "mistralai/Mistral-7B-Instruct-v0.1",
+                chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]",
+            },
+            ChatTemplateTestItem {
+                name: "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s>[INST] I'd like to show off how chat templating works! [/INST]",
+            },
+            ChatTemplateTestItem {
+                name: "cognitivecomputations/dolphin-2.5-mixtral-8x7b",
+                chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
+            },
+            ChatTemplateTestItem {
+                name: "openchat/openchat-3.5-0106",
+                // NOTE: `.title()` has been replaced with `| title` in the following template
+                chat_template: "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + (message['role'] | title) + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>",
+            },
+            ChatTemplateTestItem {
+                name: "upstage/SOLAR-10.7B-Instruct-v1.0",
+                chat_template: "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you?</s>I'm doing great. How can I help you today?</s>I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "codellama/CodeLlama-70b-Instruct-hf",
+                // NOTE: `.strip()` has been replaced with `| trim` in the following template
+                chat_template: "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\\n\\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\\nDestination: user\\n\\n '}}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s>Source: user\n\n Hello, how are you? <step> Source: assistant\n\n I'm doing great. How can I help you today? <step> Source: user\n\n I'd like to show off how chat templating works! <step> Source: assistant\nDestination: user\n\n ",
+            },
+            ChatTemplateTestItem {
+                name: "Deci/DeciLM-7B-instruct",
+                chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\\n'  + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "### User:\nHello, how are you?### Assistant:\nI'm doing great. How can I help you today?### User:\nI'd like to show off how chat templating works!",
+            },
+            ChatTemplateTestItem {
+                name: "Qwen/Qwen1.5-72B-Chat",
+                chat_template: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!",
+            },
+            ChatTemplateTestItem {
+                name: "deepseek-ai/deepseek-llm-7b-chat",
+                chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\\n\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<｜begin▁of▁sentence｜>"),
+                    eos_token: Some("<｜end▁of▁sentence｜>"),
+                    ..Default::default()
+                },
+                target: "<｜begin▁of▁sentence｜>User: Hello, how are you?\n\nAssistant: I'm doing great. How can I help you today?<｜end▁of▁sentence｜>User: I'd like to show off how chat templating works!\n\n",
+            },
+            ChatTemplateTestItem {
+                name: "h2oai/h2o-danube-1.8b-chat",
+                chat_template: "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>'  + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<|prompt|>Hello, how are you?</s><|answer|>I'm doing great. How can I help you today?</s><|prompt|>I'd like to show off how chat templating works!</s>",
+            },
+            ChatTemplateTestItem {
+                name: "internlm/internlm2-chat-7b",
+                chat_template: "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "<s><|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n",
+            },
+            ChatTemplateTestItem {
+                name: "TheBloke/deepseek-coder-33B-instruct-AWQ",
+                chat_template: "{%- set found_item = false -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set found_item = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<｜begin▁of▁sentence｜>"),
+                    eos_token: Some("<|EOT|>"),
+                    ..Default::default()
+                },
+                target: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n### Instruction:\nHello, how are you?\n### Response:\nI'm doing great. How can I help you today?\n<|EOT|>\n### Instruction:\nI'd like to show off how chat templating works!\n### Response:\n",
+            },
+            ChatTemplateTestItem {
+                name: "ericzzz/falcon-rw-1b-chat",
+                // `.strip()` has been replaced with `| trim` in the following template
+                chat_template: "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'] | trim }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'] | trim }}{% elif message['role'] == 'assistant' %}{{ '[RESP] '  + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<|endoftext|>"),
+                    eos_token: Some("<|endoftext|>"),
+                    ..Default::default()
+                },
+                target: "[INST] Hello, how are you? [RESP] I'm doing great. How can I help you today?<|endoftext|>[INST] I'd like to show off how chat templating works!",
+            },
+            ChatTemplateTestItem {
+                name: "abacusai/Smaug-34B-v0.1",
+                chat_template: "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "Hello, how are you? [/INST] I'm doing great. How can I help you today? </s><s>[INST] I'd like to show off how chat templating works! [/INST]",
+            },
+            ChatTemplateTestItem {
+                name: "maywell/Synatra-Mixtral-8x7B",
+                chat_template: "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "Below is an instruction that describes a task. Write a response that appropriately completes the request.### Instruction:Hello, how are you?### Response:I'm doing great. How can I help you today?### Instruction:I'd like to show off how chat templating works!",
+            },
+            ChatTemplateTestItem {
+                name: "deepseek-ai/deepseek-coder-33b-instruct",
+                chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<｜begin▁of▁sentence｜>"),
+                    eos_token: Some("</EOT>"),
+                    ..Default::default()
+                },
+                target: "<｜begin▁of▁sentence｜>You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n### Instruction:\nHello, how are you?\n### Response:\nI'm doing great. How can I help you today?\n<|EOT|>\n### Instruction:\nI'd like to show off how chat templating works!\n",
+            },
+            // NOT INCLUDED
+            // - meetkai/functionary-medium-v3.2
+            // - fireworks-ai/firefunction-v1
+            // https://github
+            ChatTemplateTestItem {
+                name: "maywell/PiVoT-MoE",
+                chat_template: "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content']|trim }}{% elif message['role'] == 'user' %}### Instruction: {{ message['content']|trim }}{% elif message['role'] == 'assistant' %}### Response: {{ message['content']|trim }}{% elif message['role'] == 'user_context' %}### Input: {{ message['content']|trim }}{% endif %}{% if not loop.last %}\n{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}### Response:{% endif %}",
+                input: ChatTemplateInputs {
+                    messages: example_chat_with_system.clone(),
+                    add_generation_prompt: false,
+                    bos_token: Some("<s>"),
+                    eos_token: Some("</s>"),
+                    ..Default::default()
+                },
+                target: "You are a friendly chatbot who always responds in the style of a pirateYou are a friendly chatbot who always responds in the style of a pirate### Instruction: Hello, how are you?### Response: I'm doing great. How can I help you today?### Instruction: I'd like to show off how chat templating works!",
+            },
+        ];
+
+        #[allow(unused_variables)] // name is unused
+        for ChatTemplateTestItem {
+            name,
+            chat_template,
+            input,
+            target,
+        } in test_custom_templates
+        {
+            let mut env = Environment::new();
+            env.add_function("raise_exception", raise_exception);
+            // trim all the whitespace
+            let chat_template = chat_template
+                .lines()
+                .map(|line| line.trim())
+                .collect::<Vec<&str>>()
+                .join("");
+
+            let tmpl = env.template_from_str(&chat_template);
+            let result = tmpl.unwrap().render(input).unwrap();
+            assert_eq!(result, target);
+        }
+    }
+}
diff --git a/router/src/kserve.rs b/router/src/kserve.rs
new file mode 100644
index 00000000..c53fa481
--- /dev/null
+++ b/router/src/kserve.rs
@@ -0,0 +1,245 @@
+use crate::infer::Infer;
+use crate::{
+    default_parameters,
+    server::{generate_internal, ComputeType},
+    Deserialize, ErrorResponse, GenerateParameters, GenerateRequest, Serialize, ToSchema,
+};
+use axum::extract::{Extension, Path};
+use axum::http::{HeaderMap, StatusCode};
+use axum::response::IntoResponse;
+use axum::Json;
+use futures::stream::FuturesUnordered;
+use futures::TryStreamExt;
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub struct OutputChunk { // one named output produced by `kserve_model_infer`
+    pub name: String, // copied from the `Output::name` paired with the input
+    pub shape: Vec<usize>, // set to `[1, data.len()]` by `kserve_model_infer`
+    pub datatype: String, // `kserve_model_infer` always writes "BYTES"
+    pub data: Vec<u8>, // bytes of the generated text
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub struct InferenceOutput { // success body returned by `kserve_model_infer`
+    pub id: String, // echoes `InferenceRequest::id`
+    pub outputs: Vec<OutputChunk>, // one chunk per (input, output) pair of the request
+}
+
+#[derive(Debug, Deserialize, ToSchema)]
+pub(crate) struct InferenceRequest { // JSON body accepted by `kserve_model_infer`
+    pub id: String, // caller-chosen id, echoed back in `InferenceOutput::id`
+    #[serde(default = "default_parameters")] // an absent key falls back to `default_parameters()`
+    pub parameters: GenerateParameters, // cloned for every input of this request
+    pub inputs: Vec<Input>, // each entry's `data` must be valid UTF-8
+    pub outputs: Vec<Output>, // must have the same length as `inputs`
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub(crate) struct Input { // one prompt carried as raw bytes
+    pub name: String, // not read by `kserve_model_infer`
+    pub shape: Vec<usize>, // not read by `kserve_model_infer`
+    pub datatype: String, // not read by `kserve_model_infer`
+    pub data: Vec<u8>, // decoded with `std::str::from_utf8` and used as the prompt
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub(crate) struct Output { // a requested output slot, paired positionally with an input
+    pub name: String, // becomes `OutputChunk::name` for the paired input
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub struct LiveResponse { // body returned by `kserve_health_live`
+    pub live: bool, // `kserve_health_live` always sets this to `true`
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub struct ReadyResponse { // body returned by both readiness handlers in this file
+    pub live: bool, // NOTE(review): serialized under the key "live", not "ready" — confirm clients expect this
+}
+
+#[derive(Debug, Serialize, Deserialize, ToSchema)]
+pub struct MetadataServerResponse { // shared by the server-level and model-level metadata handlers
+    pub name: String, // fixed server name, or the model name taken from the URL path
+    pub version: String, // crate version, or the model version taken from the URL path
+    pub extensions: Vec<String>, // fixed list of labels chosen by each handler
+}
+
+#[utoipa::path(
+    post, // NOTE(review): every sibling health/metadata endpoint documents `get` — confirm against the router
+    tag = "Text Generation Inference",
+    path = "/v2/health/live",
+    responses(
+        (status = 200, description = "Service is live", body = LiveResponse),
+        (status = 404, description = "Service not found", body = ErrorResponse,
+            example = json!({"error": "No response"}))
+    )
+)]
+pub async fn kserve_health_live() -> Json<LiveResponse> {
+    // Liveness is unconditional: no state is inspected before answering `live: true`.
+    Json(LiveResponse { live: true })
+}
+
+#[utoipa::path(
+    get,
+    tag = "Text Generation Inference",
+    path = "/v2/health/ready",
+    responses(
+        (status = 200, description = "Service is ready", body = ReadyResponse),
+        (status = 404, description = "Service not found", body = ErrorResponse,
+            example = json!({"error": "No response"}))
+    )
+)]
+pub async fn kserve_health_ready() -> Json<ReadyResponse> {
+    // Readiness is unconditional: no backend state is inspected before answering.
+    Json(ReadyResponse { live: true })
+}
+
+#[utoipa::path(
+    get,
+    tag = "Text Generation Inference",
+    path = "/v2",
+    responses(
+        (status = 200, description = "Metadata retrieved", body = MetadataServerResponse),
+        (status = 404, description = "Service not found", body = ErrorResponse,
+            example = json!({"error": "No response"}))
+    )
+)]
+pub async fn kerve_server_metadata() -> Json<MetadataServerResponse> {
+    // Static self-description; the misspelled function name is public and therefore kept as is.
+    let extensions = ["health", "models", "metrics"]
+        .iter()
+        .map(|label| label.to_string())
+        .collect();
+    Json(MetadataServerResponse {
+        name: "text-generation-inference".to_string(),
+        version: env!("CARGO_PKG_VERSION").to_string(),
+        extensions,
+    })
+}
+
+#[utoipa::path(
+    get,
+    tag = "Text Generation Inference",
+    path = "/v2/models/{model_name}/versions/{model_version}",
+    responses(
+        (status = 200, description = "Model version metadata retrieved", body = MetadataServerResponse),
+        (status = 404, description = "Model or version not found", body = ErrorResponse,
+            example = json!({"error": "No response"}))
+    )
+)]
+pub async fn kserve_model_metadata( // echoes the path's name/version back without validating them
+    Path((model_name, model_version)): Path<(String, String)>,
+) -> Json<MetadataServerResponse> {
+    let extensions = vec![String::from("infer"), String::from("ready")];
+    Json(MetadataServerResponse {
+        name: model_name,
+        version: model_version,
+        extensions,
+    })
+}
+
+#[utoipa::path(
+    get,
+    tag = "Text Generation Inference",
+    path = "/v2/models/{model_name}/versions/{model_version}/ready",
+    responses(
+        (status = 200, description = "Model version is ready", body = ReadyResponse),
+        (status = 404, description = "Model or version not found", body = ErrorResponse,
+            example = json!({"error": "No response"}))
+    )
+)]
+pub async fn kserve_model_metadata_ready(
+    Path((_model_name, _model_version)): Path<(String, String)>,
+) -> Json<ReadyResponse> {
+    // The path segments are extracted but ignored: every model/version reports ready.
+    Json(ReadyResponse { live: true })
+}
+
+#[utoipa::path(
+    post,
+    tag = "Text Generation Inference",
+    path = "/v2/models/{model_name}/versions/{model_version}/infer",
+    request_body = Json<InferenceRequest>,
+    responses(
+        (status = 200, description = "Inference executed successfully", body = InferenceOutput),
+        (status = 404, description = "Model or version not found", body = ErrorResponse,
+            example = json!({"error": "No response"}))
+    )
+)]
+pub async fn kserve_model_infer( // one generation per input; 422 on invalid input, 500 if any generation fails
+    infer: Extension<Infer>,
+    Extension(compute_type): Extension<ComputeType>,
+    Json(payload): Json<InferenceRequest>,
+) -> Result<impl IntoResponse, (StatusCode, Json<ErrorResponse>)> {
+    let id = payload.id.clone(); // echoed back unchanged in the response
+    let str_inputs = payload // view each input's bytes as `&str`; the first non-UTF-8 input aborts the request
+        .inputs
+        .iter()
+        .map(|input| {
+            std::str::from_utf8(&input.data).map_err(|e| {
+                (
+                    StatusCode::UNPROCESSABLE_ENTITY,
+                    Json(ErrorResponse {
+                        error: e.to_string(),
+                        error_type: "utf8".to_string(),
+                    }),
+                )
+            })
+        })
+        .collect::<Result<Vec<_>, _>>()?;
+    // Inputs are paired positionally with the requested outputs below, so the counts must agree.
+    if str_inputs.len() != payload.outputs.len() {
+        return Err((
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(ErrorResponse {
+                error: "Inputs and outputs length mismatch".to_string(),
+                error_type: "length mismatch".to_string(),
+            }),
+        ));
+    }
+    // Build one generation future per (input, output) pair; all share the request's parameters.
+    let output_chunks = str_inputs
+        .iter()
+        .zip(&payload.outputs)
+        .map(|(str_input, output)| {
+            let generate_request = GenerateRequest {
+                inputs: str_input.to_string(),
+                parameters: payload.parameters.clone(),
+            };
+            let infer = infer.clone();
+            let compute_type = compute_type.clone();
+            let span = tracing::Span::current();
+            async move {
+                generate_internal(infer, compute_type, Json(generate_request), span)
+                    .await
+                    .map(|(_, Json(generation))| {
+                        let generation_as_bytes = generation.generated_text.as_bytes().to_vec();
+                        OutputChunk {
+                            name: output.name.clone(),
+                            shape: vec![1, generation_as_bytes.len()],
+                            datatype: "BYTES".to_string(),
+                            data: generation_as_bytes,
+                        }
+                    })
+                    .map_err(|_| { // the underlying error is discarded and replaced by a generic 500
+                        (
+                            StatusCode::INTERNAL_SERVER_ERROR,
+                            Json(ErrorResponse {
+                                error: "Incomplete generation".into(),
+                                error_type: "Incomplete generation".into(),
+                            }),
+                        )
+                    })
+            }
+        })
+        .collect::<FuturesUnordered<_>>() // yields in completion order, so `outputs` may not follow input order
+        .try_collect::<Vec<_>>() // resolves to the first error encountered, if any
+        .await?;
+    // `id` is not used again, so it is moved into the response instead of being cloned a second time.
+    let inference_output = InferenceOutput {
+        id,
+        outputs: output_chunks,
+    };
+
+    Ok((HeaderMap::new(), Json(inference_output)))
+}
diff --git a/router/src/lib.rs b/router/src/lib.rs
index 76e70bb7..165b2ad2 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -1,16 +1,36 @@
-mod health;
 /// Text Generation Inference Webserver
+pub mod config;
 mod infer;
-mod queue;
 pub mod server;
 mod validation;
 
-use infer::Infer;
-use queue::{Entry, Queue};
+#[cfg(feature = "kserve")]
+mod kserve;
+
 use serde::{Deserialize, Serialize};
+use tracing::warn;
 use utoipa::ToSchema;
 use validation::Validation;
 
+#[derive(Clone, Deserialize, ToSchema)]
+pub(crate) struct VertexInstance { // element type of `VertexRequest::instances`
+    #[schema(example = "What is Deep Learning?")]
+    pub inputs: String, // prompt text, as in the schema example above
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub parameters: Option<GenerateParameters>, // may be omitted; how `None` is defaulted is up to the consumer
+}
+
+#[derive(Deserialize, ToSchema)]
+pub(crate) struct VertexRequest {
+    // Read from the JSON key "instances", which is the field's own name.
+    pub instances: Vec<VertexInstance>,
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize)]
+pub(crate) struct VertexResponse { // response wrapper around a list of strings
+    pub predictions: Vec<String>, // presumably one generated text per request instance — confirm at the call site
+}
+
 /// Hub type
 #[derive(Clone, Debug, Deserialize)]
 pub struct HubModelInfo {
@@ -20,6 +40,105 @@ pub struct HubModelInfo {
     pub pipeline_tag: Option<String>,
 }
 
+#[derive(Debug, Clone, Deserialize, PartialEq)]
+pub struct ChatTemplate { // element type of `ChatTemplateVersions::Multiple`
+    name: String, // private field (no `pub`)
+    template: String, // private field (no `pub`)
+}
+
+#[derive(Debug, Clone, Deserialize, PartialEq)]
+#[serde(untagged)] // the JSON shape alone selects the variant
+pub enum ChatTemplateVersions {
+    Single(String), // a bare JSON string
+    Multiple(Vec<ChatTemplate>), // a JSON array of objects with `name` and `template` keys
+}
+
+use std::path::Path;
+
+#[derive(Debug, Clone, Deserialize, Default)]
+pub struct HubTokenizerConfig { // every field is an `Option`, so any of them may be absent from the JSON
+    pub chat_template: Option<ChatTemplateVersions>, // a single template string or a list of named templates
+    pub completion_template: Option<String>,
+    pub bos_token: Option<TokenizerConfigToken>, // plain string or an object with a `content` key
+    pub eos_token: Option<TokenizerConfigToken>, // plain string or an object with a `content` key
+    pub tokenizer_class: Option<String>,
+    pub add_bos_token: Option<bool>,
+    pub add_eos_token: Option<bool>,
+}
+
+impl HubTokenizerConfig {
+    pub fn from_file<P: AsRef<Path>>(filename: P) -> Option<Self> {
+        // Best-effort load: an unreadable file and malformed JSON both yield `None`.
+        let raw = std::fs::read_to_string(filename).ok()?;
+        serde_json::from_str(&raw).ok()
+    }
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
+#[serde(untagged)] // either representation below is accepted, with no discriminator key
+pub enum TokenizerConfigToken {
+    String(String), // the token given directly as a JSON string
+    Object { content: String }, // the token nested under a `content` key
+}
+
+impl TokenizerConfigToken {
+    pub fn as_str(&self) -> &str {
+        // Both variants wrap exactly one string; borrow whichever is present.
+        match self {
+            Self::String(text) | Self::Object { content: text } => text,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "processor_class")] // the JSON "processor_class" value names the variant
+pub enum HubPreprocessorConfig {
+    Idefics2Processor(Idefics2Preprocessor), // the only variant, so any other class fails to deserialize
+}
+
+impl HubPreprocessorConfig {
+    pub fn from_file<P: AsRef<std::path::Path>>(filename: P) -> Option<Self> {
+        let text = std::fs::read_to_string(filename).ok(); // read failure -> `None`
+        text.and_then(|content| serde_json::from_str(&content).ok()) // parse failure -> `None`
+    }
+}
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct Idefics2Preprocessor { // payload of `HubPreprocessorConfig::Idefics2Processor`
+    #[serde(default)] // a missing key deserializes as `false`
+    do_image_splitting: bool, // private field (no `pub`)
+}
+
+#[derive(Debug, Clone, Deserialize, Default)]
+pub struct HubProcessorConfig {
+    pub chat_template: Option<ChatTemplateVersions>, // a single template string or a list of named templates
+    pub image_seq_len: usize, // no `#[serde(default)]`: JSON lacking this key fails to parse, so `from_file` yields `None`
+    pub processor_class: Option<String>,
+}
+
+impl HubProcessorConfig {
+    pub fn from_file<P: AsRef<Path>>(filename: P) -> Option<Self> {
+        // Best-effort load: an unreadable file and malformed JSON both yield `None`.
+        let raw = std::fs::read_to_string(filename).ok()?;
+        serde_json::from_str(&raw).ok()
+    }
+}
+
+#[derive(Clone, Debug, Deserialize, ToSchema, Serialize)]
+#[serde(tag = "type", content = "value")] // adjacently tagged: `{"type": ..., "value": ...}`
+pub(crate) enum GrammarType {
+    /// A string that represents a [JSON Schema](https://json-schema.org/).
+    ///
+    /// JSON Schema is a declarative language that allows to annotate JSON documents
+    /// with types and descriptions.
+    #[serde(rename = "json")]
+    #[serde(alias = "json_object")] // "json_object" is accepted on input; "json" is what gets written
+    #[schema(example = json ! ({"properties": {"location":{"type": "string"}}}))]
+    Json(serde_json::Value), // NOTE(review): the doc above says "string", but the payload is any JSON value
+    #[serde(rename = "regex")]
+    Regex(String), // pattern carried as a plain string
+}
+
 #[derive(Clone, Debug, Serialize, ToSchema)]
 pub struct Info {
     /// Model info
@@ -41,7 +160,7 @@ pub struct Info {
     #[schema(example = "4")]
     pub max_stop_sequences: usize,
     #[schema(example = "1024")]
-    pub max_input_length: usize,
+    pub max_input_tokens: usize,
     #[schema(example = "2048")]
     pub max_total_tokens: usize,
     #[schema(example = "1.2")]
@@ -50,9 +169,15 @@ pub struct Info {
     pub max_batch_total_tokens: u32,
     #[schema(example = "20")]
     pub max_waiting_tokens: usize,
+    #[schema(nullable = true, example = "null")]
+    pub max_batch_size: Option<usize>,
     #[schema(example = "2")]
     pub validation_workers: usize,
+    #[schema(example = "32")]
+    pub max_client_batch_size: usize,
     /// Router Info
+    #[schema(example = "text-generation-router")]
+    pub router: &'static str,
     #[schema(example = "0.5.0")]
     pub version: &'static str,
     #[schema(nullable = true, example = "null")]
@@ -61,11 +186,14 @@ pub struct Info {
     pub docker_label: Option<&'static str>,
 }
 
-#[derive(Clone, Debug, Deserialize, ToSchema)]
+#[derive(Clone, Debug, Deserialize, ToSchema, Default)]
 pub(crate) struct GenerateParameters {
+    /// Generate best_of sequences and return the one with the highest token logprobs.
     #[serde(default)]
     #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 1)]
     pub best_of: Option<usize>,
+
+    /// The value used to modulate the logits distribution.
     #[serde(default)]
     #[schema(
         exclusive_minimum = 0.0,
@@ -74,6 +202,9 @@ pub(crate) struct GenerateParameters {
         example = 0.5
     )]
     pub temperature: Option<f32>,
+
+    /// The parameter for repetition penalty. 1.0 means no penalty.
+    /// See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
     #[serde(default)]
     #[schema(
         exclusive_minimum = 0.0,
@@ -82,9 +213,25 @@ pub(crate) struct GenerateParameters {
         example = 1.03
     )]
     pub repetition_penalty: Option<f32>,
+
+    /// The parameter for frequency penalty. 1.0 means no penalty
+    /// Penalize new tokens based on their existing frequency in the text so far,
+    /// decreasing the model's likelihood to repeat the same line verbatim.
+    #[serde(default)]
+    #[schema(
+        exclusive_minimum = -2.0,
+        nullable = true,
+        default = "null",
+        example = 0.1
+    )]
+    pub frequency_penalty: Option<f32>,
+
+    /// The number of highest probability vocabulary tokens to keep for top-k-filtering.
     #[serde(default)]
     #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 10)]
     pub top_k: Option<i32>,
+
+    /// Top-p value for nucleus sampling.
     #[serde(default)]
     #[schema(
         exclusive_minimum = 0.0,
@@ -94,6 +241,9 @@ pub(crate) struct GenerateParameters {
         example = 0.95
     )]
     pub top_p: Option<f32>,
+
+    /// Typical Decoding mass
+    /// See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.
     #[serde(default)]
     #[schema(
         exclusive_minimum = 0.0,
@@ -103,30 +253,48 @@ pub(crate) struct GenerateParameters {
         example = 0.95
     )]
     pub typical_p: Option<f32>,
+
+    /// Activate logits sampling.
     #[serde(default)]
     #[schema(default = "false", example = true)]
     pub do_sample: bool,
+
+    /// Maximum number of tokens to generate.
     #[serde(default = "default_max_new_tokens")]
-    #[schema(exclusive_minimum = 0, exclusive_maximum = 512, default = "20")]
-    pub max_new_tokens: u32,
+    #[schema(nullable = true, default = "100", example = "20")]
+    pub max_new_tokens: Option<u32>,
+
+    /// Whether to prepend the prompt to the generated text
     #[serde(default)]
     #[schema(nullable = true, default = "null", example = false)]
     pub return_full_text: Option<bool>,
+
+    /// Stop generating tokens if a member of `stop` is generated.
     #[serde(default)]
     #[schema(inline, max_items = 4, example = json ! (["photographer"]))]
     pub stop: Vec<String>,
+
+    /// Truncate input tokens to the given size.
     #[serde(default)]
     #[schema(nullable = true, default = "null", example = "null")]
     pub truncate: Option<usize>,
+
+    /// Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226).
     #[serde(default)]
     #[schema(default = "false", example = true)]
     pub watermark: bool,
+
+    /// Whether to return generation details.
     #[serde(default)]
     #[schema(default = "true")]
     pub details: bool,
+
+    /// Whether to return decoder input token logprobs and ids.
     #[serde(default)]
-    #[schema(default = "true")]
+    #[schema(default = "false")]
     pub decoder_input_details: bool,
+
+    /// Random sampling seed.
     #[serde(default)]
     #[schema(
         exclusive_minimum = 0,
@@ -135,13 +303,25 @@ pub(crate) struct GenerateParameters {
         example = "null"
     )]
     pub seed: Option<u64>,
+
+    /// The number of highest probability vocabulary tokens to keep for top-n-filtering.
     #[serde(default)]
     #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 5)]
     pub top_n_tokens: Option<u32>,
+
+    /// Grammar constraints for the generation.
+    #[serde(default)]
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub grammar: Option<GrammarType>,
+
+    /// Lora adapter id
+    #[serde(default)]
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub adapter_id: Option<String>,
 }
 
-fn default_max_new_tokens() -> u32 {
-    20
+fn default_max_new_tokens() -> Option<u32> {
+    Some(100)
 }
 
 fn default_parameters() -> GenerateParameters {
@@ -149,10 +329,11 @@ fn default_parameters() -> GenerateParameters {
         best_of: None,
         temperature: None,
         repetition_penalty: None,
+        frequency_penalty: None,
         top_k: None,
         top_p: None,
         typical_p: None,
-        do_sample: false,
+        do_sample: true,
         max_new_tokens: default_max_new_tokens(),
         return_full_text: None,
         stop: Vec::new(),
@@ -162,9 +343,698 @@ fn default_parameters() -> GenerateParameters {
         decoder_input_details: false,
         seed: None,
         top_n_tokens: None,
+        grammar: None,
+        adapter_id: None,
     }
 }
 
+#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
+#[serde(try_from = "PromptDeserializer")]
+pub struct Prompt(pub Vec<String>);
+
+#[derive(Deserialize)]
+#[serde(untagged)]
+enum PromptDeserializer {
+    Single(String),
+    Multiple(Vec<String>),
+}
+
+impl TryFrom<PromptDeserializer> for Prompt {
+    type Error = String;
+
+    /// Normalizes a single string or a list of strings into a `Prompt`,
+    /// rejecting an empty list with an error message.
+    fn try_from(value: PromptDeserializer) -> Result<Self, Self::Error> {
+        let prompts = match value {
+            PromptDeserializer::Single(text) => vec![text],
+            PromptDeserializer::Multiple(texts) => texts,
+        };
+
+        // A lone string always yields one entry, so only an empty list trips this.
+        if prompts.is_empty() {
+            let msg = "Empty array detected. Do not use an empty array for the prompt.";
+            return Err(msg.to_string());
+        }
+        Ok(Prompt(prompts))
+    }
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema, Debug)]
+pub struct CompletionRequest {
+    /// UNUSED
+    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
+    /// ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
+    pub model: String,
+
+    /// The prompt to generate completions for.
+    #[schema(example = "What is Deep Learning?")]
+    pub prompt: Prompt,
+
+    /// The maximum number of tokens that can be generated in the completion.
+    #[serde(default)]
+    #[schema(default = "32")]
+    pub max_tokens: Option<u32>,
+
+    /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while
+    /// lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.
+    #[serde(default)]
+    #[schema(nullable = true, example = 1.0)]
+    pub temperature: Option<f32>,
+
+    /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
+    /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+    #[serde(default)]
+    #[schema(nullable = true, example = 0.95)]
+    pub top_p: Option<f32>,
+
+    #[serde(default = "bool::default")]
+    pub stream: bool,
+
+    #[schema(nullable = true, example = 42)]
+    pub seed: Option<u64>,
+
+    /// The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.
+    /// Please see the `completion_template` field in the model's `tokenizer_config.json` file for the completion template.
+    #[serde(default)]
+    pub suffix: Option<String>,
+
+    #[serde(default)]
+    pub repetition_penalty: Option<f32>,
+
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,
+    /// decreasing the model's likelihood to repeat the same line verbatim.
+    #[serde(default)]
+    #[schema(example = "1.0")]
+    pub frequency_penalty: Option<f32>,
+
+    /// Up to 4 sequences where the API will stop generating further tokens.
+    #[serde(default)]
+    #[schema(nullable = true, example = "null")]
+    pub stop: Option<Vec<String>>,
+}
+
+#[derive(Clone, Serialize, ToSchema)]
+#[serde(tag = "object")]
+enum Completion {
+    #[serde(rename = "text_completion")]
+    Chunk(Chunk),
+    #[serde(rename = "text_completion")]
+    Final(CompletionFinal),
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema, Default)]
+pub(crate) struct CompletionFinal {
+    pub id: String,
+    #[schema(example = "1706270835")]
+    pub created: u64,
+    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
+    pub model: String,
+    pub system_fingerprint: String,
+    pub choices: Vec<CompletionComplete>,
+    pub usage: Usage,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
+pub(crate) struct CompletionComplete {
+    pub index: u32,
+    pub text: String,
+    pub logprobs: Option<Vec<f32>>,
+    pub finish_reason: String,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
+pub(crate) struct Chunk {
+    pub id: String,
+    pub created: u64,
+    pub choices: Vec<CompletionComplete>,
+    pub model: String,
+    pub system_fingerprint: String,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
+pub(crate) struct ChatCompletion {
+    pub id: String,
+    #[schema(example = "1706270835")]
+    pub created: u64,
+    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
+    pub model: String,
+    pub system_fingerprint: String,
+    pub choices: Vec<ChatCompletionComplete>,
+    pub usage: Usage,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
+pub(crate) struct ChatCompletionComplete {
+    pub index: u32,
+    pub message: OutputMessage,
+    pub logprobs: Option<ChatCompletionLogprobs>,
+    pub finish_reason: String,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
+pub(crate) struct ChatCompletionLogprobs {
+    content: Vec<ChatCompletionLogprob>,
+}
+
+impl From<(Token, Vec<Token>)> for ChatCompletionLogprobs {
+    /// Wraps one generated token and its top-token alternatives as a
+    /// single-entry logprobs list.
+    fn from((token, top_tokens): (Token, Vec<Token>)) -> Self {
+        let top_logprobs = top_tokens
+            .into_iter()
+            .map(|alt| ChatCompletionTopLogprob {
+                token: alt.text,
+                logprob: alt.logprob,
+            })
+            .collect();
+        let entry = ChatCompletionLogprob {
+            token: token.text,
+            logprob: token.logprob,
+            top_logprobs,
+        };
+        Self { content: vec![entry] }
+    }
+}
+
+impl From<(Vec<Token>, Vec<Vec<Token>>)> for ChatCompletionLogprobs {
+    /// Pairs each generated token with its top-token alternatives by position.
+    /// Tokens past the end of `top_tokens` get an empty alternatives list, and
+    /// surplus `top_tokens` entries are dropped.
+    fn from((tokens, top_tokens): (Vec<Token>, Vec<Vec<Token>>)) -> Self {
+        // Pad with empty lists so the zip below is bounded by `tokens` alone.
+        let padded_alternatives = top_tokens
+            .into_iter()
+            .chain(std::iter::repeat_with(Vec::new));
+
+        let content = tokens
+            .into_iter()
+            .zip(padded_alternatives)
+            .map(|(token, alternatives)| {
+                let top_logprobs = alternatives
+                    .into_iter()
+                    .map(|alt| ChatCompletionTopLogprob {
+                        token: alt.text,
+                        logprob: alt.logprob,
+                    })
+                    .collect();
+                ChatCompletionLogprob {
+                    token: token.text,
+                    logprob: token.logprob,
+                    top_logprobs,
+                }
+            })
+            .collect();
+
+        Self { content }
+    }
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
+pub(crate) struct ChatCompletionLogprob {
+    token: String,
+    logprob: f32,
+    top_logprobs: Vec<ChatCompletionTopLogprob>,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
+pub(crate) struct ChatCompletionTopLogprob {
+    token: String,
+    logprob: f32,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema, Default)]
+pub(crate) struct Usage {
+    pub prompt_tokens: u32,
+    pub completion_tokens: u32,
+    pub total_tokens: u32,
+}
+
+#[derive(Clone, Serialize, ToSchema)]
+#[serde(tag = "object")]
+enum CompletionType {
+    #[serde(rename = "chat.completion.chunk")]
+    ChatCompletionChunk(ChatCompletionChunk),
+    #[serde(rename = "chat.completion")]
+    ChatCompletion(ChatCompletion),
+}
+
+impl ChatCompletion {
+    /// Assembles a single-choice chat completion from the generation `details`.
+    /// Text `output` wins over `tool_calls` when both are present; when neither
+    /// is, the assistant message has empty content. Both cases log a warning.
+    pub(crate) fn new(
+        model: String,
+        system_fingerprint: String,
+        output: Option<String>,
+        created: u64,
+        details: Details,
+        return_logprobs: bool,
+        tool_calls: Option<Vec<ToolCall>>,
+    ) -> Self {
+        let role = "assistant".to_string();
+        let message = match (output, tool_calls) {
+            (None, Some(tool_calls)) => {
+                OutputMessage::ToolCall(ToolCallMessage { role, tool_calls })
+            }
+            (Some(content), None) => OutputMessage::ChatMessage(TextMessage { role, content }),
+            (Some(content), Some(_)) => {
+                warn!("Received both chat and tool call");
+                OutputMessage::ChatMessage(TextMessage { role, content })
+            }
+            (None, None) => {
+                warn!("Didn't receive an answer");
+                let content = String::new();
+                OutputMessage::ChatMessage(TextMessage { role, content })
+            }
+        };
+        // Derive the usage numbers once so `total_tokens` reuses them.
+        let prompt_tokens = details.prefill.len() as u32;
+        let completion_tokens = details.generated_tokens;
+        let logprobs = return_logprobs
+            .then(|| ChatCompletionLogprobs::from((details.tokens, details.top_tokens)));
+        let finish_reason = details.finish_reason.to_string();
+        Self {
+            id: String::new(),
+            created,
+            model,
+            system_fingerprint,
+            choices: vec![ChatCompletionComplete {
+                index: 0,
+                message,
+                logprobs,
+                finish_reason,
+            }],
+            usage: Usage {
+                prompt_tokens,
+                completion_tokens,
+                total_tokens: prompt_tokens + completion_tokens,
+            },
+        }
+    }
+}
+#[derive(Clone, Serialize, ToSchema)]
+pub(crate) struct ChatCompletionChunk {
+    pub id: String,
+    #[schema(example = "1706270978")]
+    pub created: u64,
+    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
+    pub model: String,
+    pub system_fingerprint: String,
+    pub choices: Vec<ChatCompletionChoice>,
+}
+
+#[derive(Clone, Serialize, ToSchema)]
+pub(crate) struct ChatCompletionChoice {
+    pub index: u32,
+    pub delta: ChatCompletionDelta,
+    pub logprobs: Option<ChatCompletionLogprobs>,
+    pub finish_reason: Option<String>,
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
+pub struct ToolCallDelta {
+    #[schema(example = "assistant")]
+    role: String,
+    tool_calls: DeltaToolCall,
+}
+
+#[derive(Clone, Debug, Serialize, ToSchema)]
+#[serde(untagged)]
+enum ChatCompletionDelta {
+    Chat(TextMessage),
+    Tool(ToolCallDelta),
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
+pub(crate) struct DeltaToolCall {
+    pub index: u32,
+    pub id: String,
+    pub r#type: String,
+    pub function: Function,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
+pub(crate) struct Function {
+    pub name: Option<String>,
+    pub arguments: String,
+}
+
+#[allow(clippy::too_many_arguments)]
+impl ChatCompletionChunk {
+    /// Builds a one-choice streaming chunk whose delta has the "assistant" role.
+    /// Text `delta` wins over `tool_calls`; otherwise the first tool call (or an
+    /// empty string when the list is empty) becomes the function arguments.
+    pub(crate) fn new(
+        model: String,
+        system_fingerprint: String,
+        delta: Option<String>,
+        tool_calls: Option<Vec<String>>,
+        created: u64,
+        logprobs: Option<ChatCompletionLogprobs>,
+        finish_reason: Option<String>,
+    ) -> Self {
+        let role = "assistant".to_string();
+        let delta = match (delta, tool_calls) {
+            (None, Some(tool_calls)) => ChatCompletionDelta::Tool(ToolCallDelta {
+                role,
+                tool_calls: DeltaToolCall {
+                    index: 0,
+                    id: String::new(),
+                    r#type: "function".to_string(),
+                    function: Function {
+                        name: None,
+                        arguments: tool_calls.into_iter().next().unwrap_or_default(),
+                    },
+                },
+            }),
+            (content, _) => ChatCompletionDelta::Chat(TextMessage {
+                role,
+                content: content.unwrap_or_default(),
+            }),
+        };
+        Self {
+            id: String::new(),
+            created,
+            model,
+            system_fingerprint,
+            choices: vec![ChatCompletionChoice {
+                index: 0,
+                delta,
+                logprobs,
+                finish_reason,
+            }],
+        }
+    }
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize)]
+pub(crate) struct ChatRequest {
+    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
+    /// [UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
+    pub model: String,
+
+    /// A list of messages comprising the conversation so far.
+    #[schema(example = "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]")]
+    pub messages: Vec<Message>,
+
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,
+    /// decreasing the model's likelihood to repeat the same line verbatim.
+    #[serde(default)]
+    #[schema(example = "1.0")]
+    pub frequency_penalty: Option<f32>,
+
+    /// UNUSED
+    /// Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens
+    /// (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,
+    /// the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,
+    /// but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should
+    /// result in a ban or exclusive selection of the relevant token.
+    #[serde(default)]
+    pub logit_bias: Option<Vec<f32>>,
+
+    /// Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each
+    /// output token returned in the content of message.
+    #[serde(default)]
+    #[schema(example = "false")]
+    pub logprobs: Option<bool>,
+
+    /// An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with
+    /// an associated log probability. logprobs must be set to true if this parameter is used.
+    #[serde(default)]
+    #[schema(example = "5")]
+    pub top_logprobs: Option<u32>,
+
+    /// The maximum number of tokens that can be generated in the chat completion.
+    #[serde(default)]
+    #[schema(example = "32")]
+    pub max_tokens: Option<u32>,
+
+    /// UNUSED
+    /// How many chat completion choices to generate for each input message. Note that you will be charged based on the
+    /// number of generated tokens across all of the choices. Keep n as 1 to minimize costs.
+    #[serde(default)]
+    #[schema(nullable = true, example = "2")]
+    pub n: Option<u32>,
+
+    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,
+    /// increasing the model's likelihood to talk about new topics
+    #[serde(default)]
+    #[schema(nullable = true, example = 0.1)]
+    pub presence_penalty: Option<f32>,
+
+    /// Up to 4 sequences where the API will stop generating further tokens.
+    #[serde(default)]
+    #[schema(nullable = true, example = "null")]
+    pub stop: Option<Vec<String>>,
+
+    #[serde(default = "bool::default")]
+    pub stream: bool,
+
+    #[schema(nullable = true, example = 42)]
+    pub seed: Option<u64>,
+
+    /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while
+    /// lower values like 0.2 will make it more focused and deterministic.
+    ///
+    /// We generally recommend altering this or `top_p` but not both.
+    #[serde(default)]
+    #[schema(nullable = true, example = 1.0)]
+    pub temperature: Option<f32>,
+
+    /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
+    /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
+    #[serde(default)]
+    #[schema(nullable = true, example = 0.95)]
+    pub top_p: Option<f32>,
+
+    /// A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of
+    /// functions the model may generate JSON inputs for.
+    #[serde(default)]
+    #[schema(nullable = true, example = "null")]
+    pub tools: Option<Vec<Tool>>,
+
+    /// A prompt to be appended before the tools
+    #[serde(default = "default_tool_prompt")]
+    #[schema(
+        nullable = true,
+        example = "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\""
+    )]
+    pub tool_prompt: Option<String>,
+
+    /// A specific tool to use. If not provided, the model will default to use any of the tools provided in the tools parameter.
+    #[serde(default)]
+    #[schema(nullable = true, example = "null")]
+    pub tool_choice: Option<ToolType>,
+
+    /// Response format constraints for the generation.
+    ///
+    /// NOTE: A request can use `response_format` OR `tools` but not both.
+    #[serde(default)]
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub response_format: Option<GrammarType>,
+}
+
+fn default_tool_prompt() -> Option<String> {
+    Some(
+        "\nYou will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n".to_string(),
+    )
+}
+
+#[derive(Clone, Debug, Deserialize, PartialEq, Serialize, ToSchema)]
+#[serde(untagged)]
+pub enum ToolType {
+    OneOf,
+    FunctionName(String),
+    Function { function: FunctionName },
+}
+
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct FunctionName {
+    pub name: String,
+}
+
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[serde(from = "ToolTypeDeserializer")]
+pub struct ToolChoice(pub Option<ToolType>);
+
+#[derive(Deserialize)]
+#[serde(untagged)]
+enum ToolTypeDeserializer {
+    None(Option<String>),
+    Some(ToolType),
+}
+
+impl From<ToolTypeDeserializer> for ToolChoice {
+    fn from(value: ToolTypeDeserializer) -> Self {
+        match value {
+            ToolTypeDeserializer::None(opt) => match opt.as_deref() {
+                Some("none") => ToolChoice(None),
+                Some("auto") => ToolChoice(Some(ToolType::OneOf)),
+                Some(s) => ToolChoice(Some(ToolType::FunctionName(s.to_string()))),
+                None => ToolChoice(Some(ToolType::OneOf)),
+            },
+            ToolTypeDeserializer::Some(tool_type) => ToolChoice(Some(tool_type)),
+        }
+    }
+}
+
+#[derive(Debug, Deserialize, Serialize, ToSchema, PartialEq)]
+pub struct Tools {
+    #[serde(flatten)]
+    functions_map: FunctionsMap,
+    properties: Properties,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq)]
+struct FunctionsMap {
+    #[serde(rename = "$functions")]
+    functions: std::collections::HashMap<String, serde_json::Value>,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq)]
+struct FunctionRef {
+    #[serde(rename = "$ref")]
+    ref_path: String,
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq)]
+struct Properties {
+    #[serde(serialize_with = "serialize_function")]
+    function: Vec<FunctionRef>,
+}
+
+fn serialize_function<S>(functions: &Vec<FunctionRef>, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    use serde::ser::SerializeStruct;
+    let mut state = serializer.serialize_struct("Function", 1)?;
+    state.serialize_field("anyOf", functions)?;
+    state.end()
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize, ToSchema, Default, PartialEq)]
+pub(crate) struct FunctionDefinition {
+    #[serde(default)]
+    pub description: Option<String>,
+    pub name: String,
+    #[serde(alias = "parameters")]
+    pub arguments: serde_json::Value,
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)]
+pub(crate) struct Tool {
+    // The type of the tool. Currently, only 'function' is supported.
+    #[schema(example = "function")]
+    pub r#type: String,
+    // Grab the tool as generic JSON for debugging purposes.
+    pub function: FunctionDefinition,
+}
+
+#[derive(Clone, Serialize, Deserialize, Default)]
+pub(crate) struct ChatTemplateInputs<'a> {
+    messages: Vec<TextMessage>,
+    bos_token: Option<&'a str>,
+    eos_token: Option<&'a str>,
+    add_generation_prompt: bool,
+    tools: Option<&'a str>,
+    tools_prompt: Option<&'a str>,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema, Default, Debug, PartialEq)]
+pub(crate) struct ToolCall {
+    pub id: String,
+    pub r#type: String,
+    pub function: FunctionDefinition,
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
+pub struct Url {
+    url: String,
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
+#[serde(tag = "type")]
+#[serde(rename_all = "snake_case")]
+pub enum MessageChunk {
+    Text { text: String },
+    ImageUrl { image_url: Url },
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
+pub struct Message {
+    #[schema(example = "user")]
+    role: String,
+    #[schema(example = "My name is David and I")]
+    pub content: MessageContent,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    #[schema(example = "\"David\"")]
+    name: Option<String>,
+}
+
+#[derive(Clone, Deserialize, Serialize, ToSchema, Debug, PartialEq)]
+#[serde(untagged)]
+pub enum MessageContent {
+    SingleText(String),
+    MultipleChunks(Vec<MessageChunk>),
+}
+
+impl MessageContent {
+    /// Appends `chunk` to the message. A single-text message is first turned
+    /// into a multiple-chunks message: its text becomes the first chunk and
+    /// the pushed `chunk` follows it.
+    pub fn push(&mut self, chunk: MessageChunk) {
+        match self {
+            MessageContent::SingleText(text) => {
+                let text = std::mem::take(text);
+                *self = MessageContent::MultipleChunks(vec![MessageChunk::Text { text }, chunk]);
+            }
+            MessageContent::MultipleChunks(chunks) => chunks.push(chunk),
+        }
+    }
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
+pub struct TextMessage {
+    #[schema(example = "user")]
+    pub role: String,
+    #[schema(example = "My name is David and I")]
+    pub content: String,
+}
+
+impl From<Message> for TextMessage {
+    /// Flattens a message into plain text: text chunks are kept as-is and
+    /// image chunks become `![](url)`, all concatenated without a separator.
+    fn from(value: Message) -> Self {
+        let Message { role, content, .. } = value;
+        let content = match content {
+            MessageContent::SingleText(text) => text,
+            MessageContent::MultipleChunks(chunks) => chunks
+                .into_iter()
+                .map(|chunk| match chunk {
+                    MessageChunk::Text { text } => text,
+                    MessageChunk::ImageUrl { image_url } => format!("![]({})", image_url.url),
+                })
+                .collect::<String>(),
+        };
+        TextMessage { role, content }
+    }
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
+pub struct ToolCallMessage {
+    #[schema(example = "assistant")]
+    role: String,
+    tool_calls: Vec<ToolCall>,
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize, Debug, PartialEq)]
+#[serde(untagged)]
+pub(crate) enum OutputMessage {
+    ChatMessage(TextMessage),
+    ToolCall(ToolCallMessage),
+}
+
 #[derive(Clone, Debug, Deserialize, ToSchema)]
 pub(crate) struct GenerateRequest {
     #[schema(example = "My name is Olivier and I")]
@@ -203,7 +1073,7 @@ pub struct PrefillToken {
     logprob: f32,
 }
 
-#[derive(Debug, Serialize, ToSchema)]
+#[derive(Debug, Serialize, ToSchema, Clone)]
 pub struct Token {
     #[schema(example = 0)]
     id: u32,
@@ -215,8 +1085,21 @@ pub struct Token {
     special: bool,
 }
 
-#[derive(Serialize, ToSchema)]
+#[derive(Debug, Serialize, ToSchema)]
+pub struct SimpleToken {
+    #[schema(example = 0)]
+    id: u32,
+    #[schema(example = "test")]
+    text: String,
+    #[schema(example = 0)]
+    start: usize,
+    #[schema(example = 2)]
+    stop: usize,
+}
+
+#[derive(Debug, Serialize, ToSchema)]
 #[serde(rename_all(serialize = "snake_case"))]
+#[schema(example = "Length")]
 pub(crate) enum FinishReason {
     #[schema(rename = "length")]
     Length,
@@ -227,6 +1110,16 @@ pub(crate) enum FinishReason {
     StopSequence,
 }
 
+impl std::fmt::Display for FinishReason {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            FinishReason::Length => write!(f, "length"),
+            FinishReason::EndOfSequenceToken => write!(f, "eos_token"),
+            FinishReason::StopSequence => write!(f, "stop_sequence"),
+        }
+    }
+}
+
 #[derive(Serialize, ToSchema)]
 pub(crate) struct BestOfSequence {
     #[schema(example = "test")]
@@ -267,6 +1160,10 @@ pub(crate) struct GenerateResponse {
     pub details: Option<Details>,
 }
 
+#[derive(Serialize, ToSchema)]
+#[serde(transparent)]
+pub(crate) struct TokenizeResponse(Vec<SimpleToken>);
+
 #[derive(Serialize, ToSchema)]
 pub(crate) struct StreamDetails {
     #[schema(example = "length")]
@@ -279,6 +1176,7 @@ pub(crate) struct StreamDetails {
 
 #[derive(Serialize, ToSchema)]
 pub(crate) struct StreamResponse {
+    pub index: u32,
     pub token: Token,
     #[serde(skip_serializing_if = "Vec::is_empty")]
     pub top_tokens: Vec<Token>,
@@ -296,20 +1194,181 @@ pub(crate) struct ErrorResponse {
 
 #[cfg(test)]
 mod tests {
-    use std::io::Write;
+    use super::*;
+    use serde_json::json;
     use tokenizers::Tokenizer;
 
     pub(crate) async fn get_tokenizer() -> Tokenizer {
-        if !std::path::Path::new("tokenizer.json").exists() {
-            let content = reqwest::get("https://huggingface.co/gpt2/raw/main/tokenizer.json")
-                .await
-                .unwrap()
-                .bytes()
-                .await
-                .unwrap();
-            let mut file = std::fs::File::create("tokenizer.json").unwrap();
-            file.write_all(&content).unwrap();
-        }
-        Tokenizer::from_file("tokenizer.json").unwrap()
+        let api = hf_hub::api::sync::Api::new().unwrap();
+        let repo = api.model("gpt2".to_string());
+        let filename = repo.get("tokenizer.json").unwrap();
+        Tokenizer::from_file(filename).unwrap()
+    }
+
+    #[test]
+    fn test_hub_nested_tokens_tokenizer_config() {
+        // this is a subset of the tokenizer.json file
+        // in this case we expect the tokens to be encoded as simple strings
+        let json_content = r#"{
+            "chat_template": "test",
+            "bos_token": "<｜begin▁of▁sentence｜>",
+            "eos_token": "<｜end▁of▁sentence｜>"
+        }"#;
+
+        let config: HubTokenizerConfig = serde_json::from_str(json_content).unwrap();
+
+        // check that we successfully parsed the tokens
+        assert_eq!(
+            config.chat_template,
+            Some(ChatTemplateVersions::Single("test".to_string()))
+        );
+        assert_eq!(
+            config.bos_token,
+            Some(TokenizerConfigToken::String(
+                "<｜begin▁of▁sentence｜>".to_string()
+            ))
+        );
+        assert_eq!(
+            config.eos_token,
+            Some(TokenizerConfigToken::String(
+                "<｜end▁of▁sentence｜>".to_string()
+            ))
+        );
+
+        // in this case we expect the tokens to be encoded as structured tokens
+        // we want the content of the structured token
+        let json_content = r#"{
+            "chat_template": "test",
+            "bos_token": {
+              "__type": "AddedToken",
+              "content": "<｜begin▁of▁sentence｜>",
+              "lstrip": false,
+              "normalized": true,
+              "rstrip": false,
+              "single_word": false
+            },
+            "eos_token": {
+              "__type": "AddedToken",
+              "content": "<｜end▁of▁sentence｜>",
+              "lstrip": false,
+              "normalized": true,
+              "rstrip": false,
+              "single_word": false
+            }
+        }"#;
+
+        let config: HubTokenizerConfig = serde_json::from_str(json_content).unwrap();
+
+        // check that we successfully parsed the tokens
+        assert_eq!(
+            config.chat_template,
+            Some(ChatTemplateVersions::Single("test".to_string()))
+        );
+        assert_eq!(
+            config.bos_token,
+            Some(TokenizerConfigToken::Object {
+                content: "<｜begin▁of▁sentence｜>".to_string()
+            })
+        );
+        assert_eq!(
+            config.eos_token,
+            Some(TokenizerConfigToken::Object {
+                content: "<｜end▁of▁sentence｜>".to_string()
+            })
+        );
+    }
+
+    #[test]
+    fn test_chat_simple_string() {
+        let json = json!({
+            "model": "",
+            "messages": [{
+                "role": "user",
+                "content": "What is Deep Learning?"
+            }]
+        });
+        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();
+
+        assert_eq!(
+            request.messages[0],
+            Message {
+                role: "user".to_string(),
+                content: MessageContent::SingleText("What is Deep Learning?".to_string()),
+                name: None
+            }
+        );
+    }
+
+    #[test]
+    fn test_chat_request() {
+        let json = json!({
+            "model": "",
+            "messages": [{
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Whats in this image?"},
+                    {"type": "image_url", "image_url": {"url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"}},
+                ]
+            }]
+        });
+        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();
+
+        assert_eq!(
+            request.messages[0],
+            Message{
+                role: "user".to_string(),
+                content: MessageContent::MultipleChunks(vec![
+                    MessageChunk::Text { text: "Whats in this image?".to_string() },
+                    MessageChunk::ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() }},
+                ]),
+                name: None
+            }
+        );
+    }
+
+    #[test]
+    fn text_message_convert() {
+        let message = Message{
+                role: "user".to_string(),
+                content: MessageContent::MultipleChunks(vec![
+                    MessageChunk::Text { text: "Whats in this image?".to_string() },
+                    MessageChunk::ImageUrl { image_url: Url { url: "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png".to_string() } }
+                ]),
+                name: None
+            };
+        let textmsg: TextMessage = message.into();
+        assert_eq!(textmsg.content, "Whats in this image?![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)");
+    }
+    #[test]
+    fn openai_output() {
+        let message = OutputMessage::ChatMessage(TextMessage {
+            role: "assistant".to_string(),
+            content: "This is the answer".to_string(),
+        });
+        let serialized = serde_json::to_string(&message).unwrap();
+        assert_eq!(
+            serialized,
+            r#"{"role":"assistant","content":"This is the answer"}"#
+        );
+
+        let message = OutputMessage::ToolCall(ToolCallMessage {
+            role: "assistant".to_string(),
+            tool_calls: vec![ToolCall {
+                id: "0".to_string(),
+                r#type: "function".to_string(),
+                function: FunctionDefinition {
+                    description: None,
+                    name: "myfn".to_string(),
+                    arguments: json!({
+                        "format": "csv"
+                    }),
+                },
+            }],
+        });
+        let serialized = serde_json::to_string(&message).unwrap();
+        assert_eq!(
+            serialized,
+            r#"{"role":"assistant","tool_calls":[{"id":"0","type":"function","function":{"description":null,"name":"myfn","arguments":{"format":"csv"}}}]}"#
+        );
     }
 }
diff --git a/router/src/main.rs b/router/src/main.rs
index 4903c066..21cd6649 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -1,28 +1,36 @@
-/// Text Generation Inference webserver entrypoint
 use axum::http::HeaderValue;
 use clap::Parser;
+use clap::Subcommand;
+use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo};
+use hf_hub::{Cache, Repo, RepoType};
 use opentelemetry::sdk::propagation::TraceContextPropagator;
 use opentelemetry::sdk::trace;
 use opentelemetry::sdk::trace::Sampler;
 use opentelemetry::sdk::Resource;
 use opentelemetry::{global, KeyValue};
 use opentelemetry_otlp::WithExportConfig;
+use std::fs::File;
+use std::io::BufReader;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr};
-use std::path::Path;
-use std::time::Duration;
-use text_generation_client::{ClientError, ShardedClient};
-use text_generation_router::{server, HubModelInfo};
+use std::path::{Path, PathBuf};
+use text_generation_router::config::Config;
+use text_generation_router::{
+    server, HubModelInfo, HubPreprocessorConfig, HubProcessorConfig, HubTokenizerConfig,
+};
 use thiserror::Error;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use tokenizers::{processors::template::TemplateProcessing, Tokenizer};
 use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
-use tracing_subscriber::{EnvFilter, Layer};
+use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};
 
 /// App Configuration
 #[derive(Parser, Debug)]
 #[clap(author, version, about, long_about = None)]
 struct Args {
+    #[command(subcommand)]
+    command: Option<Commands>,
+
     #[clap(default_value = "128", long, env)]
     max_concurrent_requests: usize,
     #[clap(default_value = "2", long, env)]
@@ -32,7 +40,7 @@ struct Args {
     #[clap(default_value = "5", long, env)]
     max_top_n_tokens: u32,
     #[clap(default_value = "1024", long, env)]
-    max_input_length: usize,
+    max_input_tokens: usize,
     #[clap(default_value = "2048", long, env)]
     max_total_tokens: usize,
     #[clap(default_value = "1.2", long, env)]
@@ -43,6 +51,8 @@ struct Args {
     max_batch_total_tokens: Option<u32>,
     #[clap(default_value = "20", long, env)]
     max_waiting_tokens: usize,
+    #[clap(long, env)]
+    max_batch_size: Option<usize>,
     #[clap(default_value = "0.0.0.0", long, env)]
     hostname: String,
     #[clap(default_value = "3000", long, short, env)]
@@ -52,6 +62,8 @@ struct Args {
     #[clap(default_value = "bigscience/bloom", long, env)]
     tokenizer_name: String,
     #[clap(long, env)]
+    tokenizer_config_path: Option<String>,
+    #[clap(long, env)]
     revision: Option<String>,
     #[clap(default_value = "2", long, env)]
     validation_workers: usize,
@@ -59,6 +71,8 @@ struct Args {
     json_output: bool,
     #[clap(long, env)]
     otlp_endpoint: Option<String>,
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
     #[clap(long, env)]
     cors_allow_origin: Option<Vec<String>>,
     #[clap(long, env)]
@@ -67,45 +81,73 @@ struct Args {
     ngrok_authtoken: Option<String>,
     #[clap(long, env)]
     ngrok_edge: Option<String>,
+    #[clap(long, env, default_value_t = false)]
+    messages_api_enabled: bool,
+    #[clap(long, env, default_value_t = false)]
+    disable_grammar_support: bool,
+    #[clap(default_value = "4", long, env)]
+    max_client_batch_size: usize,
 }
 
-fn main() -> Result<(), RouterError> {
-    // Get args
+#[derive(Debug, Subcommand)]
+enum Commands { // optional CLI subcommands, stored in `Args::command`
+    PrintSchema, // when selected, `main` skips `init_logging` and passes `print_schema_command = true` to `server::run`
+}
+
+#[tokio::main]
+async fn main() -> Result<(), RouterError> {
     let args = Args::parse();
+
     // Pattern match configuration
     let Args {
         max_concurrent_requests,
         max_best_of,
         max_stop_sequences,
         max_top_n_tokens,
-        max_input_length,
+        max_input_tokens,
         max_total_tokens,
         waiting_served_ratio,
         max_batch_prefill_tokens,
         max_batch_total_tokens,
         max_waiting_tokens,
+        max_batch_size,
         hostname,
         port,
         master_shard_uds_path,
         tokenizer_name,
+        tokenizer_config_path,
         revision,
         validation_workers,
         json_output,
         otlp_endpoint,
+        otlp_service_name,
         cors_allow_origin,
         ngrok,
         ngrok_authtoken,
         ngrok_edge,
+        messages_api_enabled,
+        disable_grammar_support,
+        max_client_batch_size,
+        command,
     } = args;
 
+    let print_schema_command = match command {
+        Some(Commands::PrintSchema) => true,
+        None => {
+            // only init logging if we are not running the print schema command
+            init_logging(otlp_endpoint, otlp_service_name, json_output);
+            false
+        }
+    };
+
     // Validate args
-    if max_input_length >= max_total_tokens {
+    if max_input_tokens >= max_total_tokens {
         return Err(RouterError::ArgumentValidation(
-            "`max_input_length` must be < `max_total_tokens`".to_string(),
+            "`max_input_tokens` must be < `max_total_tokens`".to_string(),
         ));
     }
-    if max_input_length as u32 > max_batch_prefill_tokens {
-        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_length`. Given: {max_batch_prefill_tokens} and {max_input_length}")));
+    if max_input_tokens as u32 > max_batch_prefill_tokens {
+        return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}")));
     }
 
     if validation_workers == 0 {
@@ -135,164 +177,253 @@ fn main() -> Result<(), RouterError> {
     });
 
     // Parse Huggingface hub token
-    let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
+    let authorization_token = std::env::var("HF_TOKEN")
+        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+        .ok();
 
     // Tokenizer instance
     // This will only be used to validate payloads
     let local_path = Path::new(&tokenizer_name);
-    let local_model = local_path.exists() && local_path.is_dir();
-    let tokenizer = if local_model {
-        // Load local tokenizer
-        Tokenizer::from_file(local_path.join("tokenizer.json")).ok()
-    } else {
-        // Download and instantiate tokenizer
-        // We need to download it outside of the Tokio runtime
-        let params = FromPretrainedParameters {
-            revision: revision.clone().unwrap_or("main".to_string()),
-            auth_token: authorization_token.clone(),
-            ..Default::default()
-        };
-        Tokenizer::from_pretrained(tokenizer_name.clone(), Some(params)).ok()
+
+    // Shared API builder initialization
+    let api_builder = || {
+        let mut builder = ApiBuilder::new()
+            .with_progress(false)
+            .with_token(authorization_token);
+
+        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+            builder = builder.with_cache_dir(cache_dir.into());
+        }
+
+        builder
     };
 
-    // Launch Tokio runtime
-    tokio::runtime::Builder::new_multi_thread()
-        .enable_all()
-        .build()?
-        .block_on(async {
-            init_logging(otlp_endpoint, json_output);
+    // Decide if we need to use the API based on the revision and local path
+    let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();
 
-            if tokenizer.is_none() {
-                tracing::warn!(
-                    "Could not find a fast tokenizer implementation for {tokenizer_name}"
-                );
-                tracing::warn!("Rust input length validation and truncation is disabled");
-            }
-
-            // Get Model info
-            let model_info = match local_model {
-                true => HubModelInfo {
-                    model_id: tokenizer_name.clone(),
-                    sha: None,
-                    pipeline_tag: None,
-                },
-                false => get_model_info(&tokenizer_name, revision, authorization_token)
-                    .await
-                    .unwrap_or_else(|| {
-                        tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
-                        HubModelInfo {
-                            model_id: tokenizer_name.to_string(),
-                            sha: None,
-                            pipeline_tag: None,
-                        }
-                    }),
-            };
-
-            // if pipeline-tag == text-generation we default to return_full_text = true
-            let compat_return_full_text = match &model_info.pipeline_tag {
-                None => {
-                    tracing::warn!("no pipeline tag found for model {tokenizer_name}");
-                    false
-                }
-                Some(pipeline_tag) => pipeline_tag.as_str() == "text-generation",
-            };
-
-            // Instantiate sharded client from the master unix socket
-            let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
-                .await
-                .map_err(RouterError::Connection)?;
-            // Clear the cache; useful if the webserver rebooted
-            sharded_client
-                .clear_cache(None)
-                .await
-                .map_err(RouterError::Cache)?;
-            // Get info from the shard
-            let shard_info = sharded_client.info().await.map_err(RouterError::Info)?;
-
-            // Warmup model
-            tracing::info!("Warming up model");
-            let max_supported_batch_total_tokens = match sharded_client
-                .warmup(max_input_length as u32, max_batch_prefill_tokens)
-                .await
-                .map_err(RouterError::Warmup)?
-            {
-                // Older models do not support automatic max-batch-total-tokens
-                None => {
-                    let max_batch_total_tokens = max_batch_total_tokens.unwrap_or(
-                        16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)),
-                    );
-                    tracing::warn!("Model does not support automatic max batch total tokens");
-                    max_batch_total_tokens
-                }
-                // Flash attention models return their max supported total tokens
-                Some(max_supported_batch_total_tokens) => {
-                    // Warn if user added his own max-batch-total-tokens as we will ignore it
-                    if max_batch_total_tokens.is_some() {
-                        tracing::warn!(
-                            "`--max-batch-total-tokens` is deprecated for Flash \
-                        Attention models."
-                        );
-                        tracing::warn!(
-                            "Inferred max batch total tokens: {max_supported_batch_total_tokens}"
-                        );
-                    }
-                    if max_total_tokens as u32 > max_supported_batch_total_tokens {
-                        return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_supported_batch_total_tokens}")));
-                    }
-
-                    max_supported_batch_total_tokens
-                }
-            };
-            tracing::info!("Setting max batch total tokens to {max_supported_batch_total_tokens}");
-            tracing::info!("Connected");
-
-            let addr = match hostname.parse() {
-                Ok(ip) => SocketAddr::new(ip, port),
+    // Initialize API if needed
+    #[derive(Clone)]
+    enum Type {
+        Api(Api),
+        Cache(Cache),
+        None,
+    }
+    let api = if use_api {
+        if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) {
+            let cache = Cache::default();
+            tracing::warn!("Offline mode active using cache defaults");
+            Type::Cache(cache)
+        } else {
+            tracing::info!("Using the Hugging Face API");
+            match api_builder().build() {
+                Ok(api) => Type::Api(api),
                 Err(_) => {
-                    tracing::warn!("Invalid hostname, defaulting to 0.0.0.0");
-                    SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), port)
+                    tracing::warn!("Unable to build the Hugging Face API");
+                    Type::None
                 }
-            };
+            }
+        }
+    } else {
+        Type::None
+    };
 
-            // Run server
-            server::run(
+    // Load tokenizer and model info
+    let (
+        tokenizer_filename,
+        config_filename,
+        tokenizer_config_filename,
+        preprocessor_config_filename,
+        processor_config_filename,
+        model_info,
+    ) = match api {
+        Type::None => (
+            Some(local_path.join("tokenizer.json")),
+            Some(local_path.join("config.json")),
+            Some(local_path.join("tokenizer_config.json")),
+            Some(local_path.join("preprocessor_config.json")),
+            Some(local_path.join("processor_config.json")),
+            None,
+        ),
+        Type::Api(api) => {
+            let api_repo = api.repo(Repo::with_revision(
+                tokenizer_name.to_string(),
+                RepoType::Model,
+                revision.clone().unwrap_or_else(|| "main".to_string()),
+            ));
+
+            let tokenizer_filename = match api_repo.get("tokenizer.json").await {
+                Ok(tokenizer_filename) => Some(tokenizer_filename),
+                Err(_) => get_base_tokenizer(&api, &api_repo).await,
+            };
+            let config_filename = api_repo.get("config.json").await.ok();
+            let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
+            let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
+            let processor_config_filename = api_repo.get("processor_config.json").await.ok();
+
+            let model_info = if let Some(model_info) = get_model_info(&api_repo).await {
+                Some(model_info)
+            } else {
+                tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
+                None
+            };
+            (
+                tokenizer_filename,
+                config_filename,
+                tokenizer_config_filename,
+                preprocessor_config_filename,
+                processor_config_filename,
                 model_info,
-                shard_info,
-                compat_return_full_text,
-                max_concurrent_requests,
-                max_best_of,
-                max_stop_sequences,
-                max_top_n_tokens,
-                max_input_length,
-                max_total_tokens,
-                waiting_served_ratio,
-                max_batch_prefill_tokens,
-                max_supported_batch_total_tokens,
-                max_waiting_tokens,
-                sharded_client,
-                tokenizer,
-                validation_workers,
-                addr,
-                cors_allow_origin,
-                ngrok,
-                ngrok_authtoken,
-                ngrok_edge,
             )
-                .await?;
-            Ok(())
-        })
+        }
+        Type::Cache(cache) => {
+            let repo = cache.repo(Repo::with_revision(
+                tokenizer_name.to_string(),
+                RepoType::Model,
+                revision.clone().unwrap_or_else(|| "main".to_string()),
+            ));
+            (
+                repo.get("tokenizer.json"),
+                repo.get("config.json"),
+                repo.get("tokenizer_config.json"),
+                repo.get("preprocessor_config.json"),
+                repo.get("processor_config.json"),
+                None,
+            )
+        }
+    };
+    let config: Option<Config> = config_filename.and_then(|filename| {
+        std::fs::read_to_string(filename)
+            .ok()
+            .as_ref()
+            .and_then(|c| {
+                let config: Result<Config, _> = serde_json::from_str(c);
+                if let Err(err) = &config {
+                    tracing::warn!("Could not parse config {err:?}");
+                }
+                config.ok()
+            })
+    });
+    let model_info = model_info.unwrap_or_else(|| HubModelInfo {
+        model_id: tokenizer_name.to_string(),
+        sha: None,
+        pipeline_tag: None,
+    });
+
+    // Load the tokenizer config as a 'HubTokenizerConfig': an explicit `tokenizer_config_path` takes precedence over the repo's tokenizer_config.json.
+    let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
+    {
+        HubTokenizerConfig::from_file(filename)
+    } else {
+        tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
+    };
+    let tokenizer_config = tokenizer_config.unwrap_or_else(|| {
+        tracing::warn!("Could not find tokenizer config locally and no API specified");
+        HubTokenizerConfig::default()
+    });
+
+    let tokenizer: Option<Tokenizer> = tokenizer_filename.and_then(|filename| {
+        let mut tokenizer = Tokenizer::from_file(filename).ok();
+        if let Some(tokenizer) = &mut tokenizer {
+            if let Some(class) = &tokenizer_config.tokenizer_class {
+                if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{
+                    if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) {
+                        tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205");
+                        tokenizer.with_post_processor(post_processor);
+                    }
+                }
+            }
+        }
+        tokenizer
+    });
+
+    let preprocessor_config =
+        preprocessor_config_filename.and_then(HubPreprocessorConfig::from_file);
+    let processor_config = processor_config_filename
+        .and_then(HubProcessorConfig::from_file)
+        .unwrap_or_default();
+
+    tracing::info!("Using config {config:?}");
+    if tokenizer.is_none() {
+        tracing::warn!("Could not find a fast tokenizer implementation for {tokenizer_name}");
+        tracing::warn!("Rust input length validation and truncation is disabled");
+    }
+
+    // if pipeline-tag == text-generation, or the model has no pipeline tag, we default to return_full_text = true
+    let compat_return_full_text = match &model_info.pipeline_tag {
+        None => {
+            tracing::warn!("no pipeline tag found for model {tokenizer_name}");
+            true
+        }
+        Some(pipeline_tag) => pipeline_tag.as_str() == "text-generation",
+    };
+
+    // Determine the server port based on the feature and environment variable.
+    let port = if cfg!(feature = "google") {
+        std::env::var("AIP_HTTP_PORT")
+            .map(|aip_http_port| aip_http_port.parse::<u16>().unwrap_or(port))
+            .unwrap_or(port)
+    } else {
+        port
+    };
+
+    let addr = match hostname.parse() {
+        Ok(ip) => SocketAddr::new(ip, port),
+        Err(_) => {
+            tracing::warn!("Invalid hostname, defaulting to 0.0.0.0");
+            SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), port)
+        }
+    };
+
+    // Run server
+    server::run(
+        master_shard_uds_path,
+        model_info,
+        compat_return_full_text,
+        max_concurrent_requests,
+        max_best_of,
+        max_stop_sequences,
+        max_top_n_tokens,
+        max_input_tokens,
+        max_total_tokens,
+        waiting_served_ratio,
+        max_batch_prefill_tokens,
+        max_batch_total_tokens,
+        max_waiting_tokens,
+        max_batch_size,
+        tokenizer,
+        config,
+        validation_workers,
+        addr,
+        cors_allow_origin,
+        ngrok,
+        ngrok_authtoken,
+        ngrok_edge,
+        tokenizer_config,
+        preprocessor_config,
+        processor_config,
+        messages_api_enabled,
+        disable_grammar_support,
+        max_client_batch_size,
+        print_schema_command,
+    )
+    .await?;
+    Ok(())
 }
 
 /// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
 ///     - otlp_endpoint is an optional URL to an Open Telemetry collector
+///     - otlp_service_name is the `service.name` resource attribute attached to exported traces
 ///     - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO)
 ///     - LOG_FORMAT may be TEXT or JSON (default to TEXT)
-fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
+///     - LOG_COLORIZE: ANSI colors are enabled unless it is set to exactly "1" (unset or any other value keeps them on)
+fn init_logging(otlp_endpoint: Option<String>, otlp_service_name: String, json_output: bool) {
     let mut layers = Vec::new();
 
     // STDOUT/STDERR layer
+    let ansi = std::env::var("LOG_COLORIZE") != Ok("1".to_string());
     let fmt_layer = tracing_subscriber::fmt::layer()
         .with_file(true)
+        .with_ansi(ansi)
         .with_line_number(true);
 
     let fmt_layer = match json_output {
@@ -316,7 +447,7 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
                 trace::config()
                     .with_resource(Resource::new(vec![KeyValue::new(
                         "service.name",
-                        "text-generation-inference.router",
+                        otlp_service_name,
                     )]))
                     .with_sampler(Sampler::AlwaysOn),
             )
@@ -324,13 +455,26 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
 
         if let Ok(tracer) = tracer {
             layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed());
-            axum_tracing_opentelemetry::init_propagator().unwrap();
+            init_tracing_opentelemetry::init_propagator().unwrap();
         };
     }
 
     // Filter events with LOG_LEVEL
-    let env_filter =
-        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+    let varname = "LOG_LEVEL";
+    let env_filter = if let Ok(log_level) = std::env::var(varname) {
+        // Scope plain level names to our own crates so that simple logs are not spammed with tokio-level information
+        let log_level = match &log_level[..] {
+            "warn" => "text_generation_launcher=warn,text_generation_router=warn",
+            "info" => "text_generation_launcher=info,text_generation_router=info",
+            "debug" => "text_generation_launcher=debug,text_generation_router=debug",
+            log_level => log_level,
+        };
+        EnvFilter::builder()
+            .with_default_directive(LevelFilter::INFO.into())
+            .parse_lossy(log_level)
+    } else {
+        EnvFilter::new("info")
+    };
 
     tracing_subscriber::registry()
         .with(env_filter)
@@ -339,30 +483,8 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
 }
 
 /// get model info from the Huggingface Hub
-pub async fn get_model_info(
-    model_id: &str,
-    revision: Option<String>,
-    token: Option<String>,
-) -> Option<HubModelInfo> {
-    let revision = match revision {
-        None => {
-            tracing::warn!("`--revision` is not set");
-            tracing::warn!("We strongly advise to set it to a known supported commit.");
-            "main".to_string()
-        }
-        Some(revision) => revision,
-    };
-
-    let client = reqwest::Client::new();
-    // Poor man's urlencode
-    let revision = revision.replace('/', "%2F");
-    let url = format!("https://huggingface.co/api/models/{model_id}/revision/{revision}");
-    let mut builder = client.get(url).timeout(Duration::from_secs(5));
-    if let Some(token) = token {
-        builder = builder.bearer_auth(token);
-    }
-
-    let response = builder.send().await.ok()?;
+pub async fn get_model_info(api: &ApiRepo) -> Option<HubModelInfo> {
+    let response = api.info_request().send().await.ok()?;
 
     if response.status().is_success() {
         let hub_model_info: HubModelInfo =
@@ -379,20 +501,160 @@ pub async fn get_model_info(
     }
 }
 
+/// Fetch `tokenizer.json` from the repo named by `base_model_name_or_path` in this repo's `config.json` (revision `main`); returns `None` on any failure.
+pub async fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option<PathBuf> {
+    let config_filename = api_repo.get("config.json").await.ok()?;
+
+    // Open the file in read-only mode with buffer.
+    let file = File::open(config_filename).ok()?;
+    let reader = BufReader::new(file);
+
+    // Read the JSON contents of the file as an untyped `serde_json::Value`.
+    let config: serde_json::Value = serde_json::from_reader(reader).ok()?;
+
+    if let Some(serde_json::Value::String(base_model_id)) = config.get("base_model_name_or_path") { // only a string value counts
+        let api_base_repo = api.repo(Repo::with_revision(
+            base_model_id.to_string(),
+            RepoType::Model,
+            "main".to_string(), // the base repo is always read at `main`, independent of the caller's revision
+        ));
+
+        api_base_repo.get("tokenizer.json").await.ok()
+    } else {
+        None
+    }
+}
+
+/// Fetch `tokenizer_config.json` through `api_repo` and parse it into a `HubTokenizerConfig`; returns `None` if the fetch, open, or parse fails.
+pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubTokenizerConfig> {
+    let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok()?;
+
+    // Open the file in read-only mode with buffer.
+    let file = File::open(tokenizer_config_filename).ok()?;
+    let reader = BufReader::new(file);
+
+    // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
+    let tokenizer_config: HubTokenizerConfig = serde_json::from_reader(reader)
+        .map_err(|e| {
+            tracing::warn!("Unable to parse tokenizer config: {}", e); // a parse failure is logged before being turned into `None`
+            e
+        })
+        .ok()?;
+
+    Some(tokenizer_config)
+}
+
+/// Build the `TemplateProcessing` post-processor for the LlamaTokenizer: `[bos] $A [eos]` for single inputs, `[bos] $A [eos] [bos] $B [eos]` for pairs. Panics if an enabled bos/eos token is unset or has no id in `tokenizer`; builder errors are returned as `Err`.
+pub fn create_post_processor(
+    tokenizer: &Tokenizer,
+    tokenizer_config: &HubTokenizerConfig,
+) -> Result<TemplateProcessing, tokenizers::processors::template::TemplateProcessingBuilderError> {
+    let add_bos_token = tokenizer_config.add_bos_token.unwrap_or(true); // bos is added unless the config disables it
+    let add_eos_token = tokenizer_config.add_eos_token.unwrap_or(false); // eos is added only when the config enables it
+
+    let bos_token = tokenizer_config.bos_token.as_ref();
+    let eos_token = tokenizer_config.eos_token.as_ref();
+
+    if add_bos_token && bos_token.is_none() {
+        panic!("add_bos_token = true but bos_token is None");
+    }
+
+    if add_eos_token && eos_token.is_none() {
+        panic!("add_eos_token = true but eos_token is None");
+    }
+
+    let mut single = Vec::new(); // template pieces for a single sequence
+    let mut pair = Vec::new(); // template pieces for a sequence pair
+    let mut special_tokens = Vec::new(); // (token, id) entries for the tokens named in the templates
+
+    if add_bos_token {
+        if let Some(bos) = bos_token {
+            let bos_token_id = tokenizer
+                .token_to_id(bos.as_str())
+                .expect("Should have found the bos token id");
+            special_tokens.push((bos.as_str(), bos_token_id));
+            single.push(format!("{}:0", bos.as_str()));
+            pair.push(format!("{}:0", bos.as_str()));
+        }
+    }
+
+    single.push("$A:0".to_string()); // placeholder for the first sequence
+    pair.push("$A:0".to_string());
+
+    if add_eos_token {
+        if let Some(eos) = eos_token {
+            let eos_token_id = tokenizer
+                .token_to_id(eos.as_str())
+                .expect("Should have found the eos token id");
+            special_tokens.push((eos.as_str(), eos_token_id));
+            single.push(format!("{}:0", eos.as_str()));
+            pair.push(format!("{}:0", eos.as_str()));
+        }
+    }
+
+    if add_bos_token { // second half of the pair template; the token id was already registered above
+        if let Some(bos) = bos_token {
+            pair.push(format!("{}:1", bos.as_str()));
+        }
+    }
+
+    pair.push("$B:1".to_string()); // placeholder for the second sequence
+
+    if add_eos_token { // closes the second half of the pair template
+        if let Some(eos) = eos_token {
+            pair.push(format!("{}:1", eos.as_str()));
+        }
+    }
+
+    let post_processor = TemplateProcessing::builder()
+        .try_single(single)?
+        .try_pair(pair)?
+        .special_tokens(special_tokens)
+        .build()?;
+
+    Ok(post_processor)
+}
+
 #[derive(Debug, Error)]
 enum RouterError {
     #[error("Argument validation error: {0}")]
     ArgumentValidation(String),
-    #[error("Unable to connect to the Python model shards: {0}")]
-    Connection(ClientError),
-    #[error("Unable to clear the Python model shards cache: {0}")]
-    Cache(ClientError),
-    #[error("Unable to get the Python model shards info: {0}")]
-    Info(ClientError),
-    #[error("Unable to warmup the Python model shards: {0}")]
-    Warmup(ClientError),
+    #[error("WebServer error: {0}")]
+    WebServer(#[from] server::WebServerError),
     #[error("Tokio runtime failed to start: {0}")]
     Tokio(#[from] std::io::Error),
-    #[error("Axum webserver failed: {0}")]
-    Axum(#[from] axum::BoxError),
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use text_generation_router::TokenizerConfigToken;
+
+    #[test]
+    fn test_create_post_processor() { // default flags: bos is inserted into the templates, eos is not
+        let tokenizer_config = HubTokenizerConfig {
+            add_bos_token: None, // `create_post_processor` treats None as true
+            add_eos_token: None, // `create_post_processor` treats None as false
+            bos_token: Some(TokenizerConfigToken::String("<s>".to_string())),
+            eos_token: Some(TokenizerConfigToken::String("</s>".to_string())),
+            chat_template: None,
+            tokenizer_class: None,
+            completion_template: None,
+        };
+
+        let tokenizer =
+            Tokenizer::from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", None).unwrap(); // NOTE(review): presumably fetched from the Hub, so this test needs network access — confirm
+        let post_processor = create_post_processor(&tokenizer, &tokenizer_config).unwrap();
+
+        let expected = TemplateProcessing::builder()
+            .try_single("<s>:0 $A:0")
+            .unwrap()
+            .try_pair("<s>:0 $A:0 <s>:1 $B:1")
+            .unwrap()
+            .special_tokens(vec![("<s>".to_string(), 1)]) // expects `<s>` to map to id 1 in this tokenizer
+            .build()
+            .unwrap();
+
+        assert_eq!(post_processor, expected);
+    }
 }
diff --git a/router/src/server.rs b/router/src/server.rs
index 91164098..db8b16ad 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1,29 +1,53 @@
 /// HTTP Server logic
-use crate::health::Health;
-use crate::infer::{InferError, InferResponse, InferStreamResponse};
+use crate::config::Config;
+use crate::infer::v2::SchedulerV2;
+use crate::infer::v3::SchedulerV3;
+use crate::infer::{HealthCheck, Scheduler};
+use crate::infer::{Infer, InferError, InferResponse, InferStreamResponse, ToolGrammar};
+#[cfg(feature = "kserve")]
+use crate::kserve::{
+    kerve_server_metadata, kserve_health_live, kserve_health_ready, kserve_model_infer,
+    kserve_model_metadata, kserve_model_metadata_ready,
+};
 use crate::validation::ValidationError;
 use crate::{
-    BestOfSequence, CompatGenerateRequest, Details, ErrorResponse, FinishReason,
-    GenerateParameters, GenerateRequest, GenerateResponse, HubModelInfo, Infer, Info, PrefillToken,
-    StreamDetails, StreamResponse, Token, Validation,
+    BestOfSequence, Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest,
+    GenerateResponse, GrammarType, HubModelInfo, HubProcessorConfig, HubTokenizerConfig, Info,
+    Message, PrefillToken, SimpleToken, StreamDetails, StreamResponse, Token, TokenizeResponse,
+    Usage, Validation,
 };
+use crate::{
+    ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
+    ChatCompletionDelta, ChatCompletionLogprob, ChatCompletionLogprobs, ChatCompletionTopLogprob,
+    ChatRequest, Chunk, CompatGenerateRequest, Completion, CompletionComplete, CompletionFinal,
+    CompletionRequest, CompletionType, DeltaToolCall, Function, Prompt, Tool, VertexRequest,
+    VertexResponse,
+};
+use crate::{FunctionDefinition, HubPreprocessorConfig, ToolCall, ToolType};
+use async_stream::__private::AsyncStream;
 use axum::extract::Extension;
 use axum::http::{HeaderMap, Method, StatusCode};
 use axum::response::sse::{Event, KeepAlive, Sse};
 use axum::response::{IntoResponse, Response};
 use axum::routing::{get, post};
 use axum::{http, Json, Router};
-use axum_tracing_opentelemetry::opentelemetry_tracing_layer;
+use axum_tracing_opentelemetry::middleware::OtelAxumLayer;
 use futures::stream::StreamExt;
+use futures::stream::{FuturesOrdered, FuturesUnordered};
 use futures::Stream;
+use futures::TryStreamExt;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
+use serde_json::Value;
 use std::convert::Infallible;
 use std::net::SocketAddr;
 use std::sync::atomic::AtomicBool;
 use std::sync::Arc;
-use text_generation_client::{ShardInfo, ShardedClient};
+use text_generation_client::{v2, v3, ClientError, ShardInfo};
+use thiserror::Error;
 use tokenizers::Tokenizer;
+use tokio::select;
 use tokio::signal;
+use tokio::sync::oneshot;
 use tokio::time::Instant;
 use tower_http::cors::{AllowOrigin, CorsLayer};
 use tracing::{info_span, instrument, Instrument};
@@ -56,6 +80,7 @@ example = json ! ({"error": "Incomplete generation"})),
 async fn compat_generate(
     Extension(default_return_full_text): Extension<bool>,
     infer: Extension<Infer>,
+    compute_type: Extension<ComputeType>,
     Json(mut req): Json<CompatGenerateRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
     // default return_full_text given the pipeline_tag
@@ -65,11 +90,11 @@ async fn compat_generate(
 
     // switch on stream
     if req.stream {
-        Ok(generate_stream(infer, Json(req.into()))
+        Ok(generate_stream(infer, compute_type, Json(req.into()))
             .await
             .into_response())
     } else {
-        let (headers, Json(generation)) = generate(infer, Json(req.into())).await?;
+        let (headers, Json(generation)) = generate(infer, compute_type, Json(req.into())).await?;
         // wrap generation inside a Vec to match api-inference
         Ok((headers, Json(vec![generation])).into_response())
     }
@@ -99,7 +124,9 @@ example = json ! ({"error": "unhealthy", "error_type": "healthcheck"})),
 )]
 #[instrument(skip(health))]
 /// Health check method
-async fn health(mut health: Extension<Health>) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
+async fn health(
+    mut health: Extension<HealthCheck>,
+) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
     match health.check().await {
         true => Ok(()),
         false => Err((
@@ -144,13 +171,24 @@ seed,
 )]
 async fn generate(
     infer: Extension<Infer>,
+    Extension(ComputeType(compute_type)): Extension<ComputeType>,
     Json(req): Json<GenerateRequest>,
 ) -> Result<(HeaderMap, Json<GenerateResponse>), (StatusCode, Json<ErrorResponse>)> {
     let span = tracing::Span::current();
+    generate_internal(infer, ComputeType(compute_type), Json(req), span).await
+}
+
+pub(crate) async fn generate_internal(
+    infer: Extension<Infer>,
+    ComputeType(compute_type): ComputeType,
+    Json(req): Json<GenerateRequest>,
+    span: tracing::Span,
+) -> Result<(HeaderMap, Json<GenerateResponse>), (StatusCode, Json<ErrorResponse>)> {
     let start_time = Instant::now();
     metrics::increment_counter!("tgi_request_count");
 
-    tracing::debug!("Input: {}", req.inputs);
+    // Do not log ultra long inputs, like image payloads; truncate by chars so we never split UTF-8.
+    tracing::debug!("Input: {}", req.inputs.chars().take(1000).collect::<String>());
 
     let compute_characters = req.inputs.chars().count();
     let mut add_prompt = None;
@@ -170,6 +208,7 @@ async fn generate(
     };
 
     // Token details
+    let input_length = response._input_length;
     let details = match details {
         true => {
             // convert best_of_responses
@@ -185,9 +224,7 @@ async fn generate(
 
                         BestOfSequence {
                             generated_text: output_text,
-                            finish_reason: FinishReason::from(
-                                response.generated_text.finish_reason,
-                            ),
+                            finish_reason: response.generated_text.finish_reason,
                             generated_tokens: response.generated_text.generated_tokens,
                             prefill: response.prefill,
                             tokens: response.tokens,
@@ -199,7 +236,7 @@ async fn generate(
             });
 
             Some(Details {
-                finish_reason: FinishReason::from(response.generated_text.finish_reason),
+                finish_reason: response.generated_text.finish_reason,
                 generated_tokens: response.generated_text.generated_tokens,
                 prefill: response.prefill,
                 tokens: response.tokens,
@@ -228,10 +265,10 @@ async fn generate(
 
     // Headers
     let mut headers = HeaderMap::new();
-    headers.insert("x-compute-type", "gpu+optimized".parse().unwrap());
+    headers.insert("x-compute-type", compute_type.parse().unwrap());
     headers.insert(
         "x-compute-time",
-        total_time.as_millis().to_string().parse().unwrap(),
+        total_time.as_secs_f64().to_string().parse().unwrap(),
     );
     headers.insert(
         "x-compute-characters",
@@ -257,6 +294,11 @@ async fn generate(
         "x-time-per-token",
         time_per_token.as_millis().to_string().parse().unwrap(),
     );
+    headers.insert("x-prompt-tokens", input_length.into());
+    headers.insert(
+        "x-generated-tokens",
+        response.generated_text.generated_tokens.into(),
+    );
 
     // Metrics
     metrics::increment_counter!("tgi_request_success");
@@ -332,12 +374,30 @@ seed,
 )]
 async fn generate_stream(
     Extension(infer): Extension<Infer>,
+    Extension(compute_type): Extension<ComputeType>,
     Json(req): Json<GenerateRequest>,
 ) -> (
     HeaderMap,
     Sse<impl Stream<Item = Result<Event, Infallible>>>,
 ) {
     let span = tracing::Span::current();
+    let on_message_callback = |stream_token: StreamResponse| {
+        let event = Event::default();
+        event.json_data(stream_token).unwrap()
+    };
+    let (headers, response_stream) =
+        generate_stream_internal(infer, compute_type, Json(req), on_message_callback, span).await;
+    let sse = Sse::new(response_stream).keep_alive(KeepAlive::default());
+    (headers, sse)
+}
+
+async fn generate_stream_internal(
+    infer: Infer,
+    ComputeType(compute_type): ComputeType,
+    Json(req): Json<GenerateRequest>,
+    on_message_callback: impl Fn(StreamResponse) -> Event,
+    span: tracing::Span,
+) -> (HeaderMap, impl Stream<Item = Result<Event, Infallible>>) {
     let start_time = Instant::now();
     metrics::increment_counter!("tgi_request_count");
 
@@ -346,7 +406,7 @@ async fn generate_stream(
     let compute_characters = req.inputs.chars().count();
 
     let mut headers = HeaderMap::new();
-    headers.insert("x-compute-type", "gpu+optimized".parse().unwrap());
+    headers.insert("x-compute-type", compute_type.parse().unwrap());
     headers.insert(
         "x-compute-characters",
         compute_characters.to_string().parse().unwrap(),
@@ -378,9 +438,11 @@ async fn generate_stream(
         } else {
             match infer.generate_stream(req).instrument(info_span!(parent: &span, "async_stream")).await {
                 // Keep permit as long as generate_stream lives
-                Ok((_permit, mut response_stream)) => {
+                Ok((_permit, _input_length, mut response_stream)) => {
+                    let mut index = 0;
                     // Server-Sent Event stream
                     while let Some(response) = response_stream.next().await {
+                        index += 1;
                         match response {
                             Ok(response) => {
                                 match response {
@@ -395,13 +457,14 @@ async fn generate_stream(
 
                                         // StreamResponse
                                         let stream_token = StreamResponse {
+                                            index,
                                             token,
-                                            top_tokens: top_tokens,
+                                            top_tokens,
                                             generated_text: None,
                                             details: None,
                                         };
-
-                                        yield Ok(Event::default().json_data(stream_token).unwrap())
+                                        let event = on_message_callback(stream_token);
+                                        yield Ok(event);
                                     }
                                     // Yield event for last token and compute timings
                                     InferStreamResponse::End {
@@ -414,7 +477,7 @@ async fn generate_stream(
                                         // Token details
                                         let details = match details {
                                             true => Some(StreamDetails {
-                                                finish_reason: FinishReason::from(generated_text.finish_reason),
+                                                finish_reason: generated_text.finish_reason,
                                                 generated_tokens: generated_text.generated_tokens,
                                                 seed: generated_text.seed,
                                             }),
@@ -457,13 +520,16 @@ async fn generate_stream(
                                         tracing::info!(parent: &span, "Success");
 
                                         let stream_token = StreamResponse {
+                                            index,
                                             token,
-                                            top_tokens: top_tokens,
+                                            top_tokens,
                                             generated_text: Some(output_text),
                                             details
                                         };
 
-                                        yield Ok(Event::default().json_data(stream_token).unwrap());
+
+                                        let event = on_message_callback(stream_token);
+                                        yield Ok(event);
                                         break;
                                     }
                                 }
@@ -494,45 +560,878 @@ async fn generate_stream(
         }
     };
 
-    (headers, Sse::new(stream).keep_alive(KeepAlive::default()))
+    (headers, stream)
+}
+
+/// Generate tokens
+#[utoipa::path(
+post,
+tag = "Text Generation Inference",
+path = "/v1/completions",
+request_body = CompletionRequest,
+responses(
+(status = 200, description = "Generated Chat Completion",
+content(
+("application/json" = Completion),
+("text/event-stream" = CompletionCompleteChunk),
+)),
+(status = 424, description = "Generation Error", body = ErrorResponse,
+example = json ! ({"error": "Request failed during generation"})),
+(status = 429, description = "Model is overloaded", body = ErrorResponse,
+example = json ! ({"error": "Model is overloaded"})),
+(status = 422, description = "Input validation error", body = ErrorResponse,
+example = json ! ({"error": "Input validation error"})),
+(status = 500, description = "Incomplete generation", body = ErrorResponse,
+example = json ! ({"error": "Incomplete generation"})),
+)
+)]
+#[instrument(
+skip_all,
+fields(
+// parameters = ? req.parameters,
+total_time,
+validation_time,
+queue_time,
+inference_time,
+time_per_token,
+seed,
+)
+)]
+async fn completions(
+    Extension(infer): Extension<Infer>,
+    Extension(compute_type): Extension<ComputeType>,
+    Extension(info): Extension<Info>,
+    Json(req): Json<CompletionRequest>,
+) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
+    let span = tracing::Span::current();
+    metrics::increment_counter!("tgi_request_count");
+
+    let CompletionRequest {
+        max_tokens,
+        seed,
+        stop,
+        stream,
+        temperature,
+        ..
+    } = req;
+
+    let max_new_tokens = max_tokens.or(Some(100));
+    let stop = stop.unwrap_or_default();
+    // enable greedy only when temperature is 0
+    let (do_sample, temperature) = match temperature {
+        Some(temperature) if temperature == 0.0 => (false, None),
+        other => (true, other),
+    };
+
+    // if suffix is present throw an error
+    if req.suffix.is_some() {
+        metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+        return Err((
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(ErrorResponse {
+                error: "Suffix is not supported and can be achieved by preprocessing the prompt."
+                    .to_string(),
+                error_type: "suffix not supported".to_string(),
+            }),
+        ));
+    }
+
+    if req.prompt.0.len() > info.max_client_batch_size {
+        metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+        return Err((
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(ErrorResponse {
+                error: format!(
+                    "Number of prompts exceeds the maximum allowed batch size of {}",
+                    info.max_client_batch_size
+                ),
+                error_type: "batch size exceeded".to_string(),
+            }),
+        ));
+    }
+
+    let generate_requests: Vec<GenerateRequest> = req
+        .prompt
+        .0
+        .iter()
+        .map(|prompt| GenerateRequest {
+            inputs: prompt.to_string(),
+            parameters: GenerateParameters {
+                best_of: None,
+                temperature,
+                repetition_penalty: req.repetition_penalty,
+                frequency_penalty: req.frequency_penalty,
+                top_k: None,
+                top_p: req.top_p,
+                typical_p: None,
+                do_sample,
+                max_new_tokens,
+                return_full_text: None,
+                stop: stop.clone(),
+                truncate: None,
+                watermark: false,
+                details: true,
+                decoder_input_details: !stream,
+                seed,
+                top_n_tokens: None,
+                grammar: None,
+                ..Default::default()
+            },
+        })
+        .collect();
+
+    let mut x_compute_type = None;
+    let mut x_compute_characters = 0u32;
+    let mut x_accel_buffering = None;
+
+    if stream {
+        let mut response_streams = FuturesOrdered::new();
+        for (index, generate_request) in generate_requests.into_iter().enumerate() {
+            let model_id = info.model_id.clone();
+            let system_fingerprint =
+                format!("{}-{}", info.version, info.docker_label.unwrap_or("native"));
+            let infer_clone = infer.clone();
+            let compute_type_clone = compute_type.clone();
+            let span_clone = span.clone();
+
+            // Create a future for each generate_stream_internal call.
+            let generate_future = async move {
+                let on_message_callback = move |stream_token: StreamResponse| {
+                    let event = Event::default();
+
+                    let current_time = std::time::SystemTime::now()
+                        .duration_since(std::time::UNIX_EPOCH)
+                        .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+                        .as_secs();
+
+                    event
+                        .json_data(Completion::Chunk(Chunk {
+                            id: "".to_string(),
+                            created: current_time,
+
+                            choices: vec![CompletionComplete {
+                                finish_reason: "".to_string(),
+                                index: index as u32,
+                                logprobs: None,
+                                text: stream_token.token.text,
+                            }],
+
+                            model: model_id.clone(),
+                            system_fingerprint: system_fingerprint.clone(),
+                        }))
+                        .unwrap_or_else(|_e| Event::default())
+                };
+
+                let (header_tx, header_rx) = oneshot::channel();
+                let (sse_tx, sse_rx) = tokio::sync::mpsc::unbounded_channel();
+
+                tokio::spawn(async move {
+                    let (header_map, sse) = generate_stream_internal(
+                        infer_clone.clone(),
+                        compute_type_clone.clone(),
+                        Json(generate_request),
+                        on_message_callback,
+                        span_clone.clone(),
+                    )
+                    .await;
+
+                    // send the headers and don't wait for a response
+                    let _ = header_tx.send(header_map);
+
+                    // pin the stream and emit its messages to the sse_tx
+                    let mut sse = Box::pin(sse);
+                    while let Some(event) = sse.next().await {
+                        if sse_tx.send(event).is_err() {
+                            tracing::error!("Failed to send event. Receiver dropped.");
+                            break;
+                        }
+                    }
+                });
+
+                (header_rx, sse_rx)
+            };
+            response_streams.push_back(generate_future);
+        }
+
+        let mut all_rxs = vec![];
+
+        while let Some((header_rx, sse_rx)) = response_streams.next().await {
+            all_rxs.push(sse_rx);
+
+            // get the headers from the first response of each stream
+            let headers = header_rx.await.map_err(|e| {
+                tracing::error!("Failed to get headers: {:?}", e);
+                (
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    Json(ErrorResponse {
+                        error: "Failed to get headers".to_string(),
+                        error_type: "headers".to_string(),
+                    }),
+                )
+            })?;
+            if x_compute_type.is_none() {
+                x_compute_type = headers
+                    .get("x-compute-type")
+                    .and_then(|v| v.to_str().ok())
+                    .map(|v| v.to_string());
+
+                x_accel_buffering = headers
+                    .get("x-accel-buffering")
+                    .and_then(|v| v.to_str().ok())
+                    .map(|v| v.to_string());
+            }
+            x_compute_characters += headers
+                .get("x-compute-characters")
+                .and_then(|v| v.to_str().ok())
+                .and_then(|v| v.parse().ok())
+                .unwrap_or(0);
+        }
+
+        let mut headers = HeaderMap::new();
+        if let Some(x_compute_type) = x_compute_type {
+            headers.insert("x-compute-type", x_compute_type.parse().unwrap());
+        }
+        headers.insert("x-compute-characters", x_compute_characters.into());
+        if let Some(x_accel_buffering) = x_accel_buffering {
+            headers.insert("x-accel-buffering", x_accel_buffering.parse().unwrap());
+        }
+
+        // now sink the sse streams into a single stream and remove the ones that are done
+        let stream: AsyncStream<Result<Event, Infallible>, _> = async_stream::stream! {
+            loop {
+                let mut i = 0;
+                while i < all_rxs.len() {
+                    let rx = &mut all_rxs[i];
+                    select! {
+                        Some(event) = rx.recv() => {
+                            yield event;
+                        }
+                        else => {
+                            all_rxs.remove(i);
+                            continue; // skip the increment to handle the next element at the same index
+                        }
+                    }
+                    i += 1; // only increment when no element was removed
+                }
+
+                if all_rxs.is_empty() {
+                    break;
+                }
+            }
+        };
+
+        let sse = Sse::new(stream).keep_alive(KeepAlive::default());
+        Ok((headers, sse).into_response())
+    } else {
+        let current_time = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+            .as_secs();
+
+        let responses = FuturesUnordered::new();
+        for (index, generate_request) in generate_requests.into_iter().enumerate() {
+            let infer_clone = infer.clone();
+            let compute_type_clone = compute_type.clone();
+            let span_clone = span.clone();
+            let response_future = async move {
+                let result = generate_internal(
+                    Extension(infer_clone),
+                    compute_type_clone,
+                    Json(generate_request),
+                    span_clone,
+                )
+                .await;
+                result.map(|(headers, generation)| (index, headers, generation))
+            };
+            responses.push(response_future);
+        }
+        let generate_responses = responses.try_collect::<Vec<_>>().await?;
+
+        let mut prompt_tokens = 0u32;
+        let mut completion_tokens = 0u32;
+        let mut total_tokens = 0u32;
+
+        let mut x_compute_time = 0u32;
+        let mut x_total_time = 0u32;
+        let mut x_validation_time = 0u32;
+        let mut x_queue_time = 0u32;
+        let mut x_inference_time = 0u32;
+        let mut x_time_per_token = 0u32;
+        let mut x_prompt_tokens = 0u32;
+        let mut x_generated_tokens = 0u32;
+
+        let choices = generate_responses
+            .into_iter()
+            .map(|(index, headers, Json(generation))| {
+                let details = generation.details.ok_or((
+                    // this should never happen but handle if details are missing unexpectedly
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    Json(ErrorResponse {
+                        error: "No details in generation".to_string(),
+                        error_type: "no details".to_string(),
+                    }),
+                ))?;
+
+                if x_compute_type.is_none() {
+                    x_compute_type = headers
+                        .get("x-compute-type")
+                        .and_then(|v| v.to_str().ok())
+                        .map(|v| v.to_string());
+                }
+
+                // accumulate headers and usage from each response
+                x_compute_time += headers
+                    .get("x-compute-time")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+                x_compute_characters += headers
+                    .get("x-compute-characters")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+                x_total_time += headers
+                    .get("x-total-time")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+                x_validation_time += headers
+                    .get("x-validation-time")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+                x_queue_time += headers
+                    .get("x-queue-time")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+                x_inference_time += headers
+                    .get("x-inference-time")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+                x_time_per_token += headers
+                    .get("x-time-per-token")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+                x_prompt_tokens += headers
+                    .get("x-prompt-tokens")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+                x_generated_tokens += headers
+                    .get("x-generated-tokens")
+                    .and_then(|v| v.to_str().ok()?.parse().ok())
+                    .unwrap_or(0);
+
+                prompt_tokens += details.prefill.len() as u32;
+                completion_tokens += details.generated_tokens;
+                total_tokens += details.prefill.len() as u32 + details.generated_tokens;
+
+                Ok(CompletionComplete {
+                    finish_reason: details.finish_reason.to_string(),
+                    index: index as u32,
+                    logprobs: None,
+                    text: generation.generated_text,
+                })
+            })
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(|(status, Json(err))| (status, Json(err)))?;
+
+        let response = Completion::Final(CompletionFinal {
+            id: "".to_string(),
+            created: current_time,
+            model: info.model_id.clone(),
+            system_fingerprint: format!(
+                "{}-{}",
+                info.version,
+                info.docker_label.unwrap_or("native")
+            ),
+            choices,
+            usage: Usage {
+                prompt_tokens,
+                completion_tokens,
+                total_tokens,
+            },
+        });
+
+        // headers similar to `generate` but aggregated
+        let mut headers = HeaderMap::new();
+        if let Some(x_compute_type) = x_compute_type {
+            headers.insert("x-compute-type", x_compute_type.parse().unwrap());
+        }
+        headers.insert("x-compute-characters", x_compute_characters.into());
+        headers.insert("x-total-time", x_total_time.into());
+        headers.insert("x-validation-time", x_validation_time.into());
+        headers.insert("x-queue-time", x_queue_time.into());
+        headers.insert("x-inference-time", x_inference_time.into());
+        headers.insert("x-time-per-token", x_time_per_token.into());
+        headers.insert("x-prompt-tokens", x_prompt_tokens.into());
+        headers.insert("x-generated-tokens", x_generated_tokens.into());
+        if let Some(x_accel_buffering) = x_accel_buffering {
+            headers.insert("x-accel-buffering", x_accel_buffering.parse().unwrap());
+        }
+        Ok((headers, Json(response)).into_response())
+    }
+}
+
+/// Generate tokens
+#[utoipa::path(
+post,
+tag = "Text Generation Inference",
+path = "/v1/chat/completions",
+request_body = ChatRequest,
+responses(
+(status = 200, description = "Generated Chat Completion",
+content(
+("application/json" = ChatCompletion),
+("text/event-stream" = ChatCompletionChunk),
+)),
+(status = 424, description = "Generation Error", body = ErrorResponse,
+example = json ! ({"error": "Request failed during generation"})),
+(status = 429, description = "Model is overloaded", body = ErrorResponse,
+example = json ! ({"error": "Model is overloaded"})),
+(status = 422, description = "Input validation error", body = ErrorResponse,
+example = json ! ({"error": "Input validation error"})),
+(status = 500, description = "Incomplete generation", body = ErrorResponse,
+example = json ! ({"error": "Incomplete generation"})),
+)
+)]
+#[instrument(
+skip_all,
+fields(
+// parameters = ? req.parameters,
+total_time,
+validation_time,
+queue_time,
+inference_time,
+time_per_token,
+seed,
+)
+)]
+async fn chat_completions(
+    Extension(infer): Extension<Infer>,
+    Extension(compute_type): Extension<ComputeType>,
+    Extension(info): Extension<Info>,
+    Json(req): Json<ChatRequest>,
+) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
+    let span = tracing::Span::current();
+    metrics::increment_counter!("tgi_request_count");
+    let ChatRequest {
+        logprobs,
+        max_tokens,
+        messages,
+        presence_penalty,
+        seed,
+        stop,
+        stream,
+        tools,
+        tool_choice,
+        tool_prompt,
+        temperature,
+        response_format,
+        ..
+    } = req;
+
+    let repetition_penalty = presence_penalty.map(|x| x + 2.0);
+    let max_new_tokens = max_tokens.or(Some(100));
+    let logprobs = logprobs.unwrap_or(false);
+    let tool_prompt = tool_prompt.unwrap_or_default();
+    let stop = stop.unwrap_or_default();
+    // enable greedy only when temperature is 0
+    let (do_sample, temperature) = match temperature {
+        Some(temperature) if temperature == 0.0 => (false, None),
+        other => (true, other),
+    };
+
+    // response_format and tools are mutually exclusive
+    if response_format.is_some() && tools.as_ref().is_some() {
+        metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+        return Err((
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(ErrorResponse {
+                error: "Grammar and tools are mutually exclusive".to_string(),
+                error_type: "grammar and tools".to_string(),
+            }),
+        ));
+    }
+
+    // extract tool grammar if present
+    let tool_grammar = match ToolGrammar::apply(tools, tool_choice) {
+        Ok(grammar) => grammar,
+        Err(err) => {
+            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+            tracing::error!("{err}");
+            return Err((
+                StatusCode::UNPROCESSABLE_ENTITY,
+                Json(ErrorResponse {
+                    error: err.to_string(),
+                    error_type: err.error_type().to_string(),
+                }),
+            ));
+        }
+    };
+
+    // determine the appropriate arguments for apply_chat_template
+    let tools_grammar_prompt = tool_grammar
+        .as_ref()
+        .map(|t| (GrammarType::Json(serde_json::json!(t)), tool_prompt));
+
+    let (tools_grammar_prompt, grammar) = match response_format {
+        Some(response_format) => (None, Some(response_format)),
+        None => (
+            tools_grammar_prompt.clone(),
+            tools_grammar_prompt.map(|(grammar, _)| grammar.clone()),
+        ),
+    };
+
+    // apply chat template to flatten the request into a single input
+    let inputs = match infer.apply_chat_template(messages, tools_grammar_prompt) {
+        Ok(inputs) => inputs,
+        Err(err) => {
+            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+            tracing::error!("{err}");
+            return Err((
+                StatusCode::UNPROCESSABLE_ENTITY,
+                Json(ErrorResponse {
+                    error: err.to_string(),
+                    error_type: err.error_type().to_string(),
+                }),
+            ));
+        }
+    };
+
+    // build the request passing some parameters
+    let generate_request = GenerateRequest {
+        inputs: inputs.to_string(),
+        parameters: GenerateParameters {
+            best_of: None,
+            temperature,
+            repetition_penalty,
+            frequency_penalty: req.frequency_penalty,
+            top_k: None,
+            top_p: req.top_p,
+            typical_p: None,
+            do_sample,
+            max_new_tokens,
+            return_full_text: None,
+            stop,
+            truncate: None,
+            watermark: false,
+            details: true,
+            decoder_input_details: !stream,
+            seed,
+            top_n_tokens: req.top_logprobs,
+            grammar,
+            ..Default::default()
+        },
+    };
+
+    // static values that will be returned in all cases
+    let model_id = info.model_id.clone();
+    let system_fingerprint = format!("{}-{}", info.version, info.docker_label.unwrap_or("native"));
+
+    // switch on stream
+    if stream {
+        // pass this callback to the stream generation and build the required event structure
+        let on_message_callback = move |stream_token: StreamResponse| {
+            let event = Event::default();
+
+            let current_time = std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+                .as_secs();
+
+            let logprobs = logprobs.then(|| {
+                ChatCompletionLogprobs::from((stream_token.token.clone(), stream_token.top_tokens))
+            });
+
+            // replace the content with the tool calls if grammar is present
+            let (content, tool_calls) = if tool_grammar.is_some() {
+                (None, Some(vec![stream_token.token.text]))
+            } else {
+                let content = if !stream_token.token.special {
+                    Some(stream_token.token.text)
+                } else {
+                    None
+                };
+
+                (content, None)
+            };
+
+            event
+                .json_data(CompletionType::ChatCompletionChunk(
+                    ChatCompletionChunk::new(
+                        model_id.clone(),
+                        system_fingerprint.clone(),
+                        content,
+                        tool_calls,
+                        current_time,
+                        logprobs,
+                        stream_token.details.map(|d| d.finish_reason.to_string()),
+                    ),
+                ))
+                .unwrap_or_else(|e| {
+                    println!("Failed to serialize ChatCompletionChunk: {:?}", e);
+                    Event::default()
+                })
+        };
+
+        let (headers, response_stream) = generate_stream_internal(
+            infer,
+            compute_type,
+            Json(generate_request),
+            on_message_callback,
+            span,
+        )
+        .await;
+        let sse = Sse::new(response_stream).keep_alive(KeepAlive::default());
+        Ok((headers, sse).into_response())
+    } else {
+        let (headers, Json(generation)) =
+            generate_internal(Extension(infer), compute_type, Json(generate_request), span).await?;
+
+        let current_time = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap_or_else(|_| std::time::Duration::from_secs(0))
+            .as_secs();
+
+        let (tool_calls, output) = if tool_grammar.is_some() {
+            // gen_text should be valid json
+            let gen_text_value: Value =
+                serde_json::from_str(&generation.generated_text).map_err(|e| {
+                    (
+                        StatusCode::UNPROCESSABLE_ENTITY,
+                        Json(ErrorResponse {
+                            error: e.to_string(),
+                            error_type: "Input validation error".to_string(),
+                        }),
+                    )
+                })?;
+            let tool_calls = vec![ToolCall {
+                id: "0".to_string(),
+                r#type: "function".to_string(),
+                function: FunctionDefinition {
+                    description: None,
+                    name: gen_text_value
+                        .get("function")
+                        .and_then(|f| f.get("_name"))
+                        .and_then(|name| name.as_str())
+                        .unwrap_or("default_function_name")
+                        .to_string(),
+                    // Serialize the JSON object obtained from "function" to an escaped JSON string
+                    arguments: gen_text_value
+                        .get("function")
+                        .map(|f| {
+                            let mut f_cloned = f.clone();
+                            if let Value::Object(ref mut props) = f_cloned {
+                                props.remove("_name");
+                            }
+                            f_cloned
+                        })
+                        .unwrap_or_default(),
+                },
+            }];
+            (Some(tool_calls), None)
+        } else {
+            (None, Some(generation.generated_text))
+        };
+        // build the complete response object with the full text
+        let response = CompletionType::ChatCompletion(ChatCompletion::new(
+            model_id,
+            system_fingerprint,
+            output,
+            current_time,
+            generation.details.unwrap(),
+            logprobs,
+            tool_calls,
+        ));
+
+        // wrap generation inside a Vec to match api-inference
+        Ok((headers, Json(response)).into_response())
+    }
+}
+
+/// Generate tokens from Vertex request
+#[utoipa::path(
+post,
+tag = "Text Generation Inference",
+path = "/vertex",
+request_body = VertexRequest,
+responses(
+(status = 200, description = "Generated Text", body = VertexResponse),
+(status = 424, description = "Generation Error", body = ErrorResponse,
+example = json ! ({"error": "Request failed during generation"})),
+(status = 429, description = "Model is overloaded", body = ErrorResponse,
+example = json ! ({"error": "Model is overloaded"})),
+(status = 422, description = "Input validation error", body = ErrorResponse,
+example = json ! ({"error": "Input validation error"})),
+(status = 500, description = "Incomplete generation", body = ErrorResponse,
+example = json ! ({"error": "Incomplete generation"})),
+)
+)]
+#[instrument(
+    skip_all,
+    fields(
+        total_time,
+        validation_time,
+        queue_time,
+        inference_time,
+        time_per_token,
+        seed,
+    )
+)]
+async fn vertex_compatibility(
+    Extension(infer): Extension<Infer>,
+    Extension(compute_type): Extension<ComputeType>,
+    Json(req): Json<VertexRequest>,
+) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
+    // Implementation notes are kept as plain `//` comments on purpose: `///` doc
+    // comments on a `#[utoipa::path]` handler end up in the generated OpenAPI text.
+    //
+    // Runs one generation per entry of `req.instances` and returns the generated
+    // texts as `VertexResponse::predictions`, in the same order as the instances.
+    //
+    // Errors:
+    // - 422 when `req.instances` is empty.
+    // - 500 "Incomplete generation" when any single generation fails; the
+    //   underlying error from `generate_internal` is discarded.
+    let span = tracing::Span::current();
+    metrics::increment_counter!("tgi_request_count");
+
+    // check that there's at least one instance
+    if req.instances.is_empty() {
+        return Err((
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(ErrorResponse {
+                error: "Input validation error".to_string(),
+                error_type: "Input validation error".to_string(),
+            }),
+        ));
+    }
+
+    // Process all instances
+    let predictions = req
+        .instances
+        .iter()
+        .map(|instance| {
+            // Only `max_new_tokens` and `seed` are forwarded from the instance
+            // parameters; everything else uses the defaults, with sampling forced on.
+            let generate_request = GenerateRequest {
+                inputs: instance.inputs.clone(),
+                parameters: GenerateParameters {
+                    do_sample: true,
+                    max_new_tokens: instance.parameters.as_ref().and_then(|p| p.max_new_tokens),
+                    seed: instance.parameters.as_ref().and_then(|p| p.seed),
+                    details: true,
+                    decoder_input_details: true,
+                    ..Default::default()
+                },
+            };
+
+            async {
+                generate_internal(
+                    Extension(infer.clone()),
+                    compute_type.clone(),
+                    Json(generate_request),
+                    span.clone(),
+                )
+                .await
+                .map(|(_, Json(generation))| generation.generated_text)
+                .map_err(|_| {
+                    (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        Json(ErrorResponse {
+                            error: "Incomplete generation".into(),
+                            error_type: "Incomplete generation".into(),
+                        }),
+                    )
+                })
+            }
+        })
+        // `FuturesOrdered` drives the requests concurrently like `FuturesUnordered`,
+        // but yields results in submission order, so `predictions[i]` always
+        // corresponds to `instances[i]` regardless of which generation ends first.
+        .collect::<futures::stream::FuturesOrdered<_>>()
+        .try_collect::<Vec<_>>()
+        .await?;
+
+    let response = VertexResponse { predictions };
+    Ok((HeaderMap::new(), Json(response)).into_response())
+}
+
+/// Tokenize inputs
+#[utoipa::path(
+post,
+tag = "Text Generation Inference",
+path = "/tokenize",
+request_body = GenerateRequest,
+responses(
+(status = 200, description = "Tokenized ids", body = TokenizeResponse),
+(status = 404, description = "No tokenizer found", body = ErrorResponse,
+example = json ! ({"error": "No fast tokenizer available"})),
+)
+)]
+#[instrument(skip_all)]
+async fn tokenize(
+    Extension(infer): Extension<Infer>,
+    Json(req): Json<GenerateRequest>,
+) -> Result<Json<TokenizeResponse>, (StatusCode, Json<ErrorResponse>)> {
+    // Implementation notes are kept as plain `//` comments on purpose: `///` doc
+    // comments on a `#[utoipa::path]` handler end up in the generated OpenAPI text.
+    //
+    // Returns one `SimpleToken` (id, text, start, stop) per token of the encoding
+    // produced by `infer.tokenize`, or 404 when that call yields no encoding.
+    // Errors from `infer.tokenize` itself are propagated through `?`.
+
+    // Keep a copy of the raw input: `req` is moved into `infer.tokenize`, and the
+    // token text is recovered afterwards by slicing this copy with the offsets.
+    let input = req.inputs.clone();
+    let encoding = infer.tokenize(req).await?;
+    if let Some(encoding) = encoding {
+        let tokens: Vec<SimpleToken> = encoding
+            .get_ids()
+            .iter()
+            .zip(encoding.get_offsets())
+            .map(|(&id, &(start, stop))| {
+                // Offsets are applied to the UTF-8 bytes of the input (assumes the
+                // tokenizer reports byte offsets - TODO confirm). The checked `get`
+                // avoids panicking the handler on an inverted or out-of-range
+                // offset pair; such a token gets an empty `text` instead.
+                let text: String = input
+                    .as_bytes()
+                    .get(start..stop)
+                    .map(|bytes| String::from_utf8_lossy(bytes).into_owned())
+                    .unwrap_or_default();
+                SimpleToken {
+                    id,
+                    text,
+                    start,
+                    stop,
+                }
+            })
+            .collect();
+        Ok(Json(TokenizeResponse(tokens)))
+    } else {
+        Err((
+            StatusCode::NOT_FOUND,
+            Json(ErrorResponse {
+                error: "No fast tokenizer or tokenizer.json for this model".to_string(),
+                error_type: "no fast tokenizer".to_string(),
+            }),
+        ))
+    }
 }
 
 /// Prometheus metrics scrape endpoint
 #[utoipa::path(
-get,
-tag = "Text Generation Inference",
-path = "/metrics",
-responses((status = 200, description = "Prometheus Metrics", body = String))
+    get,
+    tag = "Text Generation Inference",
+    path = "/metrics",
+    responses((status = 200, description = "Prometheus Metrics", body = String))
 )]
 async fn metrics(prom_handle: Extension<PrometheusHandle>) -> String {
+    // The response body is exactly what the shared `PrometheusHandle` renders;
+    // the handle is injected through the `Extension` layer.
     prom_handle.render()
 }
 
+/// Newtype around the compute-type label shared with the request handlers.
+///
+/// `run` builds it from the `COMPUTE_TYPE` environment variable (falling back to
+/// `"gpu+optimized"`) and registers it as an axum `Extension`, from which the
+/// handlers extract it.
+#[derive(Clone, Debug)]
+pub(crate) struct ComputeType(String);
+
 /// Serving method
 #[allow(clippy::too_many_arguments)]
 pub async fn run(
+    master_shard_uds_path: String,
     model_info: HubModelInfo,
-    shard_info: ShardInfo,
     compat_return_full_text: bool,
     max_concurrent_requests: usize,
     max_best_of: usize,
     max_stop_sequences: usize,
     max_top_n_tokens: u32,
-    max_input_length: usize,
+    max_input_tokens: usize,
     max_total_tokens: usize,
     waiting_served_ratio: f32,
     max_batch_prefill_tokens: u32,
-    max_batch_total_tokens: u32,
+    max_batch_total_tokens: Option<u32>,
     max_waiting_tokens: usize,
-    client: ShardedClient,
+    max_batch_size: Option<usize>,
     tokenizer: Option<Tokenizer>,
+    config: Option<Config>,
     validation_workers: usize,
     addr: SocketAddr,
     allow_origin: Option<AllowOrigin>,
     ngrok: bool,
-    ngrok_authtoken: Option<String>,
-    ngrok_edge: Option<String>,
-) -> Result<(), axum::BoxError> {
+    _ngrok_authtoken: Option<String>,
+    _ngrok_edge: Option<String>,
+    tokenizer_config: HubTokenizerConfig,
+    preprocessor_config: Option<HubPreprocessorConfig>,
+    processor_config: HubProcessorConfig,
+    messages_api_enabled: bool,
+    grammar_support: bool,
+    max_client_batch_size: usize,
+    print_schema_command: bool,
+) -> Result<(), WebServerError> {
     // OpenAPI documentation
     #[derive(OpenApi)]
     #[openapi(
@@ -542,6 +1441,9 @@ pub async fn run(
     compat_generate,
     generate,
     generate_stream,
+    chat_completions,
+    completions,
+    tokenize,
     metrics,
     ),
     components(
@@ -549,16 +1451,43 @@ pub async fn run(
     Info,
     CompatGenerateRequest,
     GenerateRequest,
+    GrammarType,
+    ChatRequest,
+    Message,
+    ChatCompletionComplete,
+    ChatCompletionChoice,
+    ChatCompletionDelta,
+    ChatCompletionChunk,
+    ChatCompletionLogprob,
+    ChatCompletionLogprobs,
+    ChatCompletionTopLogprob,
+    ChatCompletion,
+    CompletionRequest,
+    CompletionComplete,
+    Chunk,
+    Completion,
+    CompletionFinal,
+    Prompt,
     GenerateParameters,
     PrefillToken,
     Token,
     GenerateResponse,
+    TokenizeResponse,
+    SimpleToken,
     BestOfSequence,
     Details,
     FinishReason,
     StreamResponse,
     StreamDetails,
     ErrorResponse,
+    GrammarType,
+    Usage,
+    DeltaToolCall,
+    ToolType,
+    Tool,
+    ToolCall,
+    Function,
+    FunctionDefinition,
     )
     ),
     tags(
@@ -575,27 +1504,166 @@ pub async fn run(
     struct ApiDoc;
 
     // Create state
+    if print_schema_command {
+        let api_doc = ApiDoc::openapi();
+        let api_doc = serde_json::to_string_pretty(&api_doc).unwrap();
+        println!("{}", api_doc);
+        std::process::exit(0);
+    }
+
+    // Open connection, get model info and warmup
+    let (scheduler, health_ext, shard_info, max_batch_total_tokens): (
+        Arc<dyn Scheduler + Send + Sync>,
+        HealthCheck,
+        ShardInfo,
+        u32,
+    ) = {
+        // Helper function to check both v2 and v3
+        let check_max_batch_total_tokens = |max_supported_batch_total_tokens: Option<u32>| {
+            match max_supported_batch_total_tokens {
+                // Older models do not support automatic max-batch-total-tokens
+                None => {
+                    let max_batch_total_tokens = max_batch_total_tokens.unwrap_or(
+                        16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)),
+                    );
+                    tracing::warn!("Model does not support automatic max batch total tokens");
+                    Ok(max_batch_total_tokens)
+                }
+                // Flash attention models return their max supported total tokens
+                Some(max_supported_batch_total_tokens) => {
+                    // Warn if user added his own max-batch-total-tokens as we will ignore it
+                    if max_batch_total_tokens.is_some() {
+                        tracing::warn!(
+                            "`--max-batch-total-tokens` is deprecated for Flash \
+                        Attention models."
+                        );
+                        tracing::warn!(
+                            "Inferred max batch total tokens: {max_supported_batch_total_tokens}"
+                        );
+                    }
+                    if max_total_tokens as u32 > max_supported_batch_total_tokens {
+                        return Err(WebServerError::NotEnoughMemory(max_total_tokens));
+                    }
+
+                    Ok(max_supported_batch_total_tokens)
+                }
+            }
+        };
+
+        let generation_health = Arc::new(AtomicBool::new(false));
+
+        match v3::ShardedClient::connect_uds(master_shard_uds_path.clone()).await {
+            Ok(mut sharded_client) => {
+                // server is running on v3
+                // Clear the cache; useful if the webserver rebooted
+                sharded_client
+                    .clear_cache(None)
+                    .await
+                    .map_err(WebServerError::Cache)?;
+                // Get info from the shard
+                let shard_info = sharded_client.info().await.map_err(WebServerError::Info)?;
+
+                // Warmup model
+                tracing::info!("Warming up model");
+                let max_batch_total_tokens = check_max_batch_total_tokens(
+                    sharded_client
+                        .warmup(
+                            max_input_tokens as u32,
+                            max_batch_prefill_tokens,
+                            max_total_tokens as u32,
+                            max_batch_size,
+                        )
+                        .await
+                        .map_err(WebServerError::Warmup)?,
+                )?;
+
+                let health_ext =
+                    HealthCheck::new(Arc::new(sharded_client.clone()), generation_health.clone());
+                let scheduler = Arc::new(SchedulerV3::new(
+                    sharded_client,
+                    waiting_served_ratio,
+                    max_batch_prefill_tokens,
+                    max_batch_total_tokens,
+                    max_waiting_tokens,
+                    max_batch_size,
+                    shard_info.requires_padding,
+                    shard_info.window_size,
+                    shard_info.speculate,
+                    generation_health,
+                ));
+                tracing::info!("Using scheduler V3");
+
+                (scheduler, health_ext, shard_info, max_batch_total_tokens)
+            }
+            Err(_) => {
+                let mut sharded_client = v2::ShardedClient::connect_uds(master_shard_uds_path)
+                    .await
+                    .map_err(WebServerError::Connection)?;
+
+                // server is running on v2
+                // Clear the cache; useful if the webserver rebooted
+                sharded_client
+                    .clear_cache(None)
+                    .await
+                    .map_err(WebServerError::Cache)?;
+                // Get info from the shard
+                let shard_info = sharded_client.info().await.map_err(WebServerError::Info)?;
+
+                // Warmup model
+                tracing::info!("Warming up model");
+                let max_batch_total_tokens = check_max_batch_total_tokens(
+                    sharded_client
+                        .warmup(
+                            max_input_tokens as u32,
+                            max_batch_prefill_tokens,
+                            max_total_tokens as u32,
+                            max_batch_size,
+                        )
+                        .await
+                        .map_err(WebServerError::Warmup)?,
+                )?;
+
+                let health_ext =
+                    HealthCheck::new(Arc::new(sharded_client.clone()), generation_health.clone());
+                let scheduler = Arc::new(SchedulerV2::new(
+                    sharded_client,
+                    waiting_served_ratio,
+                    max_batch_prefill_tokens,
+                    max_batch_total_tokens,
+                    max_waiting_tokens,
+                    max_batch_size,
+                    shard_info.requires_padding,
+                    shard_info.window_size,
+                    shard_info.speculate,
+                    generation_health,
+                ));
+                tracing::info!("Using scheduler V2");
+
+                (scheduler, health_ext, shard_info, max_batch_total_tokens)
+            }
+        }
+    };
+    tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");
+
     let validation = Validation::new(
         validation_workers,
         tokenizer,
+        config,
+        preprocessor_config,
         max_best_of,
         max_stop_sequences,
         max_top_n_tokens,
-        max_input_length,
+        max_input_tokens,
         max_total_tokens,
+        grammar_support,
     );
-    let generation_health = Arc::new(AtomicBool::new(false));
-    let health_ext = Health::new(client.clone(), generation_health.clone());
+
     let infer = Infer::new(
-        client,
+        scheduler,
         validation,
-        waiting_served_ratio,
-        max_batch_prefill_tokens,
-        max_batch_total_tokens,
-        max_waiting_tokens,
         max_concurrent_requests,
-        shard_info.requires_padding,
-        generation_health,
+        tokenizer_config,
+        processor_config,
     );
 
     // Duration buckets
@@ -612,7 +1680,7 @@ pub async fn run(
     // Input Length buckets
     let input_length_matcher = Matcher::Full(String::from("tgi_request_input_length"));
     let input_length_buckets: Vec<f64> = (0..100)
-        .map(|x| (max_input_length as f64 / 100.0) * (x + 1) as f64)
+        .map(|x| (max_input_tokens as f64 / 100.0) * (x + 1) as f64)
         .collect();
     // Generated tokens buckets
     let generated_tokens_matcher = Matcher::Full(String::from("tgi_request_generated_tokens"));
@@ -627,6 +1695,9 @@ pub async fn run(
     // Batch size buckets
     let batch_size_matcher = Matcher::Full(String::from("tgi_batch_next_size"));
     let batch_size_buckets: Vec<f64> = (0..1024).map(|x| (x + 1) as f64).collect();
+    // Speculated tokens buckets
+    let skipped_matcher = Matcher::Full(String::from("tgi_request_skipped_tokens"));
+    let skipped_buckets: Vec<f64> = (0..shard_info.speculate + 1).map(|x| x as f64).collect();
 
     // Prometheus handler
     let builder = PrometheusBuilder::new()
@@ -639,6 +1710,8 @@ pub async fn run(
         .set_buckets_for_metric(max_new_tokens_matcher, &max_new_tokens_buckets)
         .unwrap()
         .set_buckets_for_metric(batch_size_matcher, &batch_size_buckets)
+        .unwrap()
+        .set_buckets_for_metric(skipped_matcher, &skipped_buckets)
         .unwrap();
     let prom_handle = builder
         .install_recorder()
@@ -661,86 +1734,161 @@ pub async fn run(
         max_concurrent_requests,
         max_best_of,
         max_stop_sequences,
-        max_input_length,
+        max_input_tokens,
         max_total_tokens,
         waiting_served_ratio,
         max_batch_total_tokens,
         max_waiting_tokens,
+        max_batch_size,
         validation_workers,
+        max_client_batch_size,
+        router: env!("CARGO_PKG_NAME"),
         version: env!("CARGO_PKG_VERSION"),
         sha: option_env!("VERGEN_GIT_SHA"),
         docker_label: option_env!("DOCKER_LABEL"),
     };
 
-    // Create router
-    let app = Router::new()
-        .merge(SwaggerUi::new("/docs").url("/api-doc/openapi.json", ApiDoc::openapi()))
-        // Base routes
+    #[allow(unused_mut)] // mut is needed for conditional compilation
+    let mut doc = ApiDoc::openapi();
+
+    #[cfg(feature = "google")]
+    {
+        use crate::VertexInstance;
+
+        #[derive(OpenApi)]
+        #[openapi(
+            paths(vertex_compatibility),
+            components(schemas(VertexInstance, VertexRequest, VertexResponse))
+        )]
+        struct VertexApiDoc;
+
+        doc.merge(VertexApiDoc::openapi());
+    }
+
+    #[cfg(feature = "kserve")]
+    {
+        use crate::kserve::{
+            InferenceOutput, InferenceRequest, LiveResponse, MetadataServerResponse, OutputChunk,
+            ReadyResponse,
+        };
+        use crate::kserve::{
+            __path_kerve_server_metadata, __path_kserve_health_live, __path_kserve_health_ready,
+            __path_kserve_model_infer, __path_kserve_model_metadata,
+            __path_kserve_model_metadata_ready,
+        };
+
+        #[derive(OpenApi)]
+        #[openapi(
+            paths(
+                kserve_health_live,
+                kserve_health_ready,
+                kerve_server_metadata,
+                kserve_model_metadata,
+                kserve_model_metadata_ready,
+                kserve_model_infer,
+            ),
+            components(schemas(
+                InferenceOutput,
+                InferenceRequest,
+                LiveResponse,
+                MetadataServerResponse,
+                OutputChunk,
+                ReadyResponse,
+            ))
+        )]
+        struct KServeApiDoc;
+
+        doc.merge(KServeApiDoc::openapi());
+    }
+
+    // Configure Swagger UI
+    let swagger_ui = SwaggerUi::new("/docs").url("/api-doc/openapi.json", doc);
+
+    // Define base and health routes
+    let base_routes = Router::new()
         .route("/", post(compat_generate))
+        .route("/", get(health))
         .route("/info", get(get_model_info))
         .route("/generate", post(generate))
         .route("/generate_stream", post(generate_stream))
-        // AWS Sagemaker route
-        .route("/invocations", post(compat_generate))
-        // Base Health route
+        .route("/v1/chat/completions", post(chat_completions))
+        .route("/v1/completions", post(completions))
+        .route("/vertex", post(vertex_compatibility))
+        .route("/tokenize", post(tokenize))
         .route("/health", get(health))
-        // Inference API health route
-        .route("/", get(health))
-        // AWS Sagemaker health route
         .route("/ping", get(health))
-        // Prometheus metrics route
-        .route("/metrics", get(metrics))
+        .route("/metrics", get(metrics));
+
+    // Conditional AWS Sagemaker route
+    let aws_sagemaker_route = if messages_api_enabled {
+        Router::new().route("/invocations", post(chat_completions)) // Use 'chat_completions' for OAI_ENABLED
+    } else {
+        Router::new().route("/invocations", post(compat_generate)) // Use 'compat_generate' otherwise
+    };
+
+    let compute_type =
+        ComputeType(std::env::var("COMPUTE_TYPE").unwrap_or("gpu+optimized".to_string()));
+
+    // Combine routes and layers
+    let mut app = Router::new()
+        .merge(swagger_ui)
+        .merge(base_routes)
+        .merge(aws_sagemaker_route);
+
+    #[cfg(feature = "google")]
+    {
+        tracing::info!("Built with `google` feature");
+        tracing::info!(
+            "Environment variables `AIP_PREDICT_ROUTE` and `AIP_HEALTH_ROUTE` will be respected."
+        );
+        if let Ok(env_predict_route) = std::env::var("AIP_PREDICT_ROUTE") {
+            app = app.route(&env_predict_route, post(vertex_compatibility));
+        }
+        if let Ok(env_health_route) = std::env::var("AIP_HEALTH_ROUTE") {
+            app = app.route(&env_health_route, get(health));
+        }
+    }
+
+    #[cfg(feature = "kserve")]
+    {
+        tracing::info!("Built with `kserve` feature");
+        app = app
+            .route(
+                "/v2/models/:model_name/versions/:model_version/infer",
+                post(kserve_model_infer),
+            )
+            .route(
+                "/v2/models/:model_name/versions/:model_version",
+                get(kserve_model_metadata),
+            )
+            .route("/v2/health/ready", get(kserve_health_ready))
+            .route("/v2/health/live", get(kserve_health_live))
+            .route("/v2", get(kerve_server_metadata))
+            .route(
+                "/v2/models/:model_name/versions/:model_version/ready",
+                get(kserve_model_metadata_ready),
+            );
+    }
+
+    // add layers after routes
+    app = app
         .layer(Extension(info))
         .layer(Extension(health_ext.clone()))
         .layer(Extension(compat_return_full_text))
         .layer(Extension(infer))
+        .layer(Extension(compute_type))
         .layer(Extension(prom_handle.clone()))
-        .layer(opentelemetry_tracing_layer())
+        .layer(OtelAxumLayer::default())
         .layer(cors_layer);
 
+    tracing::info!("Connected");
+
     if ngrok {
         #[cfg(feature = "ngrok")]
         {
-            use ngrok::config::TunnelBuilder;
-
-            let _ = addr;
-
-            let authtoken =
-                ngrok_authtoken.expect("`ngrok-authtoken` must be set when using ngrok tunneling");
-
-            let edge = ngrok_edge.expect("`ngrok-edge` must be set when using ngrok tunneling");
-
-            let tunnel = ngrok::Session::builder()
-                .authtoken(authtoken)
-                .connect()
-                .await
-                .unwrap()
-                .labeled_tunnel()
-                .label("edge", edge);
-
-            let listener = tunnel.listen().await.unwrap();
-
-            // Run prom metrics and health locally too
-            tokio::spawn(
-                axum::Server::bind(&addr)
-                    .serve(
-                        Router::new()
-                            .route("/health", get(health))
-                            .route("/metrics", get(metrics))
-                            .layer(Extension(health_ext))
-                            .layer(Extension(prom_handle))
-                            .into_make_service(),
-                    )
-                    //Wait until all requests are finished to shut down
-                    .with_graceful_shutdown(shutdown_signal()),
-            );
+            panic!("ngrok feature is not functional with axum=0.7 and hyper=1, waiting on https://github.com/ngrok/ngrok-rust/pull/137/files to re-enable.");
 
             // Run server
-            axum::Server::builder(listener)
-                .serve(app.into_make_service())
-                //Wait until all requests are finished to shut down
-                .with_graceful_shutdown(shutdown_signal())
-                .await?;
         }
         #[cfg(not(feature = "ngrok"))]
         {
@@ -753,11 +1901,12 @@ pub async fn run(
         }
     } else {
         // Run server
-        axum::Server::bind(&addr)
-            .serve(app.into_make_service())
-            // Wait until all requests are finished to shut down
+
+        let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
+        axum::serve(listener, app)
             .with_graceful_shutdown(shutdown_signal())
-            .await?;
+            .await
+            .map_err(|err| WebServerError::Axum(Box::new(err)))?;
     }
     Ok(())
 }
@@ -790,17 +1939,6 @@ async fn shutdown_signal() {
     opentelemetry::global::shutdown_tracer_provider();
 }
 
-impl From<i32> for FinishReason {
-    fn from(finish_reason: i32) -> Self {
-        let finish_reason = text_generation_client::FinishReason::from_i32(finish_reason).unwrap();
-        match finish_reason {
-            text_generation_client::FinishReason::Length => FinishReason::Length,
-            text_generation_client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
-            text_generation_client::FinishReason::StopSequence => FinishReason::StopSequence,
-        }
-    }
-}
-
 /// Convert to Axum supported formats
 impl From<InferError> for (StatusCode, Json<ErrorResponse>) {
     fn from(err: InferError) -> Self {
@@ -809,6 +1947,8 @@ impl From<InferError> for (StatusCode, Json<ErrorResponse>) {
             InferError::Overloaded(_) => StatusCode::TOO_MANY_REQUESTS,
             InferError::ValidationError(_) => StatusCode::UNPROCESSABLE_ENTITY,
             InferError::IncompleteGeneration => StatusCode::INTERNAL_SERVER_ERROR,
+            InferError::TemplateError(_) => StatusCode::UNPROCESSABLE_ENTITY,
+            InferError::ToolError(_) => StatusCode::UNPROCESSABLE_ENTITY,
         };
 
         (
@@ -831,3 +1971,19 @@ impl From<InferError> for Event {
             .unwrap()
     }
 }
+
+/// Errors that `run` can return while starting or serving the web server.
+///
+/// The `ClientError` variants wrap failures of the individual shard-client calls
+/// made during startup; the `Display` text of each variant comes from its
+/// `#[error(...)]` attribute.
+#[derive(Debug, Error)]
+pub enum WebServerError {
+    /// Connecting to the shards over the unix socket failed.
+    #[error("Unable to connect to the Python model shards: {0}")]
+    Connection(ClientError),
+    /// The `clear_cache` call on the shards failed.
+    #[error("Unable to clear the Python model shards cache: {0}")]
+    Cache(ClientError),
+    /// The `info` call on the shards failed.
+    #[error("Unable to get the Python model shards info: {0}")]
+    Info(ClientError),
+    /// The `warmup` call on the shards failed.
+    #[error("Unable to warmup the Python model shards: {0}")]
+    Warmup(ClientError),
+    /// The requested `max_total_tokens` (carried in the variant) exceeds the
+    /// maximum batch total tokens reported by the shards after warmup.
+    #[error("Not enough memory to handle `max_total_tokens={0}`")]
+    NotEnoughMemory(usize),
+    /// Serving failed; also constructible from any `axum::BoxError` via `From`.
+    #[error("Axum error: {0}")]
+    Axum(#[from] axum::BoxError),
+}
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 6c67f0ff..12cf2ab3 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -1,13 +1,23 @@
 /// Payload validation logic
+use crate::config::Config;
 use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
-use crate::{GenerateParameters, GenerateRequest};
+use crate::{
+    GenerateParameters, GenerateRequest, GrammarType, HubPreprocessorConfig, Idefics2Preprocessor,
+};
+use base64::{engine::general_purpose::STANDARD, Engine};
+use image::{io::Reader as ImageReader, ImageFormat};
+use jsonschema::{Draft, JSONSchema};
 use rand::{thread_rng, Rng};
-use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters};
+use serde_json::Value;
+use std::io::Cursor;
+use std::iter;
+use text_generation_client::{Chunk, Image, InputChunk};
 use thiserror::Error;
 use tokenizers::tokenizer::Tokenizer;
-use tokenizers::TruncationDirection;
+use tokio::sync::mpsc;
 use tokio::sync::oneshot;
 use tracing::{instrument, Span};
+use {once_cell::sync::Lazy, regex::Regex};
 
 /// Validation
 #[derive(Debug, Clone)]
@@ -18,35 +28,53 @@ pub struct Validation {
     max_top_n_tokens: u32,
     max_input_length: usize,
     max_total_tokens: usize,
+    disable_grammar_support: bool,
     /// Channel to communicate with the background tokenization task
-    sender: Option<flume::Sender<TokenizerRequest>>,
+    sender: Option<mpsc::UnboundedSender<TokenizerRequest>>,
 }
 
 impl Validation {
+    #[allow(clippy::too_many_arguments)]
     pub(crate) fn new(
         workers: usize,
         tokenizer: Option<Tokenizer>,
+        config: Option<Config>,
+        preprocessor_config: Option<HubPreprocessorConfig>,
         max_best_of: usize,
         max_stop_sequences: usize,
         max_top_n_tokens: u32,
         max_input_length: usize,
         max_total_tokens: usize,
+        disable_grammar_support: bool,
     ) -> Self {
         // If we have a fast tokenizer
         let sender = if let Some(tokenizer) = tokenizer {
-            // Create channel
-            let (validation_sender, validation_receiver) = flume::unbounded();
+            // Create round robin channel
+            let (validation_sender, validation_round_robin_receiver) = mpsc::unbounded_channel();
+            let mut senders = Vec::with_capacity(workers);
 
             // Create workers
             for _ in 0..workers {
                 let tokenizer_clone = tokenizer.clone();
-                let receiver_clone = validation_receiver.clone();
+                let config_clone = config.clone();
+                let preprocessor_config_clone = preprocessor_config.clone();
+                let (tokenizer_sender, tokenizer_receiver) = mpsc::unbounded_channel();
+                senders.push(tokenizer_sender);
 
                 // Spawn worker
                 tokio::task::spawn_blocking(move || {
-                    tokenizer_worker(tokenizer_clone, receiver_clone)
+                    tokenizer_worker(
+                        tokenizer_clone,
+                        config_clone,
+                        preprocessor_config_clone,
+                        tokenizer_receiver,
+                    )
                 });
             }
+
+            // Create tokenization round robin task
+            tokio::spawn(round_robin_task(validation_round_robin_receiver, senders));
+
             Some(validation_sender)
         } else {
             None
@@ -59,16 +87,16 @@ impl Validation {
             max_top_n_tokens,
             max_input_length,
             max_total_tokens,
+            disable_grammar_support,
         }
     }
 
-    #[instrument(skip_all)]
-    async fn validate_input(
+    #[instrument(skip(self, inputs))]
+    pub async fn tokenize(
         &self,
         inputs: String,
         truncate: Option<usize>,
-        max_new_tokens: u32,
-    ) -> Result<(String, usize), ValidationError> {
+    ) -> Result<Option<(tokenizers::Encoding, Vec<InputChunk>)>, ValidationError> {
         // If we have a fast tokenizer
         if let Some(sender) = &self.sender {
             // Create response channel
@@ -81,9 +109,35 @@ impl Validation {
 
             // Await on response channel
             // Unwrap is safe here
-            let (inputs, input_length) = response_receiver.await.unwrap()?;
+            let encoding = response_receiver.await.unwrap()?;
+            Ok(Some(encoding))
+        } else {
+            Ok(None)
+        }
+    }
+
+    #[instrument(skip(self, inputs))]
+    async fn validate_input(
+        &self,
+        inputs: String,
+        truncate: Option<usize>,
+        max_new_tokens: Option<u32>,
+    ) -> Result<(Vec<InputChunk>, usize, u32), ValidationError> {
+        // If we have a fast tokenizer
+        if let Some((encoding, inputs)) = self.tokenize(inputs.clone(), truncate).await? {
+            // Clamp the reported input length to `truncate` when it is set
+            let input_length = if let Some(truncate) = truncate {
+                std::cmp::min(encoding.len(), truncate)
+            } else {
+                encoding.len()
+            };
 
             // Get total tokens
+            let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
+                max_new_tokens
+            } else {
+                self.max_total_tokens.saturating_sub(input_length) as u32
+            };
             let total_tokens = input_length + max_new_tokens as usize;
 
             // Validate MaxTotalTokens
@@ -104,24 +158,34 @@ impl Validation {
             }
 
             metrics::histogram!("tgi_request_input_length", input_length as f64);
-            Ok((inputs, input_length))
+            Ok((inputs, input_length, max_new_tokens))
         }
         // Return inputs without validation
         else {
             // In this case, we don't know the real length in tokens of the inputs
             // However, the inputs will be truncated by the python servers
             // We make sure that truncate + max_new_tokens <= self.max_total_tokens
-            let input_length = truncate.unwrap_or(self.max_input_length);
+            let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
+                max_new_tokens
+            } else if let Some(truncate) = truncate {
+                self.max_total_tokens.saturating_sub(truncate) as u32
+            } else {
+                return Err(ValidationError::UnsetMaxNewTokens);
+            };
+            let mut input_length = truncate.unwrap_or(self.max_input_length);
 
+            // We don't have a tokenizer, therefore we have no idea how long the query is; let
+            // the request through and hope for the best.
             // Validate MaxNewTokens
             if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
-                return Err(ValidationError::MaxNewTokens(
-                    self.max_total_tokens - self.max_input_length,
-                    max_new_tokens,
-                ));
+                input_length = input_length.saturating_sub(max_new_tokens as usize);
             }
 
-            Ok((inputs, input_length))
+            Ok((
+                vec![Chunk::Text(inputs).into()],
+                input_length,
+                max_new_tokens,
+            ))
         }
     }
 
@@ -135,6 +199,7 @@ impl Validation {
             best_of,
             temperature,
             repetition_penalty,
+            frequency_penalty,
             top_k,
             top_p,
             typical_p,
@@ -146,6 +211,8 @@ impl Validation {
             watermark,
             decoder_input_details,
             top_n_tokens,
+            grammar,
+            adapter_id,
             ..
         } = request.parameters;
 
@@ -171,6 +238,11 @@ impl Validation {
             return Err(ValidationError::RepetitionPenalty);
         }
 
+        let frequency_penalty = frequency_penalty.unwrap_or(0.0);
+        if !(-2.0..=2.0).contains(&frequency_penalty) {
+            return Err(ValidationError::FrequencyPenalty);
+        }
+
         // Different because the proto default value is not a valid value
         // for the user
         let top_p = top_p
@@ -200,7 +272,7 @@ impl Validation {
             })
             .unwrap_or(Ok(0))?;
 
-        if max_new_tokens == 0 {
+        if max_new_tokens == Some(0) {
             return Err(ValidationError::NegativeMaxNewTokens);
         }
 
@@ -247,21 +319,66 @@ impl Validation {
             .unwrap_or(Ok(None))?;
 
         // Validate inputs
-        let (inputs, input_length) = self
+        let (inputs, input_length, max_new_tokens) = self
             .validate_input(request.inputs, truncate, max_new_tokens)
             .await?;
 
-        let parameters = NextTokenChooserParameters {
+        // TODO: we should build the FSM here and pass the compiled FSM instead of the grammar
+        // NOTE: this is currently difficult because we need the tokenizer in Python to build
+        // the FSM and we'd have to load a copy of the tokenizer into our Pyo3 instance which
+        // may be slow and memory intensive. Best case is to have a Rust implementation of the FSM
+        // compiler and use that to build the FSM here.
+
+        // Validate grammar and unpack the grammar and type for the proto message
+        let grammar = match grammar {
+            Some(grammar) => {
+                // Ensure that grammar is not set if it's not supported
+                if self.disable_grammar_support {
+                    return Err(ValidationError::Grammar);
+                }
+                let valid_grammar = match grammar {
+                    GrammarType::Json(json) => {
+                        let json = match json {
+                            // if value is a string, we need to parse it again to make sure its
+                            // a valid json
+                            Value::String(s) => serde_json::from_str(&s)
+                                .map_err(|e| ValidationError::InvalidGrammar(e.to_string())),
+                            Value::Object(_) => Ok(json),
+                            _ => Err(ValidationError::Grammar),
+                        }?;
+
+                        // Check if the json is a valid JSONSchema
+                        JSONSchema::options()
+                            .with_draft(Draft::Draft202012)
+                            .compile(&json)
+                            .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?;
+
+                        // Serialize json to string
+                        ValidGrammar::Json(
+                            serde_json::to_string(&json)
+                                .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?,
+                        )
+                    }
+                    GrammarType::Regex(regex) => ValidGrammar::Regex(regex),
+                };
+                Some(valid_grammar)
+            }
+            None => None,
+        };
+
+        let parameters = ValidParameters {
             temperature,
             repetition_penalty,
+            frequency_penalty,
             top_k,
             top_p,
             typical_p,
             do_sample,
             seed,
             watermark,
+            grammar,
         };
-        let stopping_parameters = StoppingCriteriaParameters {
+        let stopping_parameters = ValidStoppingParameters {
             max_new_tokens,
             stop_sequences,
             ignore_eos_token: false,
@@ -276,7 +393,8 @@ impl Validation {
             truncate: truncate.unwrap_or(self.max_input_length) as u32,
             parameters,
             stopping_parameters,
-            top_n_tokens: top_n_tokens,
+            top_n_tokens,
+            adapter_id,
         })
     }
 
@@ -295,62 +413,271 @@ impl Validation {
     }
 }
 
+/// Round robin tokenization task: forwards each incoming request to the next worker in turn.
+async fn round_robin_task(
+    mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
+    senders: Vec<mpsc::UnboundedSender<TokenizerRequest>>,
+) {
+    // `cycle()` over an empty `senders` yields nothing, so with zero workers the task returns
+    // right away instead of spinning forever in a `loop` that never reaches an `.await`.
+    for sender in senders.iter().cycle() {
+        match receiver.recv().await {
+            None => return,
+            Some(request) => sender.send(request).unwrap(),
+        };
+    }
+}
+
 /// Start tokenization workers
-fn tokenizer_worker(tokenizer: Tokenizer, receiver: flume::Receiver<TokenizerRequest>) {
+fn tokenizer_worker(
+    tokenizer: Tokenizer,
+    config: Option<Config>,
+    preprocessor_config: Option<HubPreprocessorConfig>,
+    mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>,
+) {
     // Loop over requests
-    while let Ok(((inputs, truncate), response_tx, parent_span)) = receiver.recv() {
+    while let Some(((inputs, truncate), response_tx, parent_span)) = receiver.blocking_recv() {
         parent_span.in_scope(|| {
             response_tx
-                .send(prepare_input(inputs, truncate, &tokenizer))
+                .send(prepare_input(
+                    inputs,
+                    truncate,
+                    &tokenizer,
+                    config.as_ref(),
+                    preprocessor_config.as_ref(),
+                ))
                 .unwrap_or(())
         })
     }
 }
 
+fn format_from_mimetype(mimetype: &str) -> Option<ImageFormat> {
+    match mimetype { // exact, case-sensitive string match
+        "image/png" => Some(ImageFormat::Png),
+        "image/jpeg" => Some(ImageFormat::Jpeg),
+        "image/jpg" => Some(ImageFormat::Jpeg), // second spelling accepted for JPEG
+        "image/gif" => Some(ImageFormat::Gif),
+        "image/webp" => Some(ImageFormat::WebP),
+        "image/tiff" => Some(ImageFormat::Tiff),
+        // "image/pnm"=>Some(ImageFormat::Pnm),
+        // "image/tga"=>Some(ImageFormat::Tga),
+        // "image/dds"=>Some(ImageFormat::Dds),
+        // "image/bmp"=>Some(ImageFormat::Bmp),
+        // "image/ico"=>Some(ImageFormat::Ico),
+        // "image/x-exr"=>Some(ImageFormat::OpenExr),
+        _ => None, // includes the commented-out mimetypes above
+    }
+}
+
+fn format_to_mimetype(format: ImageFormat) -> String { // mimetype string for an image format
+    match format {
+        ImageFormat::Png => "image/png",
+        ImageFormat::Jpeg => "image/jpeg",
+        ImageFormat::Gif => "image/gif",
+        ImageFormat::WebP => "image/webp",
+        ImageFormat::Tiff => "image/tiff",
+        _ => "application/octet-stream", // every other `ImageFormat` variant
+    }
+    .to_string()
+}
+
+fn fetch_image(input: &str) -> Result<(Vec<u8>, String, usize, usize), ValidationError> {
+    if input.starts_with("![](http://") || input.starts_with("![](https://") {
+        let url = &input["![](".len()..input.len() - 1]; // skip `![](` and the last byte
+        let data = reqwest::blocking::get(url)?.bytes()?;
+
+        let format = image::guess_format(&data)?;
+        // TODO Remove this clone
+        let img = ImageReader::with_format(Cursor::new(data.clone()), format).decode()?;
+        let height: usize = img.height().try_into()?;
+        let width: usize = img.width().try_into()?;
+        let mimetype = format_to_mimetype(format); // derived from the sniffed format
+        Ok((data.to_vec(), mimetype, height, width))
+    } else if input.starts_with("![](data:") {
+        // Skip the `![](data:` prefix and the last byte; expected rest: `<mimetype>;base64,<data>`
+        let content = &input["![](data:".len()..input.len() - 1];
+        let tokens: Vec<_> = content.split(';').collect();
+        if tokens.len() != 2 {
+            return Err(ValidationError::InvalidImageContent(content.to_string()));
+        }
+        let mimetype = tokens[0];
+        let content = tokens[1];
+
+        if !content.starts_with("base64,") {
+            return Err(ValidationError::InvalidImageContent(content.to_string()));
+        }
+
+        let data = STANDARD.decode(content["base64,".len()..].as_bytes())?;
+        let img = if let Some(format) = format_from_mimetype(mimetype) {
+            ImageReader::with_format(Cursor::new(&data), format).decode()?
+        } else {
+            ImageReader::new(Cursor::new(&data))
+                .with_guessed_format()
+                .map_err(|_io_error| ValidationError::InvalidImageContent(content.to_string()))?
+                .decode()?
+        };
+
+        let height: usize = img.height().try_into()?;
+        let width: usize = img.width().try_into()?;
+        Ok((data, mimetype.to_string(), height, width)) // mimetype as declared in the URI
+    } else {
+        Err(ValidationError::InvalidImageContent(input.to_string()))
+    }
+}
+
+fn image_tokens(
+    config: &Config,
+    preprocessor_config: Option<&HubPreprocessorConfig>,
+    height: usize,
+    width: usize,
+) -> String { // placeholder token text that stands in for one image in the tokenizer query
+    use Config::*;
+    use HubPreprocessorConfig::*;
+    match config { // panics (`unimplemented!`) for any config not listed below
+        Idefics => "<image>".to_string(), // one placeholder, image size is not used
+        Idefics2(config) => {
+            const FAKE: &str = "<fake_token_around_image>";
+            const IMAGE: &str = "<image>";
+
+            let slots = config.get_number_of_features(height, width); // `<image>` repetitions
+
+            let mut image_string = String::with_capacity(2 * FAKE.len() + slots * IMAGE.len());
+            image_string.push_str(FAKE); // opening fake token
+            image_string.extend(iter::repeat(IMAGE).take(slots));
+            image_string.push_str(FAKE); // closing fake token
+
+            if matches!(
+                preprocessor_config,
+                Some(Idefics2Processor(Idefics2Preprocessor {
+                    do_image_splitting: true,
+                    ..
+                }))
+            ) {
+                image_string = image_string.repeat(5); // block emitted 5x when splitting is on
+            };
+
+            image_string
+        }
+        Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
+        LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
+        _ => unimplemented!("Images tokens are not supported for this model configuration"),
+    }
+}
+
+fn image_tokens_fixup(config: &Config, text: String) -> String {
+    match config {
+        Config::Idefics2(_) => { // merge the doubled fake token between adjacent image blocks
+            const FAKE: &str = "<fake_token_around_image>";
+            text.replace(&format!("{FAKE}{FAKE}"), FAKE)
+        }
+        _ => text, // other configs: returned unchanged
+    }
+}
+
 /// Get input length and optionally truncate it
 fn prepare_input(
     inputs: String,
-    truncate: Option<usize>,
+    _truncate: Option<usize>,
     tokenizer: &Tokenizer,
-) -> Result<(String, usize), ValidationError> {
-    // Get the number of tokens in the input
-    let mut encoding = tokenizer
-        .encode(inputs.clone(), true)
-        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
+    config: Option<&Config>,
+    preprocessor_config: Option<&HubPreprocessorConfig>,
+) -> Result<(tokenizers::Encoding, Vec<InputChunk>), ValidationError> {
+    use Config::*;
+    static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
+    let (tokenizer_query, input_chunks) = match config {
+        Some(config @ (Idefics | Idefics2(_) | Paligemma(_) | LlavaNext(_))) => {
+            let mut input_chunks = Vec::new();
+            let mut tokenizer_query = String::with_capacity(inputs.len());
+            let mut start = 0;
+            for chunk in RE.find_iter(&inputs) {
+                let chunk_start = chunk.start();
+                let chunk_end = chunk.end();
+                if chunk_start != start {
+                    input_chunks.push(Chunk::Text(inputs[start..chunk_start].to_string()).into());
+                    tokenizer_query.push_str(&inputs[start..chunk_start]);
+                }
+                let (data, mimetype, height, width) = fetch_image(&inputs[chunk_start..chunk_end])?;
+                input_chunks.push(Chunk::Image(Image { data, mimetype }).into());
+                tokenizer_query.push_str(&image_tokens(config, preprocessor_config, height, width));
+                start = chunk_end;
+            }
+            if start != inputs.len() {
+                input_chunks.push(Chunk::Text(inputs[start..].to_string()).into());
+                tokenizer_query.push_str(&inputs[start..]);
+            }
 
-    // Optionally truncate
-    let (inputs, input_length) = match truncate {
-        // Truncate is some and < encoding length
-        Some(truncate) if truncate < encoding.len() => {
-            // truncate encoding and decode new inputs
-            encoding.truncate(truncate, 0, TruncationDirection::Left);
-            let inputs = tokenizer
-                .decode(encoding.get_ids(), false)
-                .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
-            (inputs, encoding.len())
+            tokenizer_query = image_tokens_fixup(config, tokenizer_query);
+
+            (tokenizer_query, input_chunks)
         }
-        // Nothing to do
-        _ => (inputs, encoding.len()),
+        _ => (inputs.clone(), vec![Chunk::Text(inputs).into()]),
     };
 
-    Ok((inputs, input_length))
+    // Get the number of tokens in the input
+    let encoding = tokenizer
+        .encode(tokenizer_query, true)
+        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
+
+    Ok((encoding, input_chunks))
 }
 
 type TokenizerRequest = (
     (String, Option<usize>),
-    oneshot::Sender<Result<(String, usize), ValidationError>>,
+    oneshot::Sender<Result<(tokenizers::Encoding, Vec<InputChunk>), ValidationError>>,
     Span,
 );
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
+pub(crate) enum ValidGrammar {
+    Json(String), // JSON schema, re-serialized to a string after it compiled as a JSONSchema
+    Regex(String), // regex source, passed through unchanged by `validate`
+}
+
+#[derive(Debug, Clone)]
+pub(crate) struct ValidParameters {
+    /// Exponential scaling of the output probability distribution
+    pub temperature: f32,
+    /// Restrict sampling to the k highest-probability elements
+    pub top_k: u32,
+    /// Restrict sampling to the top tokens whose probabilities sum to at most `top_p`
+    pub top_p: f32,
+    /// Typical-decoding cutoff. NOTE(review): old text was a copy of `top_p`'s; confirm wording
+    pub typical_p: f32,
+    /// Apply sampling on the logits
+    pub do_sample: bool,
+    /// Random seed for sampling
+    pub seed: u64,
+    /// Repetition penalty
+    pub repetition_penalty: f32,
+    /// Frequency penalty (`validate` only accepts values in `-2.0..=2.0`)
+    pub frequency_penalty: f32,
+    /// Token watermarking using "A Watermark for Large Language Models"
+    pub watermark: bool,
+    /// Grammar constraint (applied when `Some`)
+    pub grammar: Option<ValidGrammar>,
+}
+
+#[derive(Debug, Clone)]
+pub(crate) struct ValidStoppingParameters {
+    /// Maximum number of generated tokens
+    pub max_new_tokens: u32,
+    /// Optional stopping sequences
+    pub stop_sequences: Vec<String>,
+    /// Ignore the end-of-sequence token
+    /// (used for benchmarking)
+    pub ignore_eos_token: bool,
+}
+
+#[derive(Debug, Clone)]
 pub(crate) struct ValidGenerateRequest {
-    pub inputs: String,
+    pub inputs: Vec<InputChunk>,
     pub input_length: u32,
     pub truncate: u32,
     pub decoder_input_details: bool,
-    pub parameters: NextTokenChooserParameters,
-    pub stopping_parameters: StoppingCriteriaParameters,
+    pub parameters: ValidParameters,
+    pub stopping_parameters: ValidStoppingParameters,
     pub top_n_tokens: u32,
+    pub adapter_id: Option<String>,
 }
 
 #[derive(Error, Debug)]
@@ -375,6 +702,8 @@ pub enum ValidationError {
     Temperature,
     #[error("`repetition_penalty` must be strictly positive")]
     RepetitionPenalty,
+    #[error("`frequency_penalty` must be >= -2.0 and <= 2.0")]
+    FrequencyPenalty,
     #[error("`top_p` must be > 0.0 and < 1.0")]
     TopP,
     #[error("`top_k` must be strictly positive")]
@@ -383,6 +712,8 @@ pub enum ValidationError {
     Truncate(usize, usize),
     #[error("`typical_p` must be > 0.0 and < 1.0")]
     TypicalP,
+    #[error("one of `max_new_tokens` or `truncate` must be set if a fast tokenizer is not in use")]
+    UnsetMaxNewTokens,
     #[error("`max_new_tokens` must be strictly positive")]
     NegativeMaxNewTokens,
     #[error("`max_new_tokens` must be <= {0}. Given: {1}")]
@@ -397,11 +728,26 @@ pub enum ValidationError {
     StopSequence(usize, usize),
     #[error("tokenizer error {0}")]
     Tokenizer(String),
+    #[error("grammar is not supported")]
+    Grammar,
+    #[error("grammar is not valid: {0}")]
+    InvalidGrammar(String),
+    #[error("base64 encoding is invalid: {0}")]
+    InvalidBase64(#[from] base64::DecodeError),
+    #[error("invalid image: {0}")]
+    InvalidImage(#[from] image::ImageError),
+    #[error("invalid integer: {0}")]
+    InvalidInt(#[from] core::num::TryFromIntError),
+    #[error("invalid image content: {0}")]
+    InvalidImageContent(String),
+    #[error("Could not fetch image: {0}")]
+    FailedFetchImage(#[from] reqwest::Error),
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::config::{Idefics2, PaliTextConfig, Paligemma};
     use crate::default_parameters;
     use crate::tests::get_tokenizer;
 
@@ -414,23 +760,29 @@ mod tests {
         let max_input_length = 5;
         let max_total_tokens = 6;
         let workers = 1;
+        let disable_grammar_support = true;
+        let config = None;
         let validation = Validation::new(
             workers,
             tokenizer,
+            config,
+            None,
             max_best_of,
             max_stop_sequence,
             max_top_n_tokens,
             max_input_length,
             max_total_tokens,
+            disable_grammar_support,
         );
 
         let max_new_tokens = 10;
         match validation
-            .validate_input("Hello".to_string(), None, max_new_tokens)
+            .validate_input("Hello".to_string(), None, Some(max_new_tokens))
             .await
         {
-            Err(ValidationError::MaxNewTokens(1, 10)) => (),
-            _ => panic!("Unexpected not max new tokens"),
+            // Err(ValidationError::MaxNewTokens(1, 10)) => (),
+            Ok((_s, 0, 10)) => (),
+            r => panic!("Unexpected not max new tokens: {r:?}"),
         }
     }
 
@@ -442,20 +794,25 @@ mod tests {
         let max_top_n_tokens = 4;
         let max_input_length = 5;
         let max_total_tokens = 6;
+        let disable_grammar_support = true;
         let workers = 1;
+        let config = None;
         let validation = Validation::new(
             workers,
             tokenizer,
+            config,
+            None,
             max_best_of,
             max_stop_sequence,
             max_top_n_tokens,
             max_input_length,
             max_total_tokens,
+            disable_grammar_support,
         );
 
         let max_new_tokens = 10;
         match validation
-            .validate_input("Hello".to_string(), None, max_new_tokens)
+            .validate_input("Hello".to_string(), None, Some(max_new_tokens))
             .await
         {
             Err(ValidationError::MaxTotalTokens(6, 1, 10)) => (),
@@ -472,14 +829,19 @@ mod tests {
         let max_input_length = 5;
         let max_total_tokens = 6;
         let workers = 1;
+        let disable_grammar_support = true;
+        let config = None;
         let validation = Validation::new(
             workers,
             tokenizer,
+            config,
+            None,
             max_best_of,
             max_stop_sequence,
             max_top_n_tokens,
             max_input_length,
             max_total_tokens,
+            disable_grammar_support,
         );
         match validation
             .validate(GenerateRequest {
@@ -504,22 +866,28 @@ mod tests {
         let max_stop_sequence = 3;
         let max_top_n_tokens = 4;
         let max_input_length = 5;
-        let max_total_tokens = 6;
+        let max_total_tokens = 106;
         let workers = 1;
+        let disable_grammar_support = true;
+        let config = None;
         let validation = Validation::new(
             workers,
             tokenizer,
+            config,
+            None,
             max_best_of,
             max_stop_sequence,
             max_top_n_tokens,
             max_input_length,
             max_total_tokens,
+            disable_grammar_support,
         );
         match validation
             .validate(GenerateRequest {
                 inputs: "Hello".to_string(),
                 parameters: GenerateParameters {
                     top_p: Some(1.0),
+                    max_new_tokens: Some(5),
                     ..default_parameters()
                 },
             })
@@ -534,7 +902,7 @@ mod tests {
                 inputs: "Hello".to_string(),
                 parameters: GenerateParameters {
                     top_p: Some(0.99),
-                    max_new_tokens: 1,
+                    max_new_tokens: Some(5),
                     ..default_parameters()
                 },
             })
@@ -549,7 +917,7 @@ mod tests {
                 inputs: "Hello".to_string(),
                 parameters: GenerateParameters {
                     top_p: None,
-                    max_new_tokens: 1,
+                    max_new_tokens: Some(5),
                     ..default_parameters()
                 },
             })
@@ -566,22 +934,28 @@ mod tests {
         let max_stop_sequences = 3;
         let max_top_n_tokens = 4;
         let max_input_length = 5;
-        let max_total_tokens = 6;
+        let max_total_tokens = 106;
         let workers = 1;
+        let disable_grammar_support = true;
+        let config = None;
         let validation = Validation::new(
             workers,
             tokenizer,
+            config,
+            None,
             max_best_of,
             max_stop_sequences,
             max_top_n_tokens,
             max_input_length,
             max_total_tokens,
+            disable_grammar_support,
         );
         match validation
             .validate(GenerateRequest {
                 inputs: "Hello".to_string(),
                 parameters: GenerateParameters {
                     top_n_tokens: Some(5),
+                    max_new_tokens: Some(5),
                     ..default_parameters()
                 },
             })
@@ -596,7 +970,7 @@ mod tests {
                 inputs: "Hello".to_string(),
                 parameters: GenerateParameters {
                     top_n_tokens: Some(4),
-                    max_new_tokens: 1,
+                    max_new_tokens: Some(5),
                     ..default_parameters()
                 },
             })
@@ -608,7 +982,7 @@ mod tests {
                 inputs: "Hello".to_string(),
                 parameters: GenerateParameters {
                     top_n_tokens: Some(0),
-                    max_new_tokens: 1,
+                    max_new_tokens: Some(5),
                     ..default_parameters()
                 },
             })
@@ -620,7 +994,7 @@ mod tests {
                 inputs: "Hello".to_string(),
                 parameters: GenerateParameters {
                     top_n_tokens: None,
-                    max_new_tokens: 1,
+                    max_new_tokens: Some(5),
                     ..default_parameters()
                 },
             })
@@ -629,4 +1003,141 @@ mod tests {
 
         assert_eq!(valid_request.top_n_tokens, 0);
     }
+
+    static PIXEL_GIF: &str = "R0lGODdhAQABAIEAAP///wAAAAAAAAAAACwAAAAAAQABAAAIBAABBAQAOw==";
+
+    #[tokio::test]
+    async fn test_prepare_input_chunks() {
+        let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();
+
+        let tokenizer = Some(get_tokenizer().await);
+
+        let max_best_of = 2;
+        let max_stop_sequence = 3;
+        let max_top_n_tokens = 4;
+        let max_input_length = 5;
+        let max_total_tokens = 6;
+        let disable_grammar_support = true;
+        let workers = 1;
+        let config = Config::Paligemma(Paligemma {
+            text_config: PaliTextConfig {
+                num_image_tokens: 1,
+            },
+        });
+        let validation = Validation::new(
+            workers,
+            tokenizer,
+            Some(config),
+            None,
+            max_best_of,
+            max_stop_sequence,
+            max_top_n_tokens,
+            max_input_length,
+            max_total_tokens,
+            disable_grammar_support,
+        );
+
+        let chunks = match validation
+            .tokenize(
+                format!("test![](data:image/gif;base64,{})", PIXEL_GIF),
+                None,
+            )
+            .await
+        {
+            Ok(Some((_encoding, chunks))) => chunks,
+            _ => panic!("Unexpected tokenization failure"),
+        };
+
+        assert!(
+            chunks
+                == vec![
+                    Chunk::Text("test".to_string()).into(),
+                    Chunk::Image(Image {
+                        data: pixel_data.clone(),
+                        mimetype: "image/gif".to_string()
+                    })
+                    .into()
+                ],
+            "Failed to process images",
+        );
+    }
+
+    #[tokio::test]
+    async fn test_idefics2_correct_n_fake_tokens() {
+        let pixel_data = STANDARD.decode(PIXEL_GIF).unwrap();
+
+        let tokenizer = Some(get_tokenizer().await);
+
+        let max_best_of = 2;
+        let max_stop_sequence = 3;
+        let max_top_n_tokens = 4;
+        let max_input_length = 5;
+        let max_total_tokens = 6;
+        let disable_grammar_support = true;
+        let workers = 1;
+        let config = Config::Idefics2(Idefics2 {});
+        let validation = Validation::new(
+            workers,
+            tokenizer,
+            Some(config),
+            Some(HubPreprocessorConfig::Idefics2Processor(
+                Idefics2Preprocessor {
+                    do_image_splitting: true,
+                },
+            )),
+            max_best_of,
+            max_stop_sequence,
+            max_top_n_tokens,
+            max_input_length,
+            max_total_tokens,
+            disable_grammar_support,
+        );
+
+        let (encoding, chunks) = match validation
+            .tokenize(
+                format!(
+                    "test![](data:image/gif;base64,{})![](data:image/gif;base64,{})",
+                    PIXEL_GIF, PIXEL_GIF
+                ),
+                None,
+            )
+            .await
+        {
+            Ok(Some((encoding, chunks))) => (encoding, chunks),
+            _ => panic!("Unexpected tokenization failure"),
+        };
+
+        assert!(
+            chunks
+                == vec![
+                    Chunk::Text("test".to_string()).into(),
+                    Chunk::Image(Image {
+                        data: pixel_data.clone(),
+                        mimetype: "image/gif".to_string()
+                    })
+                    .into(),
+                    Chunk::Image(Image {
+                        data: pixel_data.clone(),
+                        mimetype: "image/gif".to_string()
+                    })
+                    .into()
+                ],
+            "Failed to process images",
+        );
+
+        // Verify the number of fake tokens:
+        //
+        // - Two images surrounded/separated by a fake token = 3.
+        // - Both are split in 5 subimages, separated by a fake token: 2 * 4
+        //
+        // Fake tokens get split up by the testing tokenizer, but we don't care.
+        assert_eq!(
+            encoding
+                .get_tokens()
+                .iter()
+                .filter(|t| *t == "fake")
+                .count(),
+            11
+        );
+    }
 }
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 2db1883c..8c77896e 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,3 +1,5 @@
 [toolchain]
-channel = "1.70.0"
-components = ["rustfmt", "clippy"]
\ No newline at end of file
+# Released on: June 13, 2024
+# https://releases.rs/docs/1.79.0/
+channel = "1.79.0"
+components = ["rustfmt", "clippy"]
diff --git a/server/.gitignore b/server/.gitignore
index 2e1db124..576746ee 100644
--- a/server/.gitignore
+++ b/server/.gitignore
@@ -159,3 +159,6 @@ safetensors
 flash-attention/
 flash-attention-v2/
 vllm/
+llm-awq/
+eetq/
+mamba/
diff --git a/server/Makefile b/server/Makefile
index a4ce6d8b..0099c56a 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -1,30 +1,39 @@
 include Makefile-flash-att
 include Makefile-flash-att-v2
 include Makefile-vllm
+include Makefile-awq
+include Makefile-eetq
+include Makefile-selective-scan
+include Makefile-lorax-punica
 
 unit-tests:
 	pytest -s -vv -m "not private" tests
 
 gen-server:
 	# Compile protos
-	pip install grpcio-tools==1.51.1 mypy-protobuf==3.4.0 'types-protobuf>=3.20.4' --no-cache-dir
+	pip install grpcio-tools==1.62.2 mypy-protobuf==3.6.0 'types-protobuf' --no-cache-dir
 	mkdir text_generation_server/pb || true
-	python -m grpc_tools.protoc -I../proto --python_out=text_generation_server/pb \
-		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/generate.proto
+	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
+		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
 	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
 	touch text_generation_server/pb/__init__.py
 
-install-torch:
-	# Install specific version of torch
-	pip install torch --extra-index-url https://download.pytorch.org/whl/cu118 --no-cache-dir
-
-install: gen-server install-torch
+install-server: gen-server
 	pip install pip --upgrade
-	pip install -r requirements.txt
-	pip install -e ".[bnb, accelerate]"
+	pip install -r requirements_cuda.txt
+	pip install -e ".[bnb, accelerate, quantize, peft, outlines]"
+
+
+install: install-cuda
+	echo "Installed server"
+
+install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
+
+install-rocm: install-server install-flash-attention-v2-rocm  install-vllm-rocm
 
 run-dev:
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
 
 export-requirements:
-	poetry export -o requirements.txt -E bnb -E quantize --without-hashes
+	poetry export -o requirements_cuda.txt --without-hashes
+	poetry export -o requirements_rocm.txt --without-hashes
diff --git a/server/Makefile-awq b/server/Makefile-awq
new file mode 100644
index 00000000..4e074a13
--- /dev/null
+++ b/server/Makefile-awq
@@ -0,0 +1,15 @@
+# Fork that adds only the correct stream to this kernel in order
+# to make cuda graphs work.
+awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4
+
+awq:
+	rm -rf llm-awq
+	git clone https://github.com/huggingface/llm-awq
+
+build-awq: awq
+	cd llm-awq/ && git fetch && git checkout $(awq_commit)
+	cd llm-awq/awq/kernels && python setup.py build
+
+install-awq: build-awq
+	pip uninstall awq_inference_engine -y || true
+	cd llm-awq/awq/kernels && python setup.py install
diff --git a/server/Makefile-eetq b/server/Makefile-eetq
new file mode 100644
index 00000000..726e47b5
--- /dev/null
+++ b/server/Makefile-eetq
@@ -0,0 +1,13 @@
+eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0
+
+eetq:
+    # Clone eetq
+	pip install packaging
+	git clone https://github.com/NetEase-FuXi/EETQ.git eetq
+
+build-eetq: eetq
+	cd eetq && git fetch && git checkout $(eetq_commit) && git submodule update --init --recursive
+	cd eetq && python setup.py build
+
+install-eetq: build-eetq
+	cd eetq && python setup.py install
diff --git a/server/Makefile-flash-att b/server/Makefile-flash-att
index bc1d37ef..29e75bc4 100644
--- a/server/Makefile-flash-att
+++ b/server/Makefile-flash-att
@@ -1,16 +1,12 @@
 flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec
 
-flash-attention:
-    # Clone flash attention
-	pip install packaging
-	git clone https://github.com/HazyResearch/flash-attention.git
-
-build-flash-attention: flash-attention
-	cd flash-attention && git fetch && git checkout $(flash_att_commit)
-	cd flash-attention && python setup.py build
-	cd flash-attention/csrc/rotary && python setup.py build
-	cd flash-attention/csrc/layer_norm && python setup.py build
+build-flash-attention:
+	if [ ! -d 'flash-attention' ]; then \
+		pip install -U packaging ninja  --no-cache-dir && \
+		git clone https://github.com/HazyResearch/flash-attention.git; \
+	fi
+	cd flash-attention && git fetch && git checkout $(flash_att_commit) && \
+	MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build
 
 install-flash-attention: build-flash-attention
-	pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
-	cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
\ No newline at end of file
+	cd flash-attention && git checkout $(flash_att_commit) && MAX_JOBS=8 python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
diff --git a/server/Makefile-flash-att-v2 b/server/Makefile-flash-att-v2
index a7d63356..ba90a74d 100644
--- a/server/Makefile-flash-att-v2
+++ b/server/Makefile-flash-att-v2
@@ -1,13 +1,21 @@
-flash_att_v2_commit := 4f285b354796fb17df8636485b9a04df3ebbb7dc
+flash_att_v2_commit_cuda := v2.5.9.post1
+flash_att_v2_commit_rocm := 2554f490101742ccdc56620a938f847f61754be6
 
-flash-attention-v2:
-    # Clone flash attention
-	pip install packaging
-	git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2
+build-flash-attention-v2-cuda:
+	pip install -U packaging wheel
+	pip install flash-attn==$(flash_att_v2_commit_cuda)
 
-build-flash-attention-v2: flash-attention-v2
-	cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit)
-	cd flash-attention-v2 && python setup.py build
+install-flash-attention-v2-cuda: build-flash-attention-v2-cuda
+	echo "Flash v2 installed"
 
-install-flash-attention-v2: build-flash-attention-v2
-	cd flash-attention-v2 && python setup.py install
\ No newline at end of file
+build-flash-attention-v2-rocm:
+	if [ ! -d 'flash-attention-v2' ]; then \
+		pip install -U packaging ninja  --no-cache-dir && \
+		git clone https://github.com/ROCm/flash-attention.git flash-attention-v2 && \
+		cd flash-attention-v2 && git fetch && git checkout $(flash_att_v2_commit_rocm) && \
+		git submodule update --init --recursive && GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build; \
+	fi
+
+install-flash-attention-v2-rocm: build-flash-attention-v2-rocm
+	cd flash-attention-v2 &&  \
+	GPU_ARCHS="gfx90a;gfx942" PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py install
diff --git a/server/Makefile-lorax-punica b/server/Makefile-lorax-punica
new file mode 100644
index 00000000..72f06f76
--- /dev/null
+++ b/server/Makefile-lorax-punica
@@ -0,0 +1,12 @@
+lorax_punica_commit := c71861a653412267dc27ec86013dd945ce3474bc
+
+build-lorax-punica:
+	if [ ! -d 'lorax-punica' ]; then \
+		git clone --no-checkout https://github.com/predibase/lorax.git lorax-punica; \
+	fi
+	cd lorax-punica && git sparse-checkout set server/punica_kernels && git checkout $(lorax_punica_commit)
+	cd lorax-punica && git submodule update --init --recursive
+	cd lorax-punica/server/punica_kernels && python setup.py build
+
+install-lorax-punica: build-lorax-punica
+	cd lorax-punica/server/punica_kernels && python setup.py install
diff --git a/server/Makefile-selective-scan b/server/Makefile-selective-scan
new file mode 100644
index 00000000..b93b517d
--- /dev/null
+++ b/server/Makefile-selective-scan
@@ -0,0 +1,28 @@
+selective_scan_commit := 2a3704fd47ba817b415627b06fd796b971fdc137
+
+causal-conv1d:
+	rm -rf causal-conv1d
+	git clone https://github.com/Dao-AILab/causal-conv1d.git
+
+build-causal-conv1d: causal-conv1d
+	cd causal-conv1d/ && git checkout v1.1.1 # known latest working version tag
+	cd causal-conv1d/ && CAUSAL_CONV1D_FORCE_BUILD=TRUE python setup.py build
+
+install-causal-conv1d: build-causal-conv1d
+	pip uninstall causal-conv1d -y || true
+	cd causal-conv1d/ && pip install .
+
+# selective-scan depends on causal-conv1d
+selective-scan:
+	rm -rf mamba
+	git clone https://github.com/state-spaces/mamba.git mamba
+
+build-selective-scan: selective-scan
+	cd mamba/ && git fetch && git checkout $(selective_scan_commit)
+	cd mamba && python setup.py build
+
+install-selective-scan: install-causal-conv1d build-selective-scan
+	pip uninstall selective-scan-cuda -y || true
+	cd mamba && pip install .
+
+build-all: build-causal-conv1d build-selective-scan
diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index af750733..2f2b5ef6 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,13 +1,23 @@
-vllm_commit := d284b831c17f42a8ea63369a06138325f73c4cf9
+commit_cuda := b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
+commit_rocm := c6ee53b1be97e3bbc791b95f22827501297f8921
+build-vllm-cuda:
+	if [ ! -d 'vllm' ]; then \
+		pip install -U ninja packaging --no-cache-dir && \
+		git clone https://github.com/Narsil/vllm.git vllm; \
+	fi
+	cd vllm  && git fetch && git checkout $(commit_cuda) && python setup.py build
 
-vllm:
-    # Clone vllm
-	git clone https://github.com/OlivierDehaene/vllm.git
+install-vllm-cuda: build-vllm-cuda
+	cd vllm  && git fetch && git checkout $(commit_cuda) && pip install -e .
 
-build-vllm: vllm
-	cd vllm && git fetch && git checkout $(vllm_commit)
-	cd vllm && python setup.py build
+build-vllm-rocm:
+	if [ ! -d 'vllm' ]; then \
+		pip install -U ninja packaging --no-cache-dir && \
+		git clone https://github.com/fxmarty/rocm-vllm.git vllm; \
+	fi
+	cd vllm && git fetch && git checkout $(commit_rocm) &&  \
+	PYTORCH_ROCM_ARCH="gfx90a;gfx942" python setup.py build
 
-install-vllm: build-vllm
-	pip uninstall vllm -y || true
-	cd vllm && python setup.py install
\ No newline at end of file
+install-vllm-rocm: build-vllm-rocm
+	cd vllm && git fetch && git checkout $(commit_rocm) && \
+	PYTORCH_ROCM_ARCH="gfx90a;gfx942" pip install -e .
diff --git a/server/README.md b/server/README.md
index 8efd80ac..b8208f9e 100644
--- a/server/README.md
+++ b/server/README.md
@@ -12,4 +12,4 @@ make install
 
 ```shell
 make run-dev
-```
\ No newline at end of file
+```
diff --git a/server/custom_kernels/custom_kernels/fused_bloom_attention_cuda.cu b/server/custom_kernels/custom_kernels/fused_bloom_attention_cuda.cu
index 4be547b1..8206c3e0 100644
--- a/server/custom_kernels/custom_kernels/fused_bloom_attention_cuda.cu
+++ b/server/custom_kernels/custom_kernels/fused_bloom_attention_cuda.cu
@@ -247,4 +247,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         &forward,
         "Bloom attention mechanism forward (CUDA)"
     );
-}
\ No newline at end of file
+}
diff --git a/server/custom_kernels/setup.py b/server/custom_kernels/setup.py
index 43b8ee4e..69f6b72a 100644
--- a/server/custom_kernels/setup.py
+++ b/server/custom_kernels/setup.py
@@ -1,5 +1,10 @@
 from setuptools import setup
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+import torch
+
+extra_compile_args = ["-std=c++17"]
+if not torch.version.hip:
+    extra_compile_args.append("-arch=compute_80")
 
 setup(
     name="custom_kernels",
@@ -7,12 +12,12 @@ setup(
         CUDAExtension(
             name="custom_kernels.fused_bloom_attention_cuda",
             sources=["custom_kernels/fused_bloom_attention_cuda.cu"],
-            extra_compile_args=["-arch=compute_80", "-std=c++17"],
+            extra_compile_args=extra_compile_args,
         ),
         CUDAExtension(
             name="custom_kernels.fused_attention_cuda",
             sources=["custom_kernels/fused_attention_cuda.cu"],
-            extra_compile_args=["-arch=compute_80", "-std=c++17"],
+            extra_compile_args=extra_compile_args,
         ),
     ],
     cmdclass={"build_ext": BuildExtension},
diff --git a/server/exllama_kernels/exllama_kernels/cuda_compat.cuh b/server/exllama_kernels/exllama_kernels/cu_compat.cuh
similarity index 91%
rename from server/exllama_kernels/exllama_kernels/cuda_compat.cuh
rename to server/exllama_kernels/exllama_kernels/cu_compat.cuh
index 8dfa25de..c5258813 100644
--- a/server/exllama_kernels/exllama_kernels/cuda_compat.cuh
+++ b/server/exllama_kernels/exllama_kernels/cu_compat.cuh
@@ -43,12 +43,12 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 
 //
 
-#if defined(__CUDA_ARCH__)
-#if __CUDA_ARCH__ < 700
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
 
 __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 
-#if __CUDA_ARCH__ < 600
+#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
 __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
 #endif
 
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh
index 6571c17d..0364e38c 100644
--- a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh
+++ b/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh
@@ -16,4 +16,4 @@ void column_remap_cuda
     const uint32_t* x_map
 );
 
-#endif
\ No newline at end of file
+#endif
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu
index 60dc4c9d..1b0f7956 100644
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu
+++ b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu
@@ -1,9 +1,13 @@
 #include "q4_matmul.cuh"
 #include "column_remap.cuh"
+#include <ATen/cuda/CUDAContext.h>
 #include "../util.cuh"
 #include "../matrix.cuh"
-#include "../cuda_compat.cuh"
+#include "../cu_compat.cuh"
 #include "../cuda_buffers.cuh"
+#if defined(USE_ROCM)
+#include "../hip_compat.cuh"
+#endif
 
 const int THREADS_X = 32;       // Block size and thread count along columns in w and out
 const int THREADS_Y = 1;        // Block size and thread count along rows in x and out
@@ -82,7 +86,7 @@ __global__ void q4_matmul_kernel
             if constexpr (use_half2)
             {
                 half2 w_scale = w_scales_.item_half2half2(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
+                uint32_t w_zero = (w_zeros_.item(group, w_column) + 1) & 0x0F;
 
                 if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
                 else                     acc = dot_product_8      (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
@@ -90,7 +94,7 @@ __global__ void q4_matmul_kernel
             else
             {
                 half w_scale = w_scales_.item(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
+                uint32_t w_zero = (w_zeros_.item(group, w_column) + 1) & 0x0F;
 
                 if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
                 else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
@@ -107,7 +111,7 @@ __global__ void q4_matmul_kernel
             {
                 int group = k / groupsize;
                 half2 w_scale = w_scales_.item_half2half2(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
+                uint32_t w_zero = (w_zeros_.item(group, w_column) + 1) & 0x0F;
 
                 if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
                 else                     acc = dot_product_8      (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
@@ -116,7 +120,7 @@ __global__ void q4_matmul_kernel
             {
                 int group = k / groupsize;
                 half w_scale = w_scales_.item(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
+                uint32_t w_zero = (w_zeros_.item(group, w_column) + 1) & 0x0F;
 
                 if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
                 else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
@@ -128,7 +132,7 @@ __global__ void q4_matmul_kernel
 
     if constexpr (use_half2)
     {
-        half result = __hadd(acc.x, acc.y);
+        half result = __hadd(__low2half(acc), __high2half(acc));
         atomicAdd(out_.item_ptr(x_row, w_column), result);
     }
     else
@@ -221,8 +225,8 @@ void q4_matmul_recons_cuda
     const int x_height,
     Q4Matrix* w,
     half* out,
-    const cublasHandle_t handle,
-    bool no_zero
+    bool no_zero,
+    const cublasHandle_t handle
 )
 {
     int height = x_height;
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh
index 63611790..4c7a6669 100644
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh
+++ b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh
@@ -19,8 +19,8 @@ void q4_matmul_cuda
     const int x_height,
     const Q4Matrix* w,
     half* out,
-    bool no_zero = false,
-    cudaStream_t alt_stream = NULL
+    bool no_zero,
+    cudaStream_t alt_stream
 );
 
 void q4_matmul_recons_cuda
@@ -30,8 +30,8 @@ void q4_matmul_recons_cuda
     const int x_height,
     Q4Matrix* w,
     half* out,
-    const cublasHandle_t handle,
-    bool no_zero = false
+    bool no_zero,
+    const cublasHandle_t handle
 );
 
 #endif
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu
index f3d1564f..1f32e6b8 100644
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu
+++ b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu
@@ -1,5 +1,6 @@
 // Adapted from turboderp exllama: https://github.com/turboderp/exllama
 
+#include <ATen/cuda/CUDAContext.h>
 #include "q4_matrix.cuh"
 #include <vector>
 #include "../util.cuh"
@@ -90,7 +91,7 @@ __global__ void make_sequential_kernel
         int w2_row_shift = w2_subrow << 2;
         int wnew2_row_shift = i << 2;
 
-        uint64_t src = w2[w2_row * w2_stride + w2_column];
+    uint64_t src = w2[w2_row * w2_stride + w2_column];
         src >>= w2_row_shift;
         src &= 0x0000000f0000000f;
         src <<= wnew2_row_shift;
@@ -146,7 +147,8 @@ void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
     dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
     dim3 blocks(width / UNSHUF_BLOCKSIZE_X / 2, height / 8, 1);
 
-    make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    make_sequential_kernel<<<blocks, threads, 0, stream>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
 
     // Replace qweights
 
@@ -189,7 +191,7 @@ __global__ void reconstruct_kernel
     int group = row / groupsize;
 
     half w_scale = w_scales_.item(group, column);
-    uint32_t w_zero = w_zeros_.item(group, column) + 1;
+    uint32_t w_zero = (w_zeros_.item(group, column) + 1) & 0x0F;
 
     uint32_t w_read = w_.item_uint32_t(row, column);
     half* out_ptr = out_.item_ptr(row, column);
@@ -213,5 +215,6 @@ void Q4Matrix::reconstruct(half* out)
         1
     );
 
-    reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
-}
\ No newline at end of file
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    reconstruct_kernel<<<blocks, threads, 0, stream>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
+}
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh
index 50cb72a4..49431dc9 100644
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh
+++ b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh
@@ -50,4 +50,4 @@ private:
 void g_q4_keep_matrix(Q4Matrix* m);
 void g_q4_free_matrices();
 
-#endif
\ No newline at end of file
+#endif
diff --git a/server/exllama_kernels/exllama_kernels/exllama_ext.cpp b/server/exllama_kernels/exllama_kernels/exllama_ext.cpp
index b786988b..f2df80e8 100644
--- a/server/exllama_kernels/exllama_kernels/exllama_ext.cpp
+++ b/server/exllama_kernels/exllama_kernels/exllama_ext.cpp
@@ -183,6 +183,7 @@ void q4_matmul
 
     int x_height = x.size(0);
 
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd)
     {
         q4_matmul_cuda
@@ -191,7 +192,9 @@ void q4_matmul
             (half*) x.data_ptr(),
             x_height,
             wm,
-            (half*) out.data_ptr()
+            (half*) out.data_ptr(),
+            false,
+            stream
         );
     }
     else
@@ -203,6 +206,7 @@ void q4_matmul
             x_height,
             wm,
             (half*) out.data_ptr(),
+            false,
             at::cuda::getCurrentCUDABlasHandle()
         );
     }
diff --git a/server/exllama_kernels/exllama_kernels/hip_compat.cuh b/server/exllama_kernels/exllama_kernels/hip_compat.cuh
new file mode 100644
index 00000000..f2a3dcad
--- /dev/null
+++ b/server/exllama_kernels/exllama_kernels/hip_compat.cuh
@@ -0,0 +1,52 @@
+// Adapted from turboderp exllama: https://github.com/turboderp/exllama
+
+#ifndef _hip_compat_cuh
+#define _hip_compat_cuh
+
+// Workaround for a bug in hipamd, backported from upstream; this is fixed in ROCm 5.6.
+__device__ __forceinline__ __half __compat_hrcp(__half x) {
+    return __half_raw{
+        static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
+}
+
+__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
+    return _Float16_2{
+        _Float16_2{static_cast<_Float16>(1.0f),
+            static_cast<_Float16>(1.0f)} / x.data};
+}
+
+#define hrcp __compat_hrcp
+#define h2rcp __compat_h2rcp
+
+// Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf.
+__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t    handle,
+                                                               hipblasOperation_t transA,
+                                                               hipblasOperation_t transB,
+                                                               int                m,
+                                                               int                n,
+                                                               int                k,
+                                                               const half*        alpha,
+                                                               const half*        AP,
+                                                               int                lda,
+                                                               const half*        BP,
+                                                               int                ldb,
+                                                               const half*        beta,
+                                                               half*              CP,
+                                                               int                ldc) {
+    return hipblasHgemm(handle, transA, transB, m, n, k,
+                        reinterpret_cast<const hipblasHalf *>(alpha),
+                        reinterpret_cast<const hipblasHalf *>(AP), lda,
+                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
+                        reinterpret_cast<const hipblasHalf *>(beta),
+                        reinterpret_cast<hipblasHalf *>(CP), ldc);
+}
+#define hipblasHgemm __compat_hipblasHgemm
+
+// Previous versions of PyTorch were converting to rocBLAS instead of hipBLAS.
+#define rocblas_handle hipblasHandle_t
+#define rocblas_operation_none HIPBLAS_OP_N
+#define rocblas_get_stream hipblasGetStream
+#define rocblas_set_stream hipblasSetStream
+#define rocblas_hgemm __compat_hipblasHgemm
+
+#endif
diff --git a/server/exllama_kernels/exllama_kernels/util.cuh b/server/exllama_kernels/exllama_kernels/util.cuh
index 2839b10f..7b397573 100644
--- a/server/exllama_kernels/exllama_kernels/util.cuh
+++ b/server/exllama_kernels/exllama_kernels/util.cuh
@@ -8,7 +8,11 @@
 #include <cstdint>
 #include <cstdio>
 
+#if defined(USE_ROCM)
+#define cudaUnspecified hipErrorUnknown
+#else
 #define cudaUnspecified cudaErrorApiFailureBase
+#endif
 
 // React to failure on return code != cudaSuccess
 
diff --git a/server/exllamav2_kernels/exllamav2_kernels/config.h b/server/exllamav2_kernels/exllamav2_kernels/config.h
new file mode 100644
index 00000000..32a1a37d
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/config.h
@@ -0,0 +1,15 @@
+#ifndef _config_h
+#define _config_h
+
+#define MAX_Q_GEMM_ROWS 50
+#define MAX_Q_GEMM_WEIGHTS 4  // must be <= MAX_Q_GEMM_ROWS
+
+#define QMODE_2BIT 1
+#define QMODE_3BIT 1
+#define QMODE_4BIT 1
+#define QMODE_5BIT 1
+#define QMODE_6BIT 0
+#define QMODE_8BIT 0
+
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h b/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h
new file mode 100644
index 00000000..919703a8
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h
@@ -0,0 +1,12 @@
+#ifndef _util_h
+#define _util_h
+
+#define DBGS(__x) printf("%s\n", __x)
+#define DBGI(__x) printf("%s: %i\n", #__x, __x)
+#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y)
+#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z)
+#define DBGF(__x) printf("%s: %f\n", #__x, __x)
+#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
+#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh
new file mode 100644
index 00000000..12684ff8
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh
@@ -0,0 +1,56 @@
+#ifndef _compat_cuh
+#define _compat_cuh
+
+// atomicAdd for half types, to support CC < 7.x
+
+__device__ __forceinline__ void atomicAdd_half(half* address, half val)
+{
+    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do
+    {
+        assumed = old;
+        __half_raw hsum;
+        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        half tmpres = __hadd(hsum, val);
+        hsum = __half_raw(tmpres);
+        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+        old = atomicCAS(address_as_ui, assumed, old);
+    }
+    while (assumed != old);
+}
+
+// atomicAdd for half2 types
+
+__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
+{
+    unsigned int* address_as_ui = (unsigned int*)address;
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+    do
+    {
+        assumed = old;
+        half2 old_val = *((half2*)&old);
+        half2 new_val = __hadd2(old_val, val);
+        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
+    }
+    while (assumed != old);
+}
+
+//
+
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
+
+__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+
+#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
+__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
+#endif
+
+#endif
+#endif
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh
new file mode 100644
index 00000000..a72bc7bc
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh
@@ -0,0 +1,121 @@
+#ifndef _matrix_view_cuh
+#define _matrix_view_cuh
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "quant/qdq_util.cuh"
+
+class MatrixView_half
+{
+public:
+    const half* data;
+    const int height;
+    const int width;
+
+    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
+        : data(data), height(height), width(width)
+    { }
+
+    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
+    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
+    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
+    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
+
+    __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const
+    {
+        half2* ptr = (half2*) item_ptr(row, column);
+        half2 i01 = ptr[0];
+        half2 i23 = ptr[1];
+        items[0] = __low2half(i01);
+        items[1] = __high2half(i01);
+        items[2] = __low2half(i23);
+        items[3] = __high2half(i23);
+    }
+    __device__ __forceinline__ void item4_f(float (&items)[4], int row, int column) const
+    {
+        half2* ptr = (half2*)item_ptr(row, column);
+        half2 i01 = ptr[0];
+        half2 i23 = ptr[1];
+        items[0] = __half2float(__low2half(i01));
+        items[1] = __half2float(__high2half(i01));
+        items[2] = __half2float(__low2half(i23));
+        items[3] = __half2float(__high2half(i23));
+    }
+
+    __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const
+    {
+        half2* ptr = (half2*)item_ptr(row, column);
+        half2 i01 = ptr[0];
+        half2 i23 = ptr[1];
+        items[0] = __half2half2(__low2half(i01));
+        items[1] = __half2half2(__high2half(i01));
+        items[2] = __half2half2(__low2half(i23));
+        items[3] = __half2half2(__high2half(i23));
+    }
+};
+
+class MatrixView_half_rw
+{
+public:
+    half* data;
+    const int height;
+    const int width;
+
+    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
+        : data(data), height(height), width(width)
+    { }
+
+    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
+    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
+    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
+    __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
+    __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }
+
+    __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3)
+    {
+        half2 v01 = __halves2half2(v0, v1);
+        half2 v23 = __halves2half2(v2, v3);
+        half2* ptr = (half2*) item_ptr(row, column);
+        ptr[0] = v01;
+        ptr[1] = v23;
+    }
+};
+
+class MatrixView_q4_row
+{
+public:
+    const uint32_t* data;
+    const int height;
+    const int width;
+
+    __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
+        : data(data), height(height), width(width)
+    { }
+
+    __device__ __forceinline__ int item(int row, int column) const
+    {
+        int shift = (column & 0x07) * 4;
+        return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
+    }
+
+    __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const
+    {
+        int shift = (column & 0x07) * 4;
+        uint32_t d = data[row * width / 8 + column / 8] >> shift;
+        items[0] = d & 0x0f;
+        items[1] = (d >> 4) & 0x0f;
+    }
+
+    __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const
+    {
+        int shift = (column & 0x07) * 4;
+        uint32_t d = data[row * width / 8 + column / 8] >> shift;
+        items[0] = d & 0x0f;
+        items[1] = (d >> 4) & 0x0f;
+        items[2] = (d >> 8) & 0x0f;
+        items[3] = (d >> 12) & 0x0f;
+    }
+};
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
new file mode 100644
index 00000000..5b99f1ba
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu
@@ -0,0 +1,220 @@
+#include "q_gemm.cuh"
+#include "util.cuh"
+#include "matrix_view.cuh"
+#include "../config.h"
+
+#include "quant/qdq_2.cuh"
+#include "quant/qdq_3.cuh"
+#include "quant/qdq_4.cuh"
+#include "quant/qdq_5.cuh"
+#include "quant/qdq_6.cuh"
+#include "quant/qdq_8.cuh"
+
+#define GPTQ_BLOCK_KN_SIZE 128
+#define GPTQ_BLOCK_M_SIZE_MAX 8
+#define GPTQ_MAX_GROUPS_IN_BLOCK (GPTQ_BLOCK_KN_SIZE / 32)
+
+#define EXL2_BLOCK_KN_SIZE 64
+#define EXL2_BLOCK_M_SIZE_MAX 8
+#define EXL2_MAX_GROUPS_IN_BLOCK (EXL2_BLOCK_KN_SIZE / 32)
+
+#define CLEAR_N_SIZE 256
+
+#include "q_gemm_kernel.cuh"
+#include "q_gemm_kernel_gptq.cuh"
+
+void gemm_half_q_half_cuda_part
+(
+    const half* a,
+    QMatrix* b,
+    half* c,
+    int size_m,
+    int size_n,
+    int size_k,
+    int m_count,
+    bool clear,
+    const half* r_weights,
+    int r_weights_stride,
+    bool mul_r_weights
+)
+{
+    // Launches a single quantized GEMM kernel on the current ATen CUDA stream. m_count sets how
+    // many rows each block covers (grid y = DIVIDE(size_m, m_count)) and picks the kernel
+    // instantiation; b->is_gptq decides between the GPTQ and the EXL2 kernel family.
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    const bool use_r_weights = (r_weights != NULL);
+
+    if (b->is_gptq)
+    {
+        // Grid x spans size_n in chunks of GPTQ_BLOCK_KN_SIZE * 4 columns, grid z spans size_k
+        const dim3 threads(GPTQ_BLOCK_KN_SIZE, 1, 1);
+        const dim3 blocks
+        (
+            DIVIDE(size_n, GPTQ_BLOCK_KN_SIZE * 4),
+            DIVIDE(size_m, m_count),
+            DIVIDE(size_k, GPTQ_BLOCK_KN_SIZE)
+        );
+
+        fp_gemm_half_q_half_gptq_kernel gptq_kernel = pick_gemm_half_q_half_gptq_kernel(m_count, use_r_weights, mul_r_weights);
+
+        gptq_kernel<<<blocks, threads, 0, stream>>>
+        (
+            a,
+            b->cuda_q_weight,
+            b->cuda_gptq_qzeros,
+            b->cuda_gptq_scales,
+            c,
+            size_m,
+            size_n,
+            size_k,
+            b->groups,
+            b->gptq_groupsize,
+            b->cuda_q_perm,
+            b->rows_4,
+            clear,
+            r_weights,
+            r_weights_stride
+        );
+        return;
+    }
+
+    // EXL2 path: same grid layout, sized by EXL2_BLOCK_KN_SIZE
+    const dim3 threads(EXL2_BLOCK_KN_SIZE, 1, 1);
+    const dim3 blocks
+    (
+        DIVIDE(size_n, EXL2_BLOCK_KN_SIZE * 4),
+        DIVIDE(size_m, m_count),
+        DIVIDE(size_k, EXL2_BLOCK_KN_SIZE)
+    );
+    fp_gemm_half_q_half_kernel exl2_kernel = pick_gemm_half_q_half_kernel(m_count, use_r_weights, mul_r_weights);
+
+    exl2_kernel<<<blocks, threads, 0, stream>>>
+    (
+        a,
+        b->cuda_q_weight,
+        b->cuda_q_scale,
+        b->cuda_q_scale_max,
+        c,
+        size_m,
+        size_n,
+        size_k,
+        b->groups,
+        b->cuda_q_group_map,
+        b->cuda_q_perm,
+        b->rows_8,
+        b->rows_6,
+        b->rows_5,
+        b->rows_4,
+        b->rows_3,
+        b->rows_2,
+        clear,
+        r_weights,
+        r_weights_stride
+    );
+}
+
+void gemm_half_q_half_cuda
+(
+    cublasHandle_t cublas_handle,
+    const half* a,
+    QMatrix* b,
+    half* c,
+    int size_m,
+    int size_n,
+    int size_k,
+    bool clear,
+    half* temp_dq,
+    bool force_cuda,
+    const half* r_weights,
+    const int r_weights_stride,
+    bool mul_r_weights
+)
+{
+    // Multiplies a (size_m x size_k, FP16) by the quantized matrix b into c (size_m x size_n).
+    // With clear == false the product is added onto c. Inputs taller than MAX_Q_GEMM_ROWS go
+    // through dequantization + cuBLAS unless force_cuda asks for the quantized kernels anyway.
+    const bool use_quantized_kernels = force_cuda || size_m <= MAX_Q_GEMM_ROWS;
+
+    if (use_quantized_kernels)
+    {
+        // Quantized matmul: whole blocks of block_rows rows go out in one launch; any remainder
+        // (fewer than block_rows rows) gets a second launch whose m_count matches it exactly
+        const int block_rows = b->is_gptq ? GPTQ_BLOCK_M_SIZE_MAX : EXL2_BLOCK_M_SIZE_MAX;
+        const int full_rows = (size_m / block_rows) * block_rows;
+        const int tail_rows = size_m - full_rows;
+
+        if (full_rows != 0)
+            gemm_half_q_half_cuda_part(a, b, c, full_rows, size_n, size_k, block_rows, clear, r_weights, r_weights_stride, mul_r_weights);
+
+        if (tail_rows != 0)
+            gemm_half_q_half_cuda_part(a + full_rows * size_k, b, c + full_rows * size_n, tail_rows, size_n, size_k, tail_rows, clear, r_weights, r_weights_stride, mul_r_weights);
+        return;
+    }
+
+    // Reconstruct FP16 matrix, then cuBLAS. Without a caller-supplied buffer, b's own is used.
+    half* dq_buffer = temp_dq;
+    if (dq_buffer == NULL) dq_buffer = b->temp_dq;
+    b->reconstruct(dq_buffer);
+
+    //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH);
+
+    // cuBLAS works column-major, so the row-major product is issued with the operands swapped
+    const half alpha = __float2half(1.0f);
+    const half beta = __float2half(clear ? 0.0f : 1.0f);
+    cublasHgemm(cublas_handle,
+                CUBLAS_OP_N,
+                CUBLAS_OP_N,
+                size_n, size_m, size_k,
+                &alpha, dq_buffer, size_n,
+                        a,         size_k,
+                &beta,  c,         size_n);
+
+    //const float alpha = 1.0f;
+    //const float beta = clear ? 0.0f : 1.0f;
+    //cublasSgemmEx(cublas_handle,
+    //             CUBLAS_OP_N,
+    //             CUBLAS_OP_N,
+    //             size_n, size_m, size_k,
+    //             &alpha, temp_dq, CUDA_R_16F, size_n,
+    //                     a,       CUDA_R_16F, size_k,
+    //             &beta,  c,       CUDA_R_16F, size_n);
+
+    //const float alpha = 1.0f;
+    //const float beta = clear ? 0.0f : 1.0f;
+    //cublasGemmEx(cublas_handle,
+    //             CUBLAS_OP_N, CUBLAS_OP_N,
+    //             size_n, size_m, size_k,
+    //             &alpha, temp_dq, CUDA_R_16F, size_n,
+    //                     a,       CUDA_R_16F, size_k,
+    //             &beta,  c,       CUDA_R_16F, size_n,
+    //             CUDA_R_16F, CUBLAS_GEMM_DFALT_TENSOR_OP);
+}
+
+__global__ void clear_kernel
+(
+    half* __restrict__ c,   // indexed as rows of size_n halves
+    const int size_m,       // not read by the kernel; the row comes from blockIdx.y
+    const int size_n
+)
+{
+    const int row = blockIdx.y;
+    const int col = (blockIdx.x * CLEAR_N_SIZE + threadIdx.x) * 8;   // first of the 8 halves this thread owns
+    if (col >= size_n) return;
+    int4* const dst = reinterpret_cast<int4*>(c + row * size_n + col);
+    *dst = {};   // a single 16-byte store of zeros covers all 8 halves
+}
+
+void clear_tensor_cuda
+(
+    half* c,
+    int size_m,
+    int size_n
+)
+{   // NOTE: currently a no-op -- the clear_kernel launch below is commented out, so c is left untouched
+//     dim3 blockDim, gridDim;
+//     blockDim.x = CLEAR_N_SIZE;
+//     blockDim.y = 1;
+//     gridDim.x = DIVIDE(size_n / 8, CLEAR_N_SIZE);
+//     gridDim.y = size_m;
+//     clear_kernel<<<gridDim, blockDim>>>(c, size_m, size_n);
+}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh
new file mode 100644
index 00000000..e49457f3
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh
@@ -0,0 +1,36 @@
+#ifndef _q_gemm_cuh
+#define _q_gemm_cuh
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+#include <cstdio>
+#include <ATen/cuda/CUDAContext.h>
+
+#include "q_matrix.cuh"
+
+void gemm_half_q_half_cuda   // FP16 a (size_m x size_k) times quantized b, written or accumulated into c (size_m x size_n)
+(
+    cublasHandle_t cublas_handle,       // used only when the dequantize + cuBLAS path is taken
+    const half* a,
+    QMatrix* b,
+    half* c,
+    int size_m,
+    int size_n,
+    int size_k,
+    bool clear = false,                 // false: add the product onto the existing contents of c
+    half* temp_dq = NULL,               // buffer for the dequantized b on the cuBLAS path; NULL selects b->temp_dq
+    bool force_cuda = false,            // true: always use the quantized kernels, whatever size_m is
+    const half* r_weights = NULL,       // optional per-row weights forwarded to the quantized kernels
+    const int r_weights_stride = 0,     // element stride between consecutive entries of r_weights
+    bool mul_r_weights = false          // forwarded to kernel selection (multiply outputs by the row weight)
+);
+
+void clear_tensor_cuda   // NOTE: the definition in q_gemm.cu has its kernel launch commented out, so this call does no work
+(
+    half* c,
+    int size_m,
+    int size_n
+);
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh
new file mode 100644
index 00000000..9cd2ba01
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh
@@ -0,0 +1,580 @@
+#include "compat.cuh"
+
+__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h)
+{
+    const half2* a_pairs = (const half2*)a_ptr;
+    half2 acc = {};   // per-lane running sum of dq[p] * a_pairs[p] over 4 half2 pairs
+    #pragma unroll
+    for (int p = 0; p < 4; ++p) acc = __hfma2(dq[p], a_pairs[p], acc);
+    return __hfma2(acc, __halves2half2(qs_h, qs_h), g_result);   // acc * qs_h + g_result in each lane
+}
+
+__forceinline__ __device__ half2 dot22_16(half2(&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h)
+{
+    const half2* a_pairs = (const half2*)a_ptr;
+    half2 acc = {};   // per-lane running sum of dq[p] * a_pairs[p] over 8 half2 pairs
+    #pragma unroll
+    for (int p = 0; p < 8; ++p) acc = __hfma2(dq[p], a_pairs[p], acc);
+    return __hfma2(acc, __halves2half2(qs_h, qs_h), g_result);   // acc * qs_h + g_result in each lane
+}
+
+__forceinline__ __device__ half2 dot22_32(half2(&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h)
+{
+    const half2* a_pairs = (const half2*)a_ptr;
+    half2 acc = {};   // per-lane running sum of dq[p] * a_pairs[p] over 16 half2 pairs
+    #pragma unroll
+    for (int p = 0; p < 16; ++p) acc = __hfma2(dq[p], a_pairs[p], acc);
+    return __hfma2(acc, __halves2half2(qs_h, qs_h), g_result);   // acc * qs_h + g_result in each lane
+}
+
+__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr, const float g_result, const float qs_f)
+{
+    const half2* a_pairs = (const half2*)a_ptr;
+    half2 acc = {};   // per-lane FP16 running sum of dq[p] * a_pairs[p] over 4 half2 pairs
+    #pragma unroll
+    for (int p = 0; p < 4; ++p) acc = __hfma2(dq[p], a_pairs[p], acc);
+    const float lanes_sum = __half2float(__low2half(acc)) + __half2float(__high2half(acc));   // both lanes combined in FP32
+    return fma(lanes_sum, qs_f, g_result);   // lanes_sum * qs_f + g_result
+}
+
+__forceinline__ __device__ float dot22_16_f(half2(&dq)[8], const half* a_ptr, const float g_result, const float qs_f)
+{
+    const half2* a_pairs = (const half2*)a_ptr;
+    half2 acc = {};   // per-lane FP16 running sum of dq[p] * a_pairs[p] over 8 half2 pairs
+    #pragma unroll
+    for (int p = 0; p < 8; ++p) acc = __hfma2(dq[p], a_pairs[p], acc);
+    const float lanes_sum = __half2float(__low2half(acc)) + __half2float(__high2half(acc));   // both lanes combined in FP32
+    return fma(lanes_sum, qs_f, g_result);   // lanes_sum * qs_f + g_result
+}
+
+__forceinline__ __device__ float dot22_32_f(half2(&dq)[16], const half* a_ptr, const float g_result, const float qs_f)
+{
+    const half2* a_pairs = (const half2*)a_ptr;
+    half2 acc = {};   // per-lane FP16 running sum of dq[p] * a_pairs[p] over 16 half2 pairs
+    #pragma unroll
+    for (int p = 0; p < 16; ++p) acc = __hfma2(dq[p], a_pairs[p], acc);
+    const float lanes_sum = __half2float(__low2half(acc)) + __half2float(__high2half(acc));   // both lanes combined in FP32
+    return fma(lanes_sum, qs_f, g_result);   // lanes_sum * qs_f + g_result
+}
+
+__forceinline__ __device__ half dot22_8_h(half2(&dq)[4], const half* a_ptr, const half g_result, const half qs_h)
+{
+    // Accumulate in FP32 to avoid potential overflow: the unscaled weights are in the range
+    // -128..127. Eight products are summed, one pair of values per dq entry.
+
+    float acc = 0.0f;
+    #pragma unroll
+    for (int p = 0; p < 4; ++p)
+    {
+        const half2 w_pair = dq[p];
+        const float x_lo = __half2float(a_ptr[2 * p]);
+        const float x_hi = __half2float(a_ptr[2 * p + 1]);
+        acc = fma(__low2float(w_pair), x_lo, acc);
+        acc = fma(__high2float(w_pair), x_hi, acc);
+    }
+
+    // Apply the scale in FP32, round to half once, then add onto the incoming half sum
+    const float scaled = acc * __half2float(qs_h);
+    const half scaled_h = __float2half_rn(scaled);
+    return __hadd(scaled_h, g_result);
+}
+
+__forceinline__ __device__ half dot22_16_h(half2(&dq)[8], const half* a_ptr, const half g_result, const half qs_h)
+{
+    const half2* a_pairs = (const half2*)a_ptr;
+    half2 acc = {};   // per-lane FP16 running sum of dq[p] * a_pairs[p] over 8 half2 pairs
+    #pragma unroll
+    for (int p = 0; p < 8; ++p) acc = __hfma2(dq[p], a_pairs[p], acc);
+    const half lanes_sum = __hadd(__low2half(acc), __high2half(acc));   // both lanes combined in FP16
+    return __hfma(lanes_sum, qs_h, g_result);   // lanes_sum * qs_h + g_result
+}
+
+__forceinline__ __device__ half dot22_32_h(half2(&dq)[16], const half* a_ptr, const half g_result, const half qs_h)
+{
+    const half2* a_pairs = (const half2*)a_ptr;
+    half2 acc = {};   // per-lane FP16 running sum of dq[p] * a_pairs[p] over 16 half2 pairs
+    #pragma unroll
+    for (int p = 0; p < 16; ++p) acc = __hfma2(dq[p], a_pairs[p], acc);
+    const half lanes_sum = __hadd(__low2half(acc), __high2half(acc));   // both lanes combined in FP16
+    return __hfma(lanes_sum, qs_h, g_result);   // lanes_sum * qs_h + g_result
+}
+
+
+using fp_gemm_half_q_half_kernel = void (*)   // pointer type for gemm_half_q_half_kernel<...> instantiations
+(
+    const half* a,
+    const uint32_t* b_q_weight,
+    const uint32_t* b_q_scale,
+    const half* b_q_scale_max,
+    half* c,
+    const int size_m,
+    const int size_n,
+    const int size_k,
+    const int groups,
+    const uint16_t* b_q_group_map,
+    const uint16_t* b_q_perm,
+    const int rows_8,
+    const int rows_6,
+    const int rows_5,
+    const int rows_4,
+    const int rows_3,
+    const int rows_2,
+    const bool clear,
+    const half* r_weights,
+    const int r_weights_stride
+);
+
+template <int m_count, bool use_r_weights, bool mul_r_weights>   // m_count: rows of a per block; use_r_weights: load per-row weights; mul_r_weights: scale outputs by them
+__global__ void gemm_half_q_half_kernel   // adds a * dequantized(b) into c via atomicAdd; blockIdx.z slices the k dimension
+(
+    const half*      __restrict__ a,   // wrapped below as a size_m x size_k view
+    const uint32_t*  __restrict__ b_q_weight,   // packed quantized weights, fetched as int4 (four uint32 words) per thread
+    const uint32_t*  __restrict__ b_q_scale,   // 4-bit packed scales, wrapped below as a groups x size_n view
+    const half*      __restrict__ b_q_scale_max,   // one multiplier per group, indexed by group number
+    half*            __restrict__ c,   // wrapped below as a size_m x size_n read/write view
+    const int size_m,
+    const int size_n,
+    const int size_k,
+    const int groups,
+    const uint16_t* __restrict__ b_q_group_map,   // two entries per k-row: [2k] = group index, [2k + 1] = value added to k to reach the next group boundary
+    const uint16_t* __restrict__ b_q_perm,   // for packed position k, the column of a to read
+    const int rows_8,   // rows_N: end of the k-range stored at N bits; the ranges are walked in the order 8, 6, 5, 4, 3, 2
+    const int rows_6,
+    const int rows_5,
+    const int rows_4,
+    const int rows_3,
+    const int rows_2,
+    const bool clear,   // zero this block's outputs before accumulating (performed by the blockIdx.z == 0 slice only)
+    const half* r_weights,   // per-row weights, read only when use_r_weights is set
+    const int r_weights_stride   // element stride between consecutive row weights
+)
+{
+    MatrixView_half a_(a, size_m, size_k);
+    MatrixView_half_rw c_(c, size_m, size_n);
+    MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n);
+
+    int t = threadIdx.x;
+
+    // Block: the tile of the problem covered by this thread block
+
+    int offset_n = blockIdx.x * EXL2_BLOCK_KN_SIZE * 4;
+    int offset_m = blockIdx.y * m_count;
+    int offset_k = blockIdx.z * EXL2_BLOCK_KN_SIZE;
+
+    int end_n = min(offset_n + EXL2_BLOCK_KN_SIZE * 4, size_n);   // not referenced again in this kernel
+    int end_m = min(offset_m + m_count, size_m);   // not referenced again in this kernel
+    int end_k = min(offset_k + EXL2_BLOCK_KN_SIZE, size_k);
+    int n = offset_n + t * 4;   // first of the 4 consecutive output columns owned by this thread
+
+    // Read weights (one per row of the block)
+
+    half_uint16 weights[MAX_Q_GEMM_WEIGHTS];   // accessed below through .as_half and .as_uint16
+    if constexpr (use_r_weights)
+    {
+        uint16_t any_w = 0;
+        const half* w_ptr = r_weights;   // NOTE(review): starts at r_weights for every block, with no offset_m term -- confirm callers only use weights with a single row block
+        for (int m = 0; m < m_count; ++m)
+        {
+            weights[m].as_half = *w_ptr;
+            w_ptr += r_weights_stride;
+            any_w |= weights[m].as_uint16;   // bitwise test: only an all-zero bit pattern counts as zero
+        }
+        if (!any_w) return;  // Early exit if all weights are zero -- does not zero output (!!!)
+    }
+
+    // Preload block_a: each thread gathers one k position, through b_q_perm, for every row of the block
+
+    __shared__ half block_a[m_count][EXL2_BLOCK_KN_SIZE];
+
+    if (offset_k + t < end_k)
+    {
+        for (int m = 0; m < m_count; ++m)
+        {
+            const half* a_ptr = a_.item_ptr(offset_m + m, 0);   // no bound check on the row; the launcher in q_gemm.cu passes size_m as a multiple of m_count
+            half* block_a_ptr = block_a[m];
+            half a0 = a_ptr[b_q_perm[offset_k + t]];
+//            half a0 = a_ptr[offset_k + t];
+            block_a_ptr[t] = a0;
+        }
+    }
+
+    // Clear
+
+    if (n >= size_n) return;   // threads past the last column leave here, before the barrier below
+
+    if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0)
+    {
+        for (int m = 0; m < m_count; m++)
+            *((uint64_t*) c_.item_ptr(offset_m + m, n)) = 0;   // one 8-byte store zeroes the thread's 4 halves
+    }
+    // NOTE(review): other z-slices atomicAdd into the same outputs and nothing visible here orders them after this clear -- confirm
+    __syncthreads();
+
+    // Find initial group
+
+    //int group = offset_k / groupsize;
+    int group = b_q_group_map[offset_k * 2];
+
+//    if (offset_m == 0 && t == 0)
+//        DBGI2(offset_k, group);
+
+    // Preload scales: for each group met in [offset_k, end_k), scale = (q + 1)^2 * b_q_scale_max[group], per column
+
+    half scales[EXL2_MAX_GROUPS_IN_BLOCK][4];
+
+    //int groups_in_block = DIVIDE((end_k - offset_k), groupsize);
+    int temp_k = offset_k;
+    for (int g = 0; temp_k < end_k; g++)   // NOTE(review): g is not checked against EXL2_MAX_GROUPS_IN_BLOCK; assumes no more groups than that intersect the block
+    {
+        int qscales[4];
+        b_q_scale_.item4(qscales, group + g, n);
+        qscales[0]++;
+        qscales[1]++;
+        qscales[2]++;
+        qscales[3]++;
+        half maxscale = b_q_scale_max[group + g];
+        scales[g][0] = __hmul(__int2half_rn(qscales[0] * qscales[0]), maxscale);
+        scales[g][1] = __hmul(__int2half_rn(qscales[1] * qscales[1]), maxscale);
+        scales[g][2] = __hmul(__int2half_rn(qscales[2] * qscales[2]), maxscale);
+        scales[g][3] = __hmul(__int2half_rn(qscales[3] * qscales[3]), maxscale);
+        temp_k += b_q_group_map[temp_k * 2 + 1];
+    }
+
+    // a, b offset: count the k-rows of each bit width that precede offset_k, then convert to rows of packed words
+
+    int pre_rows_8 = min(rows_8, offset_k);
+    int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0;
+    int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0;
+    int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0;
+    int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0;
+    int pre_rows_2 = offset_k > rows_3 ? min(rows_2, offset_k) - rows_3 : 0;
+    int qk = 0;   // every 32 k-rows stored at N bits take N rows of uint32 words
+    qk += pre_rows_8 / 32 * 8;
+    qk += pre_rows_6 / 32 * 6;
+    qk += pre_rows_5 / 32 * 5;
+    qk += pre_rows_4 / 32 * 4;
+    qk += pre_rows_3 / 32 * 3;
+    qk += pre_rows_2 / 32 * 2;
+
+    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;   // advanced by size_n after every int4 load below
+    const half* a_ptr = &block_a[0][0];   // row m of the block is read at a_ptr + m * a_stride
+    int a_stride = EXL2_BLOCK_KN_SIZE;
+
+    // Initial group
+
+    int scales_idx = 0;
+    half qs_h0 = scales[scales_idx][0];
+    half qs_h1 = scales[scales_idx][1];
+    half qs_h2 = scales[scales_idx][2];
+    half qs_h3 = scales[scales_idx][3];
+    int nextgroup = offset_k + b_q_group_map[offset_k * 2 + 1];
+
+    // Column result: per-thread partial sums for m_count rows x 4 columns
+
+    half block_c[m_count][4] = {};
+
+    // Dequantize groups: one loop per bit width, each switching to the next preloaded scales when k reaches nextgroup
+
+    int k = offset_k;
+
+    while (k < rows_8 && k < end_k)   // 8-bit range
+    {
+        if (k == nextgroup)
+        {
+            group++;
+            scales_idx++;
+            qs_h0 = scales[scales_idx][0];
+            qs_h1 = scales[scales_idx][1];
+            qs_h2 = scales[scales_idx][2];
+            qs_h3 = scales[scales_idx][3];
+            nextgroup += b_q_group_map[k * 2 + 1];
+        }
+
+        #pragma unroll
+        for (int j = 0; j < 4; j++)   // 4 steps of 8 k-rows each
+        {
+            int4 load_int4[2];
+            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
+
+            half2 dq[4][4];   // dq[col]: 8 dequantized weights of one column; .x/.y/.z/.w select the thread's 4 columns
+            dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n);
+            dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n);
+            dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n);
+            dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n);
+
+            for (int m = 0; m < m_count; m++)
+            {
+                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }   // rows with an all-zero weight are skipped
+                block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
+                block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
+                block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
+                block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
+            }
+            a_ptr += 8;
+        }
+        k += 32;
+    }
+
+    while (k < rows_6 && k < end_k)   // 6-bit range
+    {
+        if (k == nextgroup)
+        {
+            group++;
+            scales_idx++;
+            qs_h0 = scales[scales_idx][0];
+            qs_h1 = scales[scales_idx][1];
+            qs_h2 = scales[scales_idx][2];
+            qs_h3 = scales[scales_idx][3];
+            nextgroup += b_q_group_map[k * 2 + 1];
+        }
+
+        #pragma unroll
+        for (int j = 0; j < 2; j++)   // 2 steps of 16 k-rows each
+        {
+            int4 load_int4[3];
+            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;
+
+            half2 dq[4][8];   // 16 dequantized weights per column
+            dequant_6bit_16(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n);
+            dequant_6bit_16(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n);
+            dequant_6bit_16(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n);
+            dequant_6bit_16(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n);
+
+            for (int m = 0; m < m_count; m++)
+            {
+                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
+                block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
+                block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
+                block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
+                block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
+            }
+            a_ptr += 16;
+        }
+        k += 32;
+    }
+
+    while (k < rows_5 && k < end_k)   // 5-bit range
+    {
+        if (k == nextgroup)
+        {
+            group++;
+            scales_idx++;
+            qs_h0 = scales[scales_idx][0];
+            qs_h1 = scales[scales_idx][1];
+            qs_h2 = scales[scales_idx][2];
+            qs_h3 = scales[scales_idx][3];
+            nextgroup += b_q_group_map[k * 2 + 1];
+        }
+
+        #pragma unroll
+        for (int j = 0; j < 1; j++)   // a single step of 32 k-rows
+        {
+            int4 load_int4[5];
+            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[3] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[4] = *((int4*) b_ptr); b_ptr += size_n;
+
+            half2 dq[4][16];   // 32 dequantized weights per column
+            dequant_5bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, load_int4[3].x, load_int4[4].x, dq[0], size_n);
+            dequant_5bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, load_int4[3].y, load_int4[4].y, dq[1], size_n);
+            dequant_5bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, load_int4[3].z, load_int4[4].z, dq[2], size_n);
+            dequant_5bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, load_int4[3].w, load_int4[4].w, dq[3], size_n);
+
+            for (int m = 0; m < m_count; m++)
+            {
+                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
+                block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
+                block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
+                block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
+                block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
+            }
+            a_ptr += 32;
+        }
+
+        k += 32;
+    }
+
+    while (k < rows_4 && k < end_k)   // 4-bit range
+    {
+        if (k == nextgroup)
+        {
+            group++;
+            scales_idx++;
+            qs_h0 = scales[scales_idx][0];
+            qs_h1 = scales[scales_idx][1];
+            qs_h2 = scales[scales_idx][2];
+            qs_h3 = scales[scales_idx][3];
+            nextgroup += b_q_group_map[k * 2 + 1];
+        }
+
+        #pragma unroll
+        for (int j = 0; j < 4; j++)   // 4 steps of 8 k-rows each
+        {
+            int4 load_int4[1];
+            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
+
+            half2 dq[4][4];   // 8 dequantized weights per column
+            dequant_4bit_8(load_int4[0].x, dq[0], size_n);
+            dequant_4bit_8(load_int4[0].y, dq[1], size_n);
+            dequant_4bit_8(load_int4[0].z, dq[2], size_n);
+            dequant_4bit_8(load_int4[0].w, dq[3], size_n);
+
+            for (int m = 0; m < m_count; m++)
+            {
+                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
+                block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
+                block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
+                block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
+                block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
+            }
+            a_ptr += 8;
+        }
+        k += 32;
+    }
+
+    while (k < rows_3 && k < end_k)   // 3-bit range
+    {
+        if (k == nextgroup)
+        {
+            group++;
+            scales_idx++;
+            qs_h0 = scales[scales_idx][0];
+            qs_h1 = scales[scales_idx][1];
+            qs_h2 = scales[scales_idx][2];
+            qs_h3 = scales[scales_idx][3];
+            nextgroup += b_q_group_map[k * 2 + 1];
+        }
+
+        #pragma unroll
+        for (int j = 0; j < 1; j++)   // a single step of 32 k-rows
+        {
+            int4 load_int4[3];
+            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[1] = *((int4*) b_ptr); b_ptr += size_n;
+            load_int4[2] = *((int4*) b_ptr); b_ptr += size_n;
+
+            half2 dq[4][16];   // 32 dequantized weights per column
+            dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n);
+            dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n);
+            dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n);
+            dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n);
+
+            for (int m = 0; m < m_count; m++)
+            {
+                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
+                block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
+                block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
+                block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
+                block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
+            }
+            a_ptr += 32;
+        }
+        k += 32;
+    }
+
+    while (k < rows_2 && k < end_k)   // 2-bit range
+    {
+        if (k == nextgroup)
+        {
+            group++;
+            scales_idx++;
+            qs_h0 = scales[scales_idx][0];
+            qs_h1 = scales[scales_idx][1];
+            qs_h2 = scales[scales_idx][2];
+            qs_h3 = scales[scales_idx][3];
+            nextgroup += b_q_group_map[k * 2 + 1];
+        }
+
+        #pragma unroll
+        for (int j = 0; j < 1; j++)   // a single step of 16 k-rows
+        {
+            int4 load_int4[1];
+            load_int4[0] = *((int4*) b_ptr); b_ptr += size_n;
+
+            half2 dq[4][8];   // 16 dequantized weights per column
+            dequant_2bit_16(load_int4[0].x, dq[0], size_n);
+            dequant_2bit_16(load_int4[0].y, dq[1], size_n);
+            dequant_2bit_16(load_int4[0].z, dq[2], size_n);
+            dequant_2bit_16(load_int4[0].w, dq[3], size_n);
+
+            for (int m = 0; m < m_count; m++)
+            {
+                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }
+                block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_h0);
+                block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_h1);
+                block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_h2);
+                block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_h3);
+            }
+
+            a_ptr += 16;
+        }
+        k += 16;   // this loop advances 16 k-rows per iteration, unlike the 32 of the loops above
+    }
+
+    // Accumulate column sums in c: atomic because every blockIdx.z slice adds into the same outputs
+
+    for (int m = 0; m < m_count; m++)
+    {
+        half2* out = (half2*)c_.item_ptr(offset_m + m, n);
+        half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]);
+        half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]);
+
+        if constexpr (mul_r_weights)
+        {
+            half2 w_mul2 = __half2half2(weights[m].as_half);   // weights[] is only filled when use_r_weights is also set
+            result01 = __hmul2(result01, w_mul2);
+            result23 = __hmul2(result23, w_mul2);
+        }
+
+        atomicAdd(out    , result01);
+        atomicAdd(out + 1, result23);
+//        *out = result01;
+//        *(out + 1) = result23;
+    }
+}
+
+template <bool use_r_weights, bool mul_r_weights>   // the two flags are fixed at compile time; only m_count is chosen at run time
+struct map_m_count_exl2 {
+    static constexpr fp_gemm_half_q_half_kernel pick_gemm_half_q_half_kernel(const int m_count)
+    {   // Returns the kernel instantiated for m_count rows; the #if guards skip instantiations above EXL2_BLOCK_M_SIZE_MAX
+        #if EXL2_BLOCK_M_SIZE_MAX >= 1
+        if (m_count == 1) return gemm_half_q_half_kernel<1, use_r_weights, mul_r_weights>;
+        #endif
+        #if EXL2_BLOCK_M_SIZE_MAX >= 2
+        if (m_count == 2) return gemm_half_q_half_kernel<2, use_r_weights, mul_r_weights>;
+        #endif
+        #if EXL2_BLOCK_M_SIZE_MAX >= 3
+        if (m_count == 3) return gemm_half_q_half_kernel<3, use_r_weights, mul_r_weights>;
+        #endif
+        #if EXL2_BLOCK_M_SIZE_MAX >= 4
+        if (m_count == 4) return gemm_half_q_half_kernel<4, use_r_weights, mul_r_weights>;
+        #endif
+        #if EXL2_BLOCK_M_SIZE_MAX >= 5
+        if (m_count == 5) return gemm_half_q_half_kernel<5, use_r_weights, mul_r_weights>;
+        #endif
+        #if EXL2_BLOCK_M_SIZE_MAX >= 6
+        if (m_count == 6) return gemm_half_q_half_kernel<6, use_r_weights, mul_r_weights>;
+        #endif
+        #if EXL2_BLOCK_M_SIZE_MAX >= 7
+        if (m_count == 7) return gemm_half_q_half_kernel<7, use_r_weights, mul_r_weights>;
+        #endif
+        #if EXL2_BLOCK_M_SIZE_MAX >= 8
+        if (m_count == 8) return gemm_half_q_half_kernel<8, use_r_weights, mul_r_weights>;
+        #endif
+        return NULL;   // m_count outside 1..8, or above EXL2_BLOCK_M_SIZE_MAX: no matching kernel
+    }
+};
+
+fp_gemm_half_q_half_kernel pick_gemm_half_q_half_kernel(const int m_count, bool r_weights, bool mul_r_weights)
+{
+    if (r_weights)   // translate the two runtime flags into the matching template specialization
+        return mul_r_weights ? map_m_count_exl2< true,  true>::pick_gemm_half_q_half_kernel(m_count)
+                             : map_m_count_exl2< true, false>::pick_gemm_half_q_half_kernel(m_count);
+    return mul_r_weights ? map_m_count_exl2<false,  true>::pick_gemm_half_q_half_kernel(m_count)
+                         : map_m_count_exl2<false, false>::pick_gemm_half_q_half_kernel(m_count);
+}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh
new file mode 100644
index 00000000..f816fd9d
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh
@@ -0,0 +1,273 @@
+#include "compat.cuh"
+
+__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result)
+{
+    // Four packed FMAs of dq[i] against a_ptr read as half2 pairs (8 halves), then g_result is added lane-wise.
+    half2 acc = {};
+    #pragma unroll
+    for (int i = 0; i < 4; ++i) acc = __hfma2(dq[i], ((const half2*)a_ptr)[i], acc);
+    return __hadd2(acc, g_result);
+}
+
+__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr)
+{
+    // Four packed FMAs of dq[i] against a_ptr read as half2 pairs; both lanes are widened to float and summed.
+    half2 acc = {};
+    #pragma unroll
+    for (int i = 0; i < 4; ++i) acc = __hfma2(dq[i], ((const half2*)a_ptr)[i], acc);
+    return __half2float(__low2half(acc)) + __half2float(__high2half(acc));
+}
+
+__forceinline__ __device__ half2 dot22_8_h2(half2(&dq)[4], const half* a_ptr)
+{
+    // Four packed FMAs of dq[i] against a_ptr read as half2 pairs; the two lanes are returned unreduced.
+    half2 acc = {};
+    #pragma unroll
+    for (int i = 0; i < 4; ++i) acc = __hfma2(dq[i], ((const half2*)a_ptr)[i], acc);
+    return acc;
+}
+
+typedef void (*fp_gemm_half_q_half_gptq_kernel)    // pointer to one gemm_half_q_half_gptq_kernel<...> instantiation
+(
+    const half*,        // a
+    const uint32_t*,    // b_q_weight
+    const uint32_t*,    // b_gptq_qzeros
+    const half*,        // b_gptq_scales
+    half*,              // c
+    const int,          // size_m
+    const int,          // size_n
+    const int,          // size_k
+    const int,          // groups
+    const int,          // groupsize
+    const uint16_t*,    // b_q_perm
+    const int,          // rows_4
+    const bool,         // clear
+    const half*,        // r_weights
+    const int           // r_weights_stride
+);
+
+template <int m_count, bool use_r_weights, bool mul_r_weights>
+__global__ void gemm_half_q_half_gptq_kernel    // adds a * dequant(b) for one (n, m, k) tile of a 4-bit GPTQ matrix into c
+(
+    const half* __restrict__ a,                     // activations, viewed as size_m x size_k
+    const uint32_t* __restrict__ b_q_weight,        // packed 4-bit weights, eight k-rows per uint32
+    const uint32_t* __restrict__ b_gptq_qzeros,     // packed zero points, viewed as groups x size_n
+    const half* __restrict__ b_gptq_scales,         // scales, viewed as groups x size_n
+    half* __restrict__ c,                           // output, viewed as size_m x size_n; updated with atomicAdd
+    const int size_m,
+    const int size_n,
+    const int size_k,
+    const int groups,
+    const int groupsize,                            // k-rows covered by one zero/scale row
+    const uint16_t* __restrict__ b_q_perm,          // optional index table applied when reading a; may be NULL
+    const int rows_4,                               // not referenced in this kernel body
+    const bool clear,                               // when set, blocks with blockIdx.z == 0 zero their output tile
+    const half* r_weights,                          // per-row weights; dereferenced only when use_r_weights
+    const int r_weights_stride                      // element stride between consecutive row weights
+)
+{
+    MatrixView_half a_(a, size_m, size_k);
+    MatrixView_half_rw c_(c, size_m, size_n);
+    MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+    MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+    int t = threadIdx.x;
+
+    // Block: blockIdx.x picks the column tile, blockIdx.y the m_count-row tile, blockIdx.z the k slice
+
+    int offset_n = blockIdx.x * GPTQ_BLOCK_KN_SIZE * 4;
+    int offset_m = blockIdx.y * m_count;
+    int offset_k = blockIdx.z * GPTQ_BLOCK_KN_SIZE;
+
+    int end_n = min(offset_n + GPTQ_BLOCK_KN_SIZE * 4, size_n);    // computed but not used below
+    int end_m = min(offset_m + m_count, size_m);                   // computed but not used below
+    int end_k = min(offset_k + GPTQ_BLOCK_KN_SIZE, size_k);
+
+    int n = offset_n + t * 4;                       // first of the four adjacent columns handled by this thread
+
+    // Read weights: m_count values starting at r_weights itself (independent of offset_m), r_weights_stride apart
+
+    half_uint16 weights[MAX_Q_GEMM_WEIGHTS];
+    if constexpr (use_r_weights)
+    {
+        uint16_t any_w = 0;                         // OR of the raw 16-bit patterns of all row weights
+        const half* w_ptr = r_weights;
+        for (int m = 0; m < m_count; ++m)
+        {
+            weights[m].as_half = *w_ptr;
+            w_ptr += r_weights_stride;
+            any_w |= weights[m].as_uint16;
+        }
+        if (!any_w) return;  // Early exit if all weights are zero -- does not zero output (!!!)
+    }
+
+    // Preload block_a: thread t stages element offset_k + t of each row, gathered through b_q_perm when present
+
+    __shared__ half block_a[m_count][GPTQ_BLOCK_KN_SIZE];
+
+    if (offset_k + t < end_k)
+    {
+        for (int m = 0; m < m_count; ++m)
+        {
+            // NOTE(review): row offset_m + m is not checked against size_m; presumably the launcher only uses full row tiles -- confirm
+            const half* a_ptr = a_.item_ptr(offset_m + m, 0);
+            half* block_a_ptr = block_a[m];
+
+            half a0;
+            if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]];
+            else a0 = a_ptr[offset_k + t];
+            block_a_ptr[t] = a0;
+        }
+    }
+
+    // Zero output (threads past the last column leave here, before the barrier below)
+
+    if (n >= size_n) return;
+
+    if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0)
+    {
+        // One 8-byte store per row, i.e. the four halves this thread later accumulates into.
+        // NOTE(review): nothing visible here orders this store against atomicAdd from blocks with blockIdx.z != 0 -- confirm
+        for (int m = 0; m < m_count; m++)
+            *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0;
+    }
+
+    __syncthreads();                                // block_a is read by every remaining thread from here on
+
+    // Find initial group
+
+    int group = offset_k / groupsize;
+    int nextgroup = offset_k + groupsize;           // value of k at which the next zero/scale row is loaded
+
+    // a, b offset
+
+    int qk = offset_k / (32 / 4);                   // packed row index: eight 4-bit values per uint32
+
+    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+    const half* a_ptr = &block_a[0][0];
+    int a_stride = GPTQ_BLOCK_KN_SIZE;              // distance between consecutive rows of block_a
+
+    // Initial group
+
+    int zeros[4];
+    half2 scales[4];
+    half2 z1z16[4][2];
+    half2 y1y16[4][2];
+    b_gptq_qzeros_.item4(zeros, group, n);
+    b_gptq_scales_.item4_h2(scales, group, n);
+    dequant_4bit_8_prep_zero((zeros[0] + 1) & 0x0F, z1z16[0], y1y16[0]);    // stored zero + 1, wrapped to 4 bits
+    dequant_4bit_8_prep_zero((zeros[1] + 1) & 0x0F, z1z16[1], y1y16[1]);
+    dequant_4bit_8_prep_zero((zeros[2] + 1) & 0x0F, z1z16[2], y1y16[2]);
+    dequant_4bit_8_prep_zero((zeros[3] + 1) & 0x0F, z1z16[3], y1y16[3]);
+
+//    __syncthreads();
+
+    // Column result: per row, one half2 partial sum for each of the four columns
+
+    half2 block_c[m_count][4] = {};
+
+    // Dequantize and multiply: each outer iteration consumes 32 k-rows (4 steps of 8)
+
+    int k = offset_k;
+    while (k < end_k)
+    {
+        if (k == nextgroup)
+        {
+            group++;
+            nextgroup += groupsize;
+            b_gptq_qzeros_.item4(zeros, group, n);
+            b_gptq_scales_.item4_h2(scales, group, n);
+            dequant_4bit_8_prep_zero((zeros[0] + 1) & 0x0F, z1z16[0], y1y16[0]);
+            dequant_4bit_8_prep_zero((zeros[1] + 1) & 0x0F, z1z16[1], y1y16[1]);
+            dequant_4bit_8_prep_zero((zeros[2] + 1) & 0x0F, z1z16[2], y1y16[2]);
+            dequant_4bit_8_prep_zero((zeros[3] + 1) & 0x0F, z1z16[3], y1y16[3]);
+        }
+
+        #pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            const int4* b_ptr4 = (int4*) b_ptr;     // one 16-byte load fetches the packed words of all four columns
+            int4 load_int4 = *b_ptr4;
+
+            half2 dq[4][4];
+            dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false);
+            dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false);
+            dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false);
+            dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false);
+
+            #pragma unroll
+            for (int m = 0; m < m_count; m++)
+            {
+                if constexpr (use_r_weights) { if (!weights[m].as_uint16) continue; }   // skip rows whose weight bits are all zero
+                block_c[m][0] = __hfma2(dot22_8_h2(dq[0], a_ptr + m * a_stride), scales[0], block_c[m][0]);
+                block_c[m][1] = __hfma2(dot22_8_h2(dq[1], a_ptr + m * a_stride), scales[1], block_c[m][1]);
+                block_c[m][2] = __hfma2(dot22_8_h2(dq[2], a_ptr + m * a_stride), scales[2], block_c[m][2]);
+                block_c[m][3] = __hfma2(dot22_8_h2(dq[3], a_ptr + m * a_stride), scales[3], block_c[m][3]);
+            }
+
+            b_ptr += size_n;                        // next packed row
+            a_ptr += 8;                             // next eight staged activations
+        }
+
+        k += 32;
+    }
+
+    for (int m = 0; m < m_count; m++)
+    {
+        half2 *out = (half2*) c_.item_ptr(offset_m + m, n);
+        half result0 = __hadd(__low2half(block_c[m][0]), __high2half(block_c[m][0]));   // fold the two lanes of each partial sum
+        half result1 = __hadd(__low2half(block_c[m][1]), __high2half(block_c[m][1]));
+        half result2 = __hadd(__low2half(block_c[m][2]), __high2half(block_c[m][2]));
+        half result3 = __hadd(__low2half(block_c[m][3]), __high2half(block_c[m][3]));
+        half2 result01 = __halves2half2(result0, result1);
+        half2 result23 = __halves2half2(result2, result3);
+
+        if constexpr (mul_r_weights)
+        {
+            // NOTE(review): weights[] is only filled under use_r_weights; with use_r_weights == false this reads an unset value
+            half2 w_mul2 = __half2half2(weights[m].as_half);
+            result01 = __hmul2(result01, w_mul2);
+            result23 = __hmul2(result23, w_mul2);
+        }
+
+        atomicAdd(out    , result01);               // accumulated atomically: blocks along blockIdx.z target the same outputs
+        atomicAdd(out + 1, result23);
+    }
+}
+
+template <bool use_r_weights, bool mul_r_weights>
+struct map_m_count_gptq {   // maps a runtime m_count onto the kernel instantiation with these two flags
+    static constexpr fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(int m_count)
+    {   // each candidate is compiled in only when GPTQ_BLOCK_M_SIZE_MAX allows that m_count
+        #if GPTQ_BLOCK_M_SIZE_MAX >= 1
+        if (m_count == 1) return gemm_half_q_half_gptq_kernel<1, use_r_weights, mul_r_weights>;
+        #endif
+        #if GPTQ_BLOCK_M_SIZE_MAX >= 2
+        if (m_count == 2) return gemm_half_q_half_gptq_kernel<2, use_r_weights, mul_r_weights>;
+        #endif
+        #if GPTQ_BLOCK_M_SIZE_MAX >= 3
+        if (m_count == 3) return gemm_half_q_half_gptq_kernel<3, use_r_weights, mul_r_weights>;
+        #endif
+        #if GPTQ_BLOCK_M_SIZE_MAX >= 4
+        if (m_count == 4) return gemm_half_q_half_gptq_kernel<4, use_r_weights, mul_r_weights>;
+        #endif
+        #if GPTQ_BLOCK_M_SIZE_MAX >= 5
+        if (m_count == 5) return gemm_half_q_half_gptq_kernel<5, use_r_weights, mul_r_weights>;
+        #endif
+        #if GPTQ_BLOCK_M_SIZE_MAX >= 6
+        if (m_count == 6) return gemm_half_q_half_gptq_kernel<6, use_r_weights, mul_r_weights>;
+        #endif
+        #if GPTQ_BLOCK_M_SIZE_MAX >= 7
+        if (m_count == 7) return gemm_half_q_half_gptq_kernel<7, use_r_weights, mul_r_weights>;
+        #endif
+        #if GPTQ_BLOCK_M_SIZE_MAX >= 8
+        if (m_count == 8) return gemm_half_q_half_gptq_kernel<8, use_r_weights, mul_r_weights>;
+        #endif
+        return NULL;    // m_count is outside 1..min(8, GPTQ_BLOCK_M_SIZE_MAX)
+    }
+};
+
+fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(const int m_count, bool r_weights, bool mul_r_weights)
+{
+    // Choose the instantiation matching both flags; the map itself returns NULL for an unsupported m_count.
+    if (r_weights && mul_r_weights) return map_m_count_gptq<true, true>::pick_gemm_half_q_half_gptq_kernel(m_count);
+    if (r_weights) return map_m_count_gptq<true, false>::pick_gemm_half_q_half_gptq_kernel(m_count);
+    if (mul_r_weights) return map_m_count_gptq<false, true>::pick_gemm_half_q_half_gptq_kernel(m_count);
+    return map_m_count_gptq<false, false>::pick_gemm_half_q_half_gptq_kernel(m_count);
+}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
new file mode 100644
index 00000000..f7a91e29
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu
@@ -0,0 +1,650 @@
+#include "q_matrix.cuh"
+#include "matrix_view.cuh"
+#include "util.cuh"
+
+#include "quant/qdq_2.cuh"
+#include "quant/qdq_3.cuh"
+#include "quant/qdq_4.cuh"
+#include "quant/qdq_5.cuh"
+#include "quant/qdq_6.cuh"
+#include "quant/qdq_8.cuh"
+
+#define BLOCK_KN_SIZE 128
+
+#define THREADS_X 32
+#define THREADS_Y 32
+
+// Shuffle quantized data on load: rewrites b_q_weight in place, one thread per column.
+// Rows are visited top to bottom; k carries over between the loops, so every rows_* is an absolute row bound.
+__global__ void shuffle_kernel
+(
+    uint32_t* __restrict__ b_q_weight,
+    const int size_k,
+    const int size_n,
+    const int rows_8,
+    const int rows_6,
+    const int rows_5,
+    const int rows_4,
+    const int rows_3,
+    const int rows_2
+)
+{
+    const int column = blockIdx.x * THREADS_X + threadIdx.x;
+    if (column >= size_n) return;
+    uint32_t* cursor = b_q_weight + column;
+    int k = 0;
+    for (; k < rows_8; k +=  4, cursor += 1 * size_n) shuffle_8bit_4 (cursor, size_n);
+    for (; k < rows_6; k += 16, cursor += 3 * size_n) shuffle_6bit_16(cursor, size_n);
+    for (; k < rows_5; k += 32, cursor += 5 * size_n) shuffle_5bit_32(cursor, size_n);
+    for (; k < rows_4; k +=  8, cursor += 1 * size_n) shuffle_4bit_8 (cursor, size_n);
+    for (; k < rows_3; k += 32, cursor += 3 * size_n) shuffle_3bit_32(cursor, size_n);
+    for (; k < rows_2; k += 16, cursor += 1 * size_n) shuffle_2bit_16(cursor, size_n);
+}
+
+
+// QMatrix constructor: stores the caller-supplied device pointers, derives the per-bit-width row bounds
+// (or the GPTQ group size), optionally reorders GPTQ rows, then shuffles the packed weights in place.
+QMatrix::QMatrix
+(
+    const int _device,
+    const int _height,
+    const int _width,
+    const int _groups,
+
+    uint32_t* _q_weight,
+    uint16_t* _q_perm,
+    uint16_t* _q_invperm,
+    uint32_t* _q_scale,
+    half* _q_scale_max,
+    uint16_t* _q_groups,
+    uint16_t* _q_group_map,
+
+    uint32_t* _gptq_qzeros,     // non-NULL selects the GPTQ path
+    half* _gptq_scales,
+    uint32_t* _gptq_g_idx,      // optional; when given, rows are reordered via make_sequential()
+
+    half* _temp_dq
+) :
+    device(_device),
+    height(_height),
+    width(_width),
+    groups(_groups),
+    temp_dq(_temp_dq)
+{
+    cudaSetDevice(device);
+
+    failed = false;             // set to true on any error below; callers are expected to check it
+
+    cuda_q_weight = _q_weight;
+    cuda_q_perm = _q_perm;
+    cuda_q_invperm = _q_invperm;
+    cuda_q_scale = _q_scale;
+    cuda_q_scale_max = _q_scale_max;
+    cuda_q_groups = _q_groups;
+    cuda_q_group_map = _q_group_map;
+    cuda_gptq_qzeros = _gptq_qzeros;
+    cuda_gptq_scales = _gptq_scales;
+
+    is_gptq = (_gptq_qzeros != NULL);
+
+    if (is_gptq)
+    {
+        // Smallest power of two for which gptq_groupsize * groups covers all rows
+        gptq_groupsize = 1;
+        while (gptq_groupsize * groups < height) gptq_groupsize *= 2;
+    }
+
+    // Create group map
+
+    rows_8 = 0;
+    rows_6 = 0;
+    rows_5 = 0;
+    rows_4 = 0;
+    rows_3 = 0;
+    rows_2 = 0;
+
+    if (!is_gptq)
+    {
+        // Host copy of the group table: two uint16 per group, read below as (bits, packed-row offset)
+        uint16_t* cpu_q_groups = (uint16_t*)calloc(groups * 2, sizeof(uint16_t));
+        if (!cpu_q_groups) { failed = true; return; }
+        cudaError_t copy_err = cudaMemcpy(cpu_q_groups, cuda_q_groups, groups * 2 * sizeof(uint16_t), cudaMemcpyDeviceToHost);
+        if (copy_err != cudaSuccess) { free(cpu_q_groups); failed = true; return; }   // never parse a table that was not copied
+        for (int i = 0, row = 0; i < groups; i++)
+        {
+            int bits = cpu_q_groups[i * 2];
+
+            int rows;
+            if (i < groups - 1)
+            {
+                // Packed rows up to the next group's offset, converted to unpacked rows
+                int qrows = cpu_q_groups[i * 2 + 3] - cpu_q_groups[i * 2 + 1];
+                rows = qrows * 32 / bits;
+            }
+            else rows = height - row;   // the last group takes whatever rows remain
+
+            if (bits == 8) rows_8 += rows;
+            if (bits == 6) rows_6 += rows;
+            if (bits == 5) rows_5 += rows;
+            if (bits == 4) rows_4 += rows;
+            if (bits == 3) rows_3 += rows;
+            if (bits == 2) rows_2 += rows;
+            row += rows;
+        }
+
+        free(cpu_q_groups);
+
+        // Turn the per-width counts into cumulative end rows (8-bit first, 2-bit last)
+        rows_6 += rows_8;
+        rows_5 += rows_6;
+        rows_4 += rows_5;
+        rows_3 += rows_4;
+        rows_2 += rows_3;
+    }
+    else
+    {
+        // GPTQ: every row is 4-bit, so the 4/3/2-bit bounds all sit at height (8/6/5 stay 0)
+        rows_4 = height;
+        rows_3 = height;
+        rows_2 = height;
+
+        if (_gptq_g_idx)
+        {
+            if (!make_sequential(_gptq_g_idx))
+            {
+                failed = true;
+                //printf("FAIL\n");
+                return;
+            }
+        }
+    }
+
+//     DBGI(rows_8);
+//     DBGI(rows_6);
+//     DBGI(rows_5);
+//     DBGI(rows_4);
+//     DBGI(rows_3);
+//     DBGI(rows_2);
+
+    // Shuffle quantized data
+    // One thread per column, THREADS_X threads per block, on the current PyTorch CUDA stream
+    dim3 blockDim, gridDim;
+    blockDim.x = THREADS_X;
+    blockDim.y = 1;
+    gridDim.x = DIVIDE(width, THREADS_X);
+    gridDim.y = 1;
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+    shuffle_kernel<<<gridDim, blockDim, 0, stream>>>(cuda_q_weight, height, width, rows_8, rows_6, rows_5, rows_4, rows_3, rows_2);
+}
+
+QMatrix::~QMatrix()
+{   // Empty: no device or host buffer is released here; the object only holds pointers it was given.
+}
+
+// Reconstruct b[k,n] (GPTQ): dequantizes packed 4-bit weights into the half matrix b (size_k x size_n).
+// Each thread writes four adjacent columns of one BLOCK_KN_SIZE-row slab; rows go through b_q_perm when given.
+__global__ void reconstruct_gptq_kernel
+(
+    const uint32_t* __restrict__ b_q_weight,
+    const uint16_t* __restrict__ b_q_perm,          // optional row index table; may be NULL
+    const uint32_t* __restrict__ b_gptq_qzeros,
+    const half* __restrict__ b_gptq_scales,
+    //const uint16_t* __restrict__ b_q_groups,
+    const int size_k,
+    const int size_n,
+    const int groupsize,
+    const int groups,
+    half* __restrict__ b,
+    const int rows_4                                // not referenced in this kernel body
+)
+{
+    MatrixView_half_rw b_(b, size_k, size_n);
+    MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
+    MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
+
+    int offset_k = BLOCK_KN_SIZE * blockIdx.y;
+    int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
+
+    int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+
+    // Preload remapping table: thread t stages the entry for row offset_k + t
+
+    __shared__ uint16_t perm[BLOCK_KN_SIZE];
+    int t = threadIdx.x;
+
+    if (b_q_perm)
+    {
+        if (offset_k + t < size_k)
+            perm[t] = b_q_perm[offset_k + t];
+    }
+
+    // Column (threads past the last column leave here, before the barrier below)
+
+    int n = offset_n + t * 4;
+    if (n >= size_n) return;
+
+    // Find initial group
+
+    int group = offset_k / groupsize;
+    int nextgroup = offset_k + groupsize;           // value of k at which the next zero/scale row is loaded
+
+    // b offset
+
+    int qk = offset_k / (32 / 4);                   // packed row index: eight 4-bit values per uint32
+
+    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+
+    // Initial zeros/scale
+
+    int zeros[4];
+    half2 scales[4];
+    half2 z1z16[4][2];
+    half2 y1y16[4][2];
+    b_gptq_qzeros_.item4(zeros, group, n);
+    b_gptq_scales_.item4_h2(scales, group, n);
+    dequant_4bit_8_prep_zero((zeros[0] + 1) & 0x0F, z1z16[0], y1y16[0]);    // stored zero + 1, wrapped to 4 bits
+    dequant_4bit_8_prep_zero((zeros[1] + 1) & 0x0F, z1z16[1], y1y16[1]);
+    dequant_4bit_8_prep_zero((zeros[2] + 1) & 0x0F, z1z16[2], y1y16[2]);
+    dequant_4bit_8_prep_zero((zeros[3] + 1) & 0x0F, z1z16[3], y1y16[3]);
+
+    __syncthreads();                                // perm[] is read by every remaining thread from here on
+
+    int k = offset_k;
+    int lk = 0;                                     // row counter local to this slab
+
+    while (k < end_k)
+    {
+        if (k == nextgroup)
+        {
+            group++;
+            nextgroup += groupsize;
+            b_gptq_qzeros_.item4(zeros, group, n);
+            b_gptq_scales_.item4_h2(scales, group, n);
+            dequant_4bit_8_prep_zero((zeros[0] + 1) & 0x0F, z1z16[0], y1y16[0]);
+            dequant_4bit_8_prep_zero((zeros[1] + 1) & 0x0F, z1z16[1], y1y16[1]);
+            dequant_4bit_8_prep_zero((zeros[2] + 1) & 0x0F, z1z16[2], y1y16[2]);
+            dequant_4bit_8_prep_zero((zeros[3] + 1) & 0x0F, z1z16[3], y1y16[3]);
+        }
+
+        for (int p = 0; p < 4; p++)                 // four packed rows = 32 k-rows per outer iteration
+        {
+            half2 dq[4][4];
+            const int4* b_ptr4 = (int4*) b_ptr;     // one 16-byte load fetches the packed words of all four columns
+            int4 load_int4 = *b_ptr4;
+
+            dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false);
+            dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false);
+            dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false);
+            dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false);
+
+            b_ptr += size_n;
+            //half* dqh = (half*)dq;
+            if (b_q_perm)
+            {
+                for (int j = 0; j < 4; j++)
+                {
+                    // dq[v][j] carries two consecutive rows of column n + v: low lane first, high lane second
+                    for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]);
+                    b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j]));
+                    b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j]));
+                }
+            }
+            else
+            {
+                for (int j = 0; j < 4; j++)
+                {
+                    // Same as the permuted branch, but rows are written at offset_k + lk directly
+                    for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]);
+                    b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j]));
+                    b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j]));
+                }
+            }
+        }
+        k += 32;
+    }
+}
+
+
+// Reconstruct b[k,n]: dequantizes mixed-bit-width packed weights into the half matrix b (size_k x size_n).
+// Each thread writes one column of one BLOCK_KN_SIZE-row slab, walking the 8/6/5/4/3/2-bit sections in order.
+__global__ void reconstruct_kernel
+(
+    const uint32_t* __restrict__ b_q_weight,
+    const uint16_t* __restrict__ b_q_perm,          // row index table; read unconditionally (no NULL check)
+    const uint32_t* __restrict__ b_q_scale,
+    const half* __restrict__ b_q_scale_max,
+    const uint16_t* __restrict__ b_q_group_map,     // two entries per row, see uses below
+    const int size_k,
+    const int size_n,
+    //const int groupsize,
+    const int groups,
+    half* __restrict__ b,
+    const int rows_8,                               // rows_* are absolute end rows of each bit-width section
+    const int rows_6,
+    const int rows_5,
+    const int rows_4,
+    const int rows_3,
+    const int rows_2
+)
+{
+    MatrixView_half_rw b_(b, size_k, size_n);
+    MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n);
+
+    int offset_k = BLOCK_KN_SIZE * blockIdx.y;
+    int offset_n = BLOCK_KN_SIZE * blockIdx.x;
+
+    // Preload remapping table: thread t stages the entry for row offset_k + t
+
+    int t = threadIdx.x;
+    __shared__ uint16_t perm[BLOCK_KN_SIZE];
+    if (offset_k + t < size_k)
+        perm[t] = b_q_perm[offset_k + t];
+
+    // Column (threads past the last column leave here, before the barrier below)
+
+    int n = offset_n + t;
+    if (n >= size_n) return;
+
+    // Find initial group
+
+    // int group = offset_k / groupsize;
+    int group = b_q_group_map[offset_k * 2];        // entry 2k is read as the group of row k
+
+    // Rows of each bit width that lie before offset_k, used to locate the first packed row of this slab
+    int pre_rows_8 = min(rows_8, offset_k);
+    int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0;
+    int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0;
+    int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0;
+    int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0;
+    int pre_rows_2 = offset_k > rows_3 ? min(rows_2, offset_k) - rows_3 : 0;
+    int qk = 0;                                     // 32 rows of b bits each occupy b packed uint32 rows
+    qk += pre_rows_8 / 32 * 8;
+    qk += pre_rows_6 / 32 * 6;
+    qk += pre_rows_5 / 32 * 5;
+    qk += pre_rows_4 / 32 * 4;
+    qk += pre_rows_3 / 32 * 3;
+    qk += pre_rows_2 / 32 * 2;
+
+    const uint32_t* b_ptr = b_q_weight + qk * size_n + n;
+
+    half qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]);
+    half2 qs_h2 = __halves2half2(qs_h, qs_h);
+    int nextgroup = offset_k + b_q_group_map[offset_k * 2 + 1];    // entry 2k+1 is read as the distance from row k to the next group
+
+    int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
+    int k = offset_k;
+    int lk = 0;                                     // index into perm[], local to this slab
+
+    __syncthreads();                                // perm[] is read by every remaining thread from here on
+
+    while (k < rows_8 && k < end_k)                 // 8-bit: 4 x (2 words -> 8 values) per 32 rows
+    {
+        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
+        for (int p = 0; p < 4; p++)
+        {
+            half2 dq[4];
+            uint32_t q_0 = *b_ptr; b_ptr += size_n;
+            uint32_t q_1 = *b_ptr; b_ptr += size_n;
+            dequant_8bit_8(q_0, q_1, dq, size_n);
+            for (int j = 0; j < 4; j++) dq[j] = __hmul2(dq[j], qs_h2);
+            half* dqh = (half*) dq;
+            for (int j = 0; j < 8; j++) b_.set(perm[lk++], n, dqh[j]);
+        }
+        k += 32;
+    }
+
+    while (k < rows_6 && k < end_k)                 // 6-bit: 2 x (3 words -> 16 values) per 32 rows
+    {
+        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
+        for (int p = 0; p < 2; p++)
+        {
+            half2 dq[8];
+            uint32_t q_0 = *b_ptr; b_ptr += size_n;
+            uint32_t q_1 = *b_ptr; b_ptr += size_n;
+            uint32_t q_2 = *b_ptr; b_ptr += size_n;
+            dequant_6bit_16(q_0, q_1, q_2, dq, size_n);
+            for (int j = 0; j < 8; j++) dq[j] = __hmul2(dq[j], qs_h2);
+            half* dqh = (half*) dq;
+            for (int j = 0; j < 16; j++) b_.set(perm[lk++], n, dqh[j]);
+        }
+        k += 32;
+    }
+
+    while (k < rows_5 && k < end_k)                 // 5-bit: 5 words -> 32 values per 32 rows
+    {
+        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
+        for (int p = 0; p < 1; p++)
+        {
+            half2 dq[16];
+            uint32_t q_0 = *b_ptr; b_ptr += size_n;
+            uint32_t q_1 = *b_ptr; b_ptr += size_n;
+            uint32_t q_2 = *b_ptr; b_ptr += size_n;
+            uint32_t q_3 = *b_ptr; b_ptr += size_n;
+            uint32_t q_4 = *b_ptr; b_ptr += size_n;
+            dequant_5bit_32(q_0, q_1, q_2, q_3, q_4, dq, size_n);
+            for (int j = 0; j < 16; j++) dq[j] = __hmul2(dq[j], qs_h2);
+            half* dqh = (half*) dq;
+            for (int j = 0; j < 32; j++) b_.set(perm[lk++], n, dqh[j]);
+        }
+        k += 32;
+    }
+
+    while (k < rows_4 && k < end_k)                 // 4-bit: 4 x (1 word -> 8 values) per 32 rows
+    {
+        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
+        for (int p = 0; p < 4; p++)
+        {
+            half2 dq[4];
+            uint32_t q_0 = *b_ptr; b_ptr += size_n;
+            dequant_4bit_8(q_0, dq, size_n);
+            for (int j = 0; j < 4; j++) dq[j] = __hmul2(dq[j], qs_h2);
+            half* dqh = (half*) dq;
+            for (int j = 0; j < 8; j++) b_.set(perm[lk++], n, dqh[j]);
+        }
+        k += 32;
+    }
+
+    while (k < rows_3 && k < end_k)                 // 3-bit: 3 words -> 32 values per 32 rows
+    {
+        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
+        for (int p = 0; p < 1; p++)
+        {
+            half2 dq[16];
+            uint32_t q_0 = *b_ptr; b_ptr += size_n;
+            uint32_t q_1 = *b_ptr; b_ptr += size_n;
+            uint32_t q_2 = *b_ptr; b_ptr += size_n;
+            dequant_3bit_32(q_0, q_1, q_2, dq, size_n);
+            for (int j = 0; j < 16; j++) dq[j] = __hmul2(dq[j], qs_h2);
+            half* dqh = (half*) dq;
+            for (int j = 0; j < 32; j++) b_.set(perm[lk++], n, dqh[j]);
+        }
+        k += 32;
+    }
+
+    while (k < rows_2 && k < end_k)                 // 2-bit: 1 word -> 16 values; this loop advances 16 rows at a time
+    {
+        if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += b_q_group_map[k * 2 + 1]; qs_h2 = __halves2half2(qs_h, qs_h); }
+        for (int p = 0; p < 1; p++)
+        {
+            half2 dq[8];
+            uint32_t q_0 = *b_ptr; b_ptr += size_n;
+            dequant_2bit_16(q_0, dq, size_n);
+            for (int j = 0; j < 8; j++) dq[j] = __hmul2(dq[j], qs_h2);
+            half* dqh = (half*) dq;
+            for (int j = 0; j < 16; j++) b_.set(perm[lk++], n, dqh[j]);
+        }
+        k += 16;
+    }
+}
+
+void QMatrix::reconstruct(half* out)
+{
+    // Dequantize the whole matrix into `out` on the current PyTorch CUDA stream. Every block has
+    // BLOCK_KN_SIZE threads and covers one BLOCK_KN_SIZE-row slab; the column tiling depends on the format.
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    const dim3 threads(BLOCK_KN_SIZE, 1);
+    const int slabs = DIVIDE(height, BLOCK_KN_SIZE);
+
+    if (is_gptq)
+    {
+        // GPTQ kernel: each thread writes four columns, so a block spans BLOCK_KN_SIZE * 4 columns.
+        const dim3 blocks(DIVIDE(width, BLOCK_KN_SIZE * 4), slabs);
+        reconstruct_gptq_kernel<<<blocks, threads, 0, stream>>>
+        (
+            cuda_q_weight,
+            cuda_q_perm,
+            cuda_gptq_qzeros,
+            cuda_gptq_scales,
+            height,
+            width,
+            gptq_groupsize,
+            groups,
+            out,
+            rows_4
+        );
+    }
+    else
+    {
+        // Non-GPTQ kernel: one column per thread, so a block spans BLOCK_KN_SIZE columns.
+        const dim3 blocks(DIVIDE(width, BLOCK_KN_SIZE), slabs);
+        reconstruct_kernel<<<blocks, threads, 0, stream>>>
+        (
+            cuda_q_weight,
+            cuda_q_perm,
+            cuda_q_scale,
+            cuda_q_scale_max,
+            cuda_q_group_map,
+            height,
+            width,
+            groups,
+            out,
+            rows_8,
+            rows_6,
+            rows_5,
+            rows_4,
+            rows_3,
+            rows_2
+        );
+    }
+}
+
+__global__ void make_sequential_kernel
+(
+    const uint32_t* __restrict__ w,
+    uint32_t* __restrict__ w_new,
+    const uint16_t* __restrict__ q_perm,
+    const int w_height,
+    const int w_width
+)
+{
+    // Each thread builds one 64-bit word (two adjacent uint32 columns) of output row blockIdx.y by
+    // gathering one 4-bit field per column from each of the eight source rows listed in q_perm.
+    const int pair_stride = w_width >> 1;
+    const int pair_col = THREADS_X * blockIdx.x + threadIdx.x;
+    if (pair_col >= pair_stride) return;
+
+    const uint64_t* src_pairs = (const uint64_t*) w;
+    uint64_t* dst_pairs = (uint64_t*) w_new;
+    const int out_row = blockIdx.y;
+    const uint16_t* row_sources = q_perm + (out_row << 3);
+
+    // Collects the eight gathered nibbles for both columns.
+    uint64_t packed = 0;
+
+    #pragma unroll
+    for (int slot = 0; slot < 8; ++slot)
+    {
+        const int src_row = row_sources[slot];
+
+        // Packed row holding src_row, and the bit offset of its nibble (eight rows per uint32).
+        const int src_word_row = src_row >> 3;
+        const int src_shift = (src_row & 0x07) << 2;
+
+        // The mask keeps the lowest nibble of each 32-bit half, i.e. of both columns at once.
+        uint64_t fields = src_pairs[src_word_row * pair_stride + pair_col];
+        fields = (fields >> src_shift) & 0x0000000f0000000f;
+        packed |= fields << (slot << 2);
+    }
+
+    // A single 64-bit store writes both output columns.
+    dst_pairs[out_row * pair_stride + pair_col] = packed;
+}
+
+bool QMatrix::make_sequential(const uint32_t* cpu_g_idx)
+{
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    uint32_t* cuda_new_qweight = NULL;
+    cudaError_t err = cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
+    if (err != cudaSuccess) {
+        (void) cudaGetLastError(); // Clear error
+        return false;
+    }
+    // From here on: sort rows by group (stable), upload the permutation, and reorder the packed weights to match.
+    uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
+    uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
+    uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
+
+    // Group histogram
+    // cpu_g_idx_map[g] = number of rows assigned to group g (cpu_g_idx is read on the host)
+    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;
+
+    // Group map
+    // Exclusive prefix sum: cpu_g_idx_map[g] becomes the first target row of group g
+    for (int i = 0, acc = 0; i < groups; i++)
+    {
+        uint32_t tmp = cpu_g_idx_map[i];    // full-width count; a 16-bit temporary would truncate large groups
+        cpu_g_idx_map[i] = acc;
+        acc += tmp;
+    }
+
+    // X map (inverse)
+    // cpu_x_map_inv[row] = target position of source row; rows keep their relative order inside a group
+    for (int row = 0; row < height; row++)
+    {
+        uint32_t target_group = cpu_g_idx[row];
+        uint32_t target_row = cpu_g_idx_map[target_group];
+        cpu_g_idx_map[target_group]++;
+        cpu_x_map_inv[row] = target_row;
+    }
+
+    // X map
+    // cpu_x_map[target] = source row, the inverse of the table built above
+    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;
+
+    // Reduce to uint16_t
+    // Narrowed in place, front to back: the 16-bit write at index row never passes the 32-bit reads still pending
+    uint16_t* cpu_x_map16 = (uint16_t*)cpu_x_map;
+    uint16_t* cpu_x_map_inv16 = (uint16_t*)cpu_x_map_inv;
+    for (int row = 0; row < height; row++) cpu_x_map16[row] = (uint16_t) cpu_x_map[row];
+    for (int row = 0; row < height; row++) cpu_x_map_inv16[row] = (uint16_t) cpu_x_map_inv[row];
+
+    // Move to CUDA
+    // Issued on the same stream as the kernel below so the kernel is ordered after the upload of cuda_q_perm
+    cudaMemcpyAsync(cuda_q_perm, cpu_x_map16, height * sizeof(uint16_t), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(cuda_q_invperm, cpu_x_map_inv16, height * sizeof(uint16_t), cudaMemcpyHostToDevice, stream);
+
+    // Rearrange rows in w
+    // One grid row per packed output row (height / 8); threads along x index column pairs
+    dim3 blockDim, gridDim;
+    blockDim.x = THREADS_X;
+    blockDim.y = 1;
+    gridDim.x = DIVIDE(width, THREADS_X);
+    gridDim.y = height / 8;
+
+    make_sequential_kernel<<<gridDim, blockDim, 0, stream>>>
+    (
+        cuda_q_weight,
+        cuda_new_qweight,
+        cuda_q_perm,
+        height / 8,
+        width
+    );
+
+    // Replace qweights
+    // Same stream again, so the copy back starts only after the kernel has filled cuda_new_qweight
+    cudaMemcpyAsync(cuda_q_weight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice, stream);
+
+    // Cleanup
+    // Wait for all queued work before releasing the buffers it reads from
+    cudaDeviceSynchronize();
+
+    cudaFree(cuda_new_qweight);
+    free(cpu_g_idx_map);
+    free(cpu_x_map);
+    free(cpu_x_map_inv);
+
+    return true;
+}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh
new file mode 100644
index 00000000..d36b8d66
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh
@@ -0,0 +1,75 @@
+#ifndef _q_matrix_cuh
+#define _q_matrix_cuh
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+#include <cstdio>
+
+#define MAX_SUPERGROUPS 16
+
+class QMatrix    // host-side handle for one quantized weight matrix whose buffers live on a CUDA device
+{
+public:
+
+    int device;             // passed to cudaSetDevice() in the constructor
+    bool is_gptq;           // true when a GPTQ qzeros pointer was supplied
+
+    int height;             // rows; handed to the kernels as size_k
+    int width;              // columns; handed to the kernels as size_n
+    int groups;
+    int gptq_groupsize;     // assigned only when is_gptq
+
+    int rows_8;             // rows_*: cumulative end row of each bit-width section, 8-bit first
+    int rows_6;
+    int rows_5;
+    int rows_4;
+    int rows_3;
+    int rows_2;
+
+    uint32_t* cuda_q_weight = NULL;     // the pointers below are stored as given; this class does not free them
+    uint16_t* cuda_q_perm = NULL;
+    uint16_t* cuda_q_invperm = NULL;
+    uint32_t* cuda_q_scale = NULL;
+    half* cuda_q_scale_max = NULL;
+    uint16_t* cuda_q_groups = NULL;
+    uint16_t* cuda_q_group_map = NULL;
+    uint32_t* cuda_gptq_qzeros = NULL;
+    half* cuda_gptq_scales = NULL;
+
+    half* temp_dq;
+
+    bool failed;            // set by the constructor when setup could not complete
+
+    QMatrix
+    (
+        const int _device,
+        const int _height,
+        const int _width,
+        const int _groups,
+
+        uint32_t* _q_weight,
+        uint16_t* _q_perm,
+        uint16_t* _q_invperm,
+        uint32_t* _q_scale,
+        half* _q_scale_max,
+        uint16_t* _q_groups,
+        uint16_t* _q_group_map,
+
+        uint32_t* _gptq_qzeros,
+        half* _gptq_scales,
+        uint32_t* _gptq_g_idx,
+
+        half* _temp_dq
+    );
+
+    ~QMatrix();
+
+    void reconstruct(half* out);                        // dequantize the whole matrix into out
+    bool make_sequential(const uint32_t* cpu_g_idx);    // reorder GPTQ rows by group; cpu_g_idx is read on the host
+
+private:
+
+};
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh
new file mode 100644
index 00000000..90c18a0c
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh
@@ -0,0 +1,103 @@
+#ifndef _qdq_2_cuh
+#define _qdq_2_cuh
+
+#include "qdq_util.cuh"
+#include "../../config.h"
+
+#if QMODE_2BIT == 1
+
+// Permutation (each character is one bit, labelled with the hex index of its value; MSB on the left):
+//
+// ffddbb99 77553311  eeccaa88 66442200
+// Even-indexed values end up in the low 16 bits, odd-indexed values in the high 16 bits.
+__forceinline__ __device__ void shuffle_2bit_16
+(
+    uint32_t* q,
+    int stride
+)
+{
+    uint32_t remaining = q[0];  // `stride` is unused: only the first word is read and rewritten
+    uint32_t shuffled = 0;
+
+    #pragma unroll
+    for (int pair = 0; pair < 8; pair++)
+    {
+        const uint32_t even_val = remaining & 0x03;
+        const uint32_t odd_val = (remaining >> 2) & 0x03;
+        remaining >>= 4;
+        shuffled |= even_val << (2 * pair);
+        shuffled |= odd_val << (2 * pair + 16);
+    }
+    q[0] = shuffled;
+}
+
+__forceinline__ __device__ void dequant_2bit_16
+(
+    const uint32_t q_0,   // packed word in the layout written by shuffle_2bit_16
+    half2 (&dq)[8],       // output: 16 decoded values as 8 half2 pairs
+    int stride            // not used by this function
+)
+{
+    const uint32_t c0 = 0x64006400;  // two fp16 1024.0 bit patterns; bits OR-ed into the mantissa add to 1024
+    const half y4_  = __float2half_rn(1.0f /  4.0f);
+    const half y16_ = __float2half_rn(1.0f / 16.0f);
+    const half y64_ = __float2half_rn(1.0f / 64.0f);
+    const half2 y4  = __halves2half2(y4_,  y4_);
+    const half2 y16 = __halves2half2(y16_, y16_);
+    const half2 y64 = __halves2half2(y64_, y64_);
+    const half z1_  = __float2half_rn(-1024.0f         - 2.0f);
+    const half z4_  = __float2half_rn(-1024.0f /  4.0f - 2.0f);
+    const half z16_ = __float2half_rn(-1024.0f / 16.0f - 2.0f);
+    const half z64_ = __float2half_rn(-1024.0f / 64.0f - 2.0f);
+    const half2 z1  = __halves2half2(z1_,  z1_);
+    const half2 z4  = __halves2half2(z4_,  z4_);
+    const half2 z16 = __halves2half2(z16_, z16_);
+    const half2 z64 = __halves2half2(z64_, z64_);
+    // Each z constant cancels the (scaled) 1024 bias and subtracts 2, so every output equals q - 2.
+    uint32_t qa = q_0;
+    half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1])      + 1024
+    half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) *  4 + 1024
+    half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024
+    half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024
+    qa >>= 8;
+    half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 9])      + 1024
+    half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) *  4 + 1024
+    half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024
+    half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024
+
+    dq[0] = __hadd2(q0.as_half2, z1);
+    dq[1] = __hfma2(q1.as_half2, y4,  z4);
+    dq[2] = __hfma2(q2.as_half2, y16, z16);
+    dq[3] = __hfma2(q3.as_half2, y64, z64);
+    dq[4] = __hadd2(q4.as_half2, z1);
+    dq[5] = __hfma2(q5.as_half2, y4,  z4);
+    dq[6] = __hfma2(q6.as_half2, y16, z16);
+    dq[7] = __hfma2(q7.as_half2, y64, z64);
+}
+
+#else
+
+__forceinline__ __device__ void shuffle_2bit_16
+(
+    uint32_t* q,
+    int stride
+)
+{   // Intentionally a no-op when QMODE_2BIT != 1: the fallback decoder in this branch reads fields in natural order.
+}
+
+__forceinline__ __device__ void dequant_2bit_16
+(
+    const uint32_t q_0,
+    half2 (&dq)[8],
+    int stride
+)
+{
+    // Reference path: decode consecutive 2-bit fields (lowest bits first), subtract 2, and pair them up.
+    for (int pair = 0; pair < 8; pair++)
+        dq[pair] = __halves2half2(dq_ns(exb(q_0, pair * 4,     0x03), 2),
+                                  dq_ns(exb(q_0, pair * 4 + 2, 0x03), 2));
+}
+
+#endif
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh
new file mode 100644
index 00000000..10117376
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh
@@ -0,0 +1,169 @@
+#ifndef _qdq_3_cuh
+#define _qdq_3_cuh
+
+#include "qdq_util.cuh"
+#include "../../config.h"
+
+#if QMODE_3BIT == 1
+
+// Permutation:
+// (each character is one bit, labelled with the index 0-9, a-v of the value it belongs to; MSB on the left)
+// v9997775 55333111  u8886664 44222000  (u, v lsb)
+// vjjjhhhf ffdddbbb  uiiiggge eecccaaa
+// vtttrrrp ppnnnlll  usssqqqo oommmkkk
+// shuffle_3bit_32: repacks the 32 3-bit values held in q[0], q[stride], q[2 * stride] into that layout, in place.
+__forceinline__ __device__ void shuffle_3bit_32
+(
+    uint32_t* q,   // first of three words, spaced `stride` elements apart
+    int stride
+)
+{
+    uint32_t qa = q[0 * stride];
+    uint32_t qb = q[1 * stride];
+    uint32_t qc = q[2 * stride];
+
+    // qa: aa999888 77766655  54443332 22111000
+    // qb: lkkkjjji iihhhggg  fffeeedd dcccbbba
+    // qc: vvvuuutt tsssrrrq  qqpppooo nnnmmmll
+
+    uint32_t qd = qc >> 26;
+    qc <<= 4;
+    qc |= qb >> 28;
+    qb <<= 2;
+    qb |= qa >> 30;
+
+    // qa: ..999888 77766655  54443332 22111000
+    // qb: ..jjjiii hhhgggff  feeedddc ccbbbaaa
+    // qc: ..tttsss rrrqqqpp  pooonnnm mmlllkkk
+    // qd:                               vvvuuu
+
+    uint32_t za = 0;
+    uint32_t zb = 0;
+    uint32_t zc = 0;
+    // Per word: the five even-position values go to bits 0..14, the five odd-position values to bits 16..30.
+    for (int i = 0; i < 5; i++) { uint32_t t0 = qa & 0x07; uint32_t t1 = (qa & 0x38) >> 3; qa >>= 6; za |= (t0 << (i * 3)); za |= (t1 << (i * 3 + 16)); }
+    for (int i = 0; i < 5; i++) { uint32_t t0 = qb & 0x07; uint32_t t1 = (qb & 0x38) >> 3; qb >>= 6; zb |= (t0 << (i * 3)); zb |= (t1 << (i * 3 + 16)); }
+    for (int i = 0; i < 5; i++) { uint32_t t0 = qc & 0x07; uint32_t t1 = (qc & 0x38) >> 3; qc >>= 6; zc |= (t0 << (i * 3)); zc |= (t1 << (i * 3 + 16)); }
+
+    // za:  9997775 55333111   8886664 44222000
+    // zb:  jjjhhhf ffdddbbb   iiiggge eecccaaa
+    // zc:  tttrrrp ppnnnlll   sssqqqo oommmkkk
+    // qd:                               vvvuuu
+    // The two leftover values u and v (six bits in qd) are scattered one bit per word into bits 15 and 31.
+    za |= ((qd & 0x01) >> 0) << 15;
+    zb |= ((qd & 0x02) >> 1) << 15;
+    zc |= ((qd & 0x04) >> 2) << 15;
+    za |= ((qd & 0x08) >> 3) << 31;
+    zb |= ((qd & 0x10) >> 4) << 31;
+    zc |= ((qd & 0x20) >> 5) << 31;
+
+    // za: v9997775 55333111  u8886664 44222000  (u, v lsb)
+    // zb: vjjjhhhf ffdddbbb  uiiiggge eecccaaa
+    // zc: vtttrrrp ppnnnlll  usssqqqo oommmkkk
+
+    q[0 * stride] = za;
+    q[1 * stride] = zb;
+    q[2 * stride] = zc;
+}
+
+__forceinline__ __device__ void dequant_3bit_32
+(
+    const uint32_t q_0,   // three packed words in the layout written by shuffle_3bit_32
+    const uint32_t q_1,
+    const uint32_t q_2,
+    half2 (&dq)[16],      // output: 32 decoded values as 16 half2 pairs
+    int stride            // not used by this function
+)
+{
+    const uint32_t c0 = 0x64006400;  // two fp16 1024.0 bit patterns; bits OR-ed into the mantissa add to 1024
+    const half y8_  = __float2half_rn(1.0f /  8.0f);
+    const half y64_ = __float2half_rn(1.0f / 64.0f);
+    const half2 y8  = __halves2half2(y8_,  y8_);
+    const half2 y64 = __halves2half2(y64_, y64_);
+    const half z1_  = __float2half_rn(-1024.0f         - 4.0f);
+    const half z8_  = __float2half_rn(-1024.0f /  8.0f - 4.0f);
+    const half z64_ = __float2half_rn(-1024.0f / 64.0f - 4.0f);
+    const half2 z1  = __halves2half2(z1_,  z1_);
+    const half2 z8  = __halves2half2(z8_,  z8_);
+    const half2 z64 = __halves2half2(z64_, z64_);
+    // Each z constant cancels the (scaled) 1024 bias and subtracts 4, so every output equals q - 4.
+    uint32_t qa = q_0;
+    uint32_t qb = q_1;
+    uint32_t qc = q_2;
+
+    half2_uint32 q0((qa & 0x00070007) | c0); // half2(q[ 0], q[ 1])      + 1024
+    half2_uint32 q1((qa & 0x00380038) | c0); // half2(q[ 2], q[ 3]) *  8 + 1024
+    qa >>= 6;
+    half2_uint32 q2((qa & 0x00070007) | c0); // half2(q[ 4], q[ 5])      + 1024
+    half2_uint32 q3((qa & 0x00380038) | c0); // half2(q[ 6], q[ 7]) *  8 + 1024
+    half2_uint32 q4((qa & 0x01c001c0) | c0); // half2(q[ 8], q[ 9]) * 64 + 1024
+    qa >>= 9;                                // total shift 15: original bits 15 and 31 land on bit 0 of each half-word
+    qa &= 0x00010001;
+    half2_uint32 q5((qb & 0x00070007) | c0); // half2(q[10], q[11])      + 1024
+    half2_uint32 q6((qb & 0x00380038) | c0); // half2(q[12], q[13]) *  8 + 1024
+    qb >>= 6;
+    half2_uint32 q7((qb & 0x00070007) | c0); // half2(q[14], q[15])      + 1024
+    half2_uint32 q8((qb & 0x00380038) | c0); // half2(q[16], q[17]) *  8 + 1024
+    half2_uint32 q9((qb & 0x01c001c0) | c0); // half2(q[18], q[19]) * 64 + 1024
+    qb >>= 8;                                // total shift 14: original bits 15 and 31 land on bit 1
+    qb &= 0x00020002;
+    half2_uint32 q10((qc & 0x00070007) | c0); // half2(q[20], q[21])      + 1024
+    half2_uint32 q11((qc & 0x00380038) | c0); // half2(q[22], q[23]) *  8 + 1024
+    qc >>= 6;
+    half2_uint32 q12((qc & 0x00070007) | c0); // half2(q[24], q[25])      + 1024
+    half2_uint32 q13((qc & 0x00380038) | c0); // half2(q[26], q[27]) *  8 + 1024
+    half2_uint32 q14((qc & 0x01c001c0) | c0); // half2(q[28], q[29]) * 64 + 1024
+    qc >>= 7;                                 // total shift 13: original bits 15 and 31 land on bit 2
+    qc &= 0x00040004;
+    half2_uint32 q15((qa | qb | qc) | c0);    // half2(q[30], q[31]) + 1024, rebuilt from bits 15/31 of the three words
+
+    dq[ 0] = __hadd2( q0.as_half2, z1);
+    dq[ 1] = __hfma2( q1.as_half2, y8,  z8);
+    dq[ 2] = __hadd2( q2.as_half2, z1);
+    dq[ 3] = __hfma2( q3.as_half2, y8,  z8);
+    dq[ 4] = __hfma2( q4.as_half2, y64, z64);
+    dq[ 5] = __hadd2( q5.as_half2, z1);
+    dq[ 6] = __hfma2( q6.as_half2, y8,  z8);
+    dq[ 7] = __hadd2( q7.as_half2, z1);
+    dq[ 8] = __hfma2( q8.as_half2, y8,  z8);
+    dq[ 9] = __hfma2( q9.as_half2, y64, z64);
+    dq[10] = __hadd2(q10.as_half2, z1);
+    dq[11] = __hfma2(q11.as_half2, y8,  z8);
+    dq[12] = __hadd2(q12.as_half2, z1);
+    dq[13] = __hfma2(q13.as_half2, y8,  z8);
+    dq[14] = __hfma2(q14.as_half2, y64, z64);
+    dq[15] = __hadd2(q15.as_half2, z1);
+}
+
+#else
+
+__forceinline__ __device__ void shuffle_3bit_32
+(
+    uint32_t* q,
+    int stride
+)
+{   // Intentionally a no-op when QMODE_3BIT != 1: the fallback decoder in this branch reads fields in natural order.
+}
+
+__forceinline__ __device__ void dequant_3bit_32
+(
+    const uint32_t q_0,   // 96 bits holding 32 consecutive 3-bit values, lowest bits of q_0 first
+    const uint32_t q_1,
+    const uint32_t q_2,
+    half2 (&dq)[16],      // output: 32 decoded values as 16 half2 pairs
+    int stride            // not used by this function
+)
+{   // Reference path: every value is decoded as (field - 4); values 10 and 21 straddle two words.
+    half dqh[32];
+    for (int i = 0; i < 10; i++) dqh[     i] = dq_ns(exb(     q_0, i * 3    , 0x07), 4);
+                                 dqh[10    ] = dq_ns(exb(q_1, q_0,        30, 0x07), 4);
+    for (int i = 0; i < 10; i++) dqh[11 + i] = dq_ns(exb(     q_1, i * 3 + 1, 0x07), 4);
+                                 dqh[21    ] = dq_ns(exb(q_2, q_1,        31, 0x07), 4);
+    for (int i = 0; i < 10; i++) dqh[22 + i] = dq_ns(exb(     q_2, i * 3 + 2, 0x07), 4);
+    // Pair consecutive decoded values into the half2 outputs.
+    for (int i = 0; i < 16; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
+}
+
+#endif
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh
new file mode 100644
index 00000000..ad95edb4
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh
@@ -0,0 +1,227 @@
+#ifndef _qdq_4_cuh
+#define _qdq_4_cuh
+
+#include "qdq_util.cuh"
+#include "../../config.h"
+
+#if QMODE_4BIT == 1
+
+// Permutation (each character is one bit, labelled with the index of its value; MSB on the left):
+//
+// 77775555 33331111  66664444 22220000
+// Even-indexed nibbles end up in the low 16 bits, odd-indexed nibbles in the high 16 bits.
+__forceinline__ __device__ void shuffle_4bit_8
+(
+    uint32_t* q,
+    int stride
+)
+{
+    uint32_t remaining = q[0];  // `stride` is unused: only the first word is read and rewritten
+    uint32_t shuffled = 0;
+
+    #pragma unroll
+    for (int pair = 0; pair < 4; pair++)
+    {
+        const uint32_t even_val = remaining & 0x0f;
+        const uint32_t odd_val = (remaining >> 4) & 0x0f;
+        remaining >>= 8;
+        shuffled |= even_val << (4 * pair);
+        shuffled |= odd_val << (4 * pair + 16);
+    }
+    q[0] = shuffled;
+}
+
+__forceinline__ __device__ void dequant_4bit_8
+(
+    const uint32_t q_0,   // packed word in the layout written by shuffle_4bit_8
+    half2 (&dq)[4],       // output: 8 decoded values as 4 half2 pairs
+    int stride            // not used by this function
+)
+{
+    const uint32_t c0 = 0x64006400;  // two fp16 1024.0 bit patterns; bits OR-ed into the mantissa add to 1024
+    const half y16_ = __float2half_rn(1.0f / 16.0f);
+    const half2 y16 = __halves2half2(y16_, y16_);
+    const half z1_  = __float2half_rn(-1024.0f         - 8.0f);
+    const half z16_ = __float2half_rn(-1024.0f / 16.0f - 8.0f);
+    const half2 z1  = __halves2half2(z1_,  z1_);
+    const half2 z16 = __halves2half2(z16_, z16_);
+    // Each z constant cancels the (scaled) 1024 bias and subtracts 8, so every output equals q - 8.
+    uint32_t qa = q_0;
+    half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1])      + 1024
+    half2_uint32 q1((qa & 0x00f000f0) | c0); // half2(q[ 2], q[ 3]) * 16 + 1024
+    qa >>= 8;
+    half2_uint32 q2((qa & 0x000f000f) | c0); // half2(q[ 4], q[ 5])      + 1024
+    half2_uint32 q3((qa & 0x00f000f0) | c0); // half2(q[ 6], q[ 7]) * 16 + 1024
+
+    dq[0] = __hadd2(q0.as_half2, z1);
+    dq[1] = __hfma2(q1.as_half2, y16, z16);
+    dq[2] = __hadd2(q2.as_half2, z1);
+    dq[3] = __hfma2(q3.as_half2, y16, z16);
+}
+
+__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale
+(
+    const uint32_t zero,   // zero point; assumed small enough to fit the low fp16 mantissa bits — TODO confirm range
+    const half scale,
+    half2 (&z1z16)[2],     // output: additive terms for the "x1" and "x16" nibble positions, premultiplied by scale
+    half2 (&y1y16)[2]      // output: multiplicative terms scale * 1 and scale * 1/16
+)
+{
+    half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero);
+    half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+    // Broadcast the scale to both lanes so the terms below apply to a pair of values at once.
+    half2 scale2 = __half2half2(scale);
+
+    z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half));
+    z1z16[1] = __hmul2(scale2, __half2half2(z16));
+
+    const half y1 = __float2half_rn(1.0f);
+    const half y16 = __float2half_rn(1.0f / 16.0f);
+
+    y1y16[0] = __hmul2(scale2, __half2half2(y1));
+    y1y16[1] = __hmul2(scale2, __half2half2(y16));
+}
+
+__forceinline__ __device__ void dequant_4bit_8_prep_zero
+(
+    const uint32_t zero,   // zero point; assumed small enough to fit the low fp16 mantissa bits — TODO confirm range
+    half2(&z1z16)[2],      // output: unscaled additive terms (-1024 - zero) and (-64 - zero), broadcast to both lanes
+    half2(&y1y16)[2]       // output: unscaled multipliers 1 and 1/16, broadcast to both lanes
+)
+{
+    half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero);
+    half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero));
+
+    z1z16[0] = __half2half2(z1.as_half);
+    z1z16[1] = __half2half2(z16);
+
+    const half y1 = __float2half_rn(1.0f);
+    const half y16 = __float2half_rn(1.0f / 16.0f);
+
+    y1y16[0] = __half2half2(y1);
+    y1y16[1] = __half2half2(y16);
+}
+
+
+__forceinline__ __device__ void dequant_4bit_8_gptq
+(
+    const uint32_t q_0,    // packed word; the masks below expect the layout written by shuffle_4bit_8
+    half2 (&dq)[4],        // output: 8 decoded values as 4 half2 pairs
+    half2 (&z1z16)[2],     // additive terms; presumably from dequant_4bit_8_prep_zero[_scale] — confirm at call sites
+    half2 (&y1y16)[2],     // multiplicative terms; y1y16[0] is only read on the scaled path
+    int stride,            // not used by this function
+    bool scaled            // true: fused multiply-add for every pair; false: plain add for the low-nibble pairs
+)
+{
+    const uint32_t c0 = 0x64006400;  // two fp16 1024.0 bit patterns; bits OR-ed into the mantissa add to 1024
+
+    uint32_t qa = q_0;
+    half2_uint32 q0((qa & 0x000f000f) | c0); // half2( q[0]      + 1024, q[1]      + 1024 )
+    half2_uint32 q1((qa & 0x00f000f0) | c0); // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 )
+    qa >>= 8;
+    half2_uint32 q2((qa & 0x000f000f) | c0); // half2( q[4]      + 1024, q[5]      + 1024 )
+    half2_uint32 q3((qa & 0x00f000f0) | c0); // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 )
+
+    if (scaled)
+    {
+        dq[0] = __hfma2(q0.as_half2, y1y16[0], z1z16[0]);  // half2( q[0] * s - z * s, q[1] * s - z * s)
+        dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]);  // half2( q[2] * s - z * s, q[3] * s - z * s)
+        dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]);
+        dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]);
+    }
+    else
+    {
+        dq[0] = __hadd2(q0.as_half2,           z1z16[0]);  // half2( q[0] - z, q[1] - z )
+        dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]);  // half2( q[2] - z, q[3] - z )
+        dq[2] = __hadd2(q2.as_half2,           z1z16[0]);  // half2( q[4] - z, q[5] - z )
+        dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]);  // half2( q[6] - z, q[7] - z )
+    }
+}
+
+#else
+
+__forceinline__ __device__ void shuffle_4bit_8
+(
+    uint32_t* q,
+    int stride
+)
+{   // Intentionally a no-op when QMODE_4BIT != 1: the fallback decoders in this branch read nibbles in natural order.
+}
+
+__forceinline__ __device__ void dequant_4bit_8
+(
+    const uint32_t q_0,
+    half2 (&dq)[4],
+    int stride
+)
+{
+    // Reference path: decode consecutive nibbles (lowest first), subtract 8, and pair them up.
+    for (int pair = 0; pair < 4; pair++)
+        dq[pair] = __halves2half2(dq_ns(exb(q_0, pair * 8,     0x0f), 8),
+                                  dq_ns(exb(q_0, pair * 8 + 4, 0x0f), 8));
+}
+
+__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale
+(
+    const uint32_t zero,
+    const half scale,
+    half2 (&z1)[2],
+    half2 (&y1)[2]
+)
+{
+    const half neg_zero = __int2half_rn(-((int)zero));       // -zero as a half
+    const half neg_zero_scaled = __hmul(neg_zero, scale);    // -zero * scale
+    z1[0] = __half2half2(neg_zero_scaled);                   // only element 0 of each array is written
+    y1[0] = __half2half2(scale);
+}
+
+__forceinline__ __device__ void dequant_4bit_8_prep_zero
+(
+    const uint32_t zero,
+    half2(&z1)[2],   // output: z1[0] receives -zero broadcast to both lanes; z1[1] is left untouched
+    half2(&y1)[2]    // never written here
+)
+{
+    half z = __int2half_rn(-((int)zero));
+    z1[0] = __half2half2(z);
+}
+
+__forceinline__ __device__ void dequant_4bit_8_gptq
+(
+    const uint32_t q_0,   // eight 4-bit values in natural order, value 0 in the lowest nibble
+    half2 (&dq)[4],       // output: decoded values as four half2 pairs
+    half2 (&z1)[2],       // only z1[0] is read: the additive zero-point term
+    half2 (&y1)[2],       // only y1[0] is read, and only when scaled is true
+    int stride,           // not used by this function
+    bool scaled           // true: dq = value * y1[0] + z1[0]; false: dq = value + z1[0]
+)
+{
+    half2 dqh2[4];  // one half2 per pair of nibbles; exactly four pairs are produced below
+
+    uint32_t qa = q_0;
+    for (int i = 0; i < 4; i++)
+    {
+        half d0 = __int2half_rn(qa & 0x0f); qa >>= 4;
+        half d1 = __int2half_rn(qa & 0x0f); qa >>= 4;
+        dqh2[i] = __halves2half2(d0, d1);
+    }
+
+    if (scaled)
+    {
+        dq[0] = __hfma2(dqh2[0], y1[0], z1[0]);
+        dq[1] = __hfma2(dqh2[1], y1[0], z1[0]);
+        dq[2] = __hfma2(dqh2[2], y1[0], z1[0]);
+        dq[3] = __hfma2(dqh2[3], y1[0], z1[0]);
+    }
+    else
+    {
+        dq[0] = __hadd2(dqh2[0], z1[0]);
+        dq[1] = __hadd2(dqh2[1], z1[0]);
+        dq[2] = __hadd2(dqh2[2], z1[0]);
+        dq[3] = __hadd2(dqh2[3], z1[0]);
+    }
+}
+
+#endif
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh
new file mode 100644
index 00000000..78d81f92
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh
@@ -0,0 +1,207 @@
+#ifndef _qdq_5_cuh
+#define _qdq_5_cuh
+
+#include "qdq_util.cuh"
+#include "../../config.h"
+
+#if QMODE_5BIT == 1
+
+// Permutation:
+// (each character is one bit, labelled with the index 0-9, a-v of the value it belongs to; MSB on the left)
+// v5555533 33311111  u4444422 22200000  (u, v lsb)
+// vbbbbb99 99977777  uaaaaa88 88866666
+// vhhhhhff fffddddd  ugggggee eeeccccc
+// vnnnnnll llljjjjj  ummmmmkk kkkiiiii
+// vtttttrr rrrppppp  usssssqq qqqooooo
+// shuffle_5bit_32: repacks the 32 5-bit values held in q[0 .. 4 * stride] into that layout, in place.
+__forceinline__ __device__ void shuffle_5bit_32
+(
+    uint32_t* q,   // first of five words, spaced `stride` elements apart
+    int stride
+)
+{
+    uint32_t qa = q[0 * stride];
+    uint32_t qb = q[1 * stride];
+    uint32_t qc = q[2 * stride];
+    uint32_t qd = q[3 * stride];
+    uint32_t qe = q[4 * stride];
+
+    // qa: 66555554 44443333  32222211 11100000
+    // qb: ccccbbbb baaaaa99  99988888 77777666
+    // qc: jiiiiihh hhhggggg  fffffeee eedddddc
+    // qd: pppooooo nnnnnmmm  mmlllllk kkkkjjjj
+    // qe: vvvvvuuu uuttttts  ssssrrrr rqqqqqpp
+
+    uint32_t qf = qe >> 22;
+    qe <<= 8;
+    qe |= qd >> 24;
+    qd <<= 6;
+    qd |= qc >> 26;
+    qc <<= 4;
+    qc |= qb >> 28;
+    qb <<= 2;
+    qb |= qa >> 30;
+
+    // qa:   555554 44443333  32222211 11100000
+    // qb:   bbbbba aaaa9999  98888877 77766666
+    // qc:   hhhhhg ggggffff  feeeeedd dddccccc
+    // qd:   nnnnnm mmmmllll  lkkkkkjj jjjiiiii
+    // qe:   ttttts ssssrrrr  rqqqqqpp pppooooo
+    // qf:                          vv vvvuuuuu
+
+    uint32_t za = 0;
+    uint32_t zb = 0;
+    uint32_t zc = 0;
+    uint32_t zd = 0;
+    uint32_t ze = 0;
+    // Per word: the three even-position values go to bits 0..14, the three odd-position values to bits 16..30.
+    for (int i = 0; i < 3; i++) { uint32_t t0 = qa & 0x1f; uint32_t t1 = (qa & 0x3e0) >> 5; qa >>= 10; za |= (t0 << (i * 5)); za |= (t1 << (i * 5 + 16)); }
+    for (int i = 0; i < 3; i++) { uint32_t t0 = qb & 0x1f; uint32_t t1 = (qb & 0x3e0) >> 5; qb >>= 10; zb |= (t0 << (i * 5)); zb |= (t1 << (i * 5 + 16)); }
+    for (int i = 0; i < 3; i++) { uint32_t t0 = qc & 0x1f; uint32_t t1 = (qc & 0x3e0) >> 5; qc >>= 10; zc |= (t0 << (i * 5)); zc |= (t1 << (i * 5 + 16)); }
+    for (int i = 0; i < 3; i++) { uint32_t t0 = qd & 0x1f; uint32_t t1 = (qd & 0x3e0) >> 5; qd >>= 10; zd |= (t0 << (i * 5)); zd |= (t1 << (i * 5 + 16)); }
+    for (int i = 0; i < 3; i++) { uint32_t t0 = qe & 0x1f; uint32_t t1 = (qe & 0x3e0) >> 5; qe >>= 10; ze |= (t0 << (i * 5)); ze |= (t1 << (i * 5 + 16)); }
+
+    // za:  5555533 33311111   4444422 22200000
+    // zb:  bbbbb99 99977777   aaaaa88 88866666
+    // zc:  hhhhhff fffddddd   gggggee eeeccccc
+    // zd:  nnnnnll llljjjjj   mmmmmkk kkkiiiii
+    // ze:  tttttrr rrrppppp   sssssqq qqqooooo
+    // qf:                          vv vvvuuuuu
+    // The two leftover values u and v (ten bits in qf) are scattered one bit per word into bits 15 and 31.
+    za |= ((qf & 0x001) >> 0) << 15;
+    zb |= ((qf & 0x002) >> 1) << 15;
+    zc |= ((qf & 0x004) >> 2) << 15;
+    zd |= ((qf & 0x008) >> 3) << 15;
+    ze |= ((qf & 0x010) >> 4) << 15;
+    za |= ((qf & 0x020) >> 5) << 31;
+    zb |= ((qf & 0x040) >> 6) << 31;
+    zc |= ((qf & 0x080) >> 7) << 31;
+    zd |= ((qf & 0x100) >> 8) << 31;
+    ze |= ((qf & 0x200) >> 9) << 31;
+
+    // za: v5555533 33311111  u4444422 22200000  (u, v lsb)
+    // zb: vbbbbb99 99977777  uaaaaa88 88866666
+    // zc: vhhhhhff fffddddd  ugggggee eeeccccc
+    // zd: vnnnnnll llljjjjj  ummmmmkk kkkiiiii
+    // ze: vtttttrr rrrppppp  usssssqq qqqooooo
+
+    q[0 * stride] = za;
+    q[1 * stride] = zb;
+    q[2 * stride] = zc;
+    q[3 * stride] = zd;
+    q[4 * stride] = ze;
+}
+
+__forceinline__ __device__ void dequant_5bit_32
+(
+    const uint32_t q_0,   // five packed words in the layout written by shuffle_5bit_32
+    const uint32_t q_1,
+    const uint32_t q_2,
+    const uint32_t q_3,
+    const uint32_t q_4,
+    half2 (&dq)[16],      // output: 32 decoded values as 16 half2 pairs
+    int stride            // not used by this function
+)
+{
+    const uint32_t c0 = 0x64006400;  // two fp16 1024.0 bit patterns; bits OR-ed into the mantissa add to 1024
+    const half y32_ = __float2half_rn(1.0f / 32.0f);
+    const half2 y32 = __halves2half2(y32_, y32_);
+    const half z1_  = __float2half_rn(-1024.0f         - 16.0f);
+    const half z32_ = __float2half_rn(-1024.0f / 32.0f - 16.0f);
+    const half2 z1  = __halves2half2(z1_,  z1_);
+    const half2 z32 = __halves2half2(z32_, z32_);
+    // Each z constant cancels the (scaled) 1024 bias and subtracts 16, so every output equals q - 16.
+    uint32_t qa = q_0;
+    uint32_t qb = q_1;
+    uint32_t qc = q_2;
+    uint32_t qd = q_3;
+    uint32_t qe = q_4;
+
+    half2_uint32 q0 ((qa & 0x001f001f) | c0); // half2(q[ 0], q[ 1])      + 1024
+    half2_uint32 q1 ((qa & 0x03e003e0) | c0); // half2(q[ 2], q[ 3]) * 32 + 1024
+    qa >>= 10;
+    half2_uint32 q2 ((qa & 0x001f001f) | c0); // half2(q[ 4], q[ 5])      + 1024
+    qa >>= 5;                                 // total shift 15: original bits 15 and 31 land on bit 0 of each half-word
+    qa &= 0x00010001;
+    half2_uint32 q3 ((qb & 0x001f001f) | c0); // half2(q[ 6], q[ 7])      + 1024
+    half2_uint32 q4 ((qb & 0x03e003e0) | c0); // half2(q[ 8], q[ 9]) * 32 + 1024
+    qb >>= 10;
+    half2_uint32 q5 ((qb & 0x001f001f) | c0); // half2(q[10], q[11])      + 1024
+    qb >>= 4;                                 // total shift 14: original bits 15 and 31 land on bit 1
+    qb &= 0x00020002;
+    half2_uint32 q6 ((qc & 0x001f001f) | c0); // half2(q[12], q[13])      + 1024
+    half2_uint32 q7 ((qc & 0x03e003e0) | c0); // half2(q[14], q[15]) * 32 + 1024
+    qc >>= 10;
+    half2_uint32 q8 ((qc & 0x001f001f) | c0); // half2(q[16], q[17])      + 1024
+    qc >>= 3;                                 // total shift 13: original bits 15 and 31 land on bit 2
+    qc &= 0x00040004;
+    half2_uint32 q9 ((qd & 0x001f001f) | c0); // half2(q[18], q[19])      + 1024
+    half2_uint32 q10((qd & 0x03e003e0) | c0); // half2(q[20], q[21]) * 32 + 1024
+    qd >>= 10;
+    half2_uint32 q11((qd & 0x001f001f) | c0); // half2(q[22], q[23])      + 1024
+    qd >>= 2;                                 // total shift 12: original bits 15 and 31 land on bit 3
+    qd &= 0x00080008;
+    half2_uint32 q12((qe & 0x001f001f) | c0); // half2(q[24], q[25])      + 1024
+    half2_uint32 q13((qe & 0x03e003e0) | c0); // half2(q[26], q[27]) * 32 + 1024
+    qe >>= 10;
+    half2_uint32 q14((qe & 0x001f001f) | c0); // half2(q[28], q[29])      + 1024
+    qe >>= 1;                                 // total shift 11: original bits 15 and 31 land on bit 4
+    qe &= 0x00100010;
+    half2_uint32 q15((qa | qb | qc | qd | qe) | c0); // half2(q[30], q[31]) + 1024, rebuilt from bits 15/31 of the five words
+
+    dq[ 0] = __hadd2( q0.as_half2, z1);
+    dq[ 1] = __hfma2( q1.as_half2, y32, z32);
+    dq[ 2] = __hadd2( q2.as_half2, z1);
+    dq[ 3] = __hadd2( q3.as_half2, z1);
+    dq[ 4] = __hfma2( q4.as_half2, y32, z32);
+    dq[ 5] = __hadd2( q5.as_half2, z1);
+    dq[ 6] = __hadd2( q6.as_half2, z1);
+    dq[ 7] = __hfma2( q7.as_half2, y32, z32);
+    dq[ 8] = __hadd2( q8.as_half2, z1);
+    dq[ 9] = __hadd2( q9.as_half2, z1);
+    dq[10] = __hfma2(q10.as_half2, y32, z32);
+    dq[11] = __hadd2(q11.as_half2, z1);
+    dq[12] = __hadd2(q12.as_half2, z1);
+    dq[13] = __hfma2(q13.as_half2, y32, z32);
+    dq[14] = __hadd2(q14.as_half2, z1);
+    dq[15] = __hadd2(q15.as_half2, z1);
+}
+
+#else
+
+__forceinline__ __device__ void shuffle_5bit_32
+(
+    uint32_t* q,
+    int stride
+)
+{   // Intentionally a no-op when QMODE_5BIT != 1: the fallback decoder in this branch reads fields in natural order.
+}
+
+__forceinline__ __device__ void dequant_5bit_32
+(
+    const uint32_t q_0,   // 160 bits holding 32 consecutive 5-bit values, lowest bits of q_0 first
+    const uint32_t q_1,
+    const uint32_t q_2,
+    const uint32_t q_3,
+    const uint32_t q_4,
+    half2 (&dq)[16],      // output: 32 decoded values as 16 half2 pairs
+    int stride            // not used by this function
+)
+{   // Reference path: every value is decoded as (field - 16); values 6, 12, 19 and 25 straddle two words.
+    half dqh[32];
+    for (int i = 0; i <  6; i++) dqh[     i] = dq_ns(exb(     q_0, i * 5    , 0x1f), 16);
+                                 dqh[ 6    ] = dq_ns(exb(q_1, q_0,        30, 0x1f), 16);
+    for (int i = 0; i <  5; i++) dqh[ 7 + i] = dq_ns(exb(     q_1, i * 5 + 3, 0x1f), 16);
+                                 dqh[12    ] = dq_ns(exb(q_2, q_1,        28, 0x1f), 16);
+    for (int i = 0; i <  6; i++) dqh[13 + i] = dq_ns(exb(     q_2, i * 5 + 1, 0x1f), 16);
+                                 dqh[19    ] = dq_ns(exb(q_3, q_2,        31, 0x1f), 16);
+    for (int i = 0; i <  5; i++) dqh[20 + i] = dq_ns(exb(     q_3, i * 5 + 4, 0x1f), 16);
+                                 dqh[25    ] = dq_ns(exb(q_4, q_3,        29, 0x1f), 16);
+    for (int i = 0; i <  6; i++) dqh[26 + i] = dq_ns(exb(     q_4, i * 5 + 2, 0x1f), 16);
+    // Pair consecutive decoded values into the half2 outputs.
+    for (int i = 0; i < 16; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
+}
+
+#endif
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh
new file mode 100644
index 00000000..562fe695
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh
@@ -0,0 +1,42 @@
+#ifndef _qdq_6_cuh
+#define _qdq_6_cuh
+
+#include "qdq_util.cuh"
+#include "../../config.h"
+
+#if QMODE_6BIT == 1
+
+  // Not implemented
+
+#else
+
+__forceinline__ __device__ void shuffle_6bit_16
+(
+    uint32_t* q,
+    int stride
+)
+{   // Intentionally a no-op: dequant_6bit_16 in this branch reads the 6-bit fields in natural order.
+}
+
+__forceinline__ __device__ void dequant_6bit_16
+(
+    const uint32_t q_0,   // 96 bits holding 16 consecutive 6-bit values, lowest bits of q_0 first
+    const uint32_t q_1,
+    const uint32_t q_2,
+    half2 (&dq)[8],       // output: 16 decoded values as 8 half2 pairs
+    int stride            // not used by this function
+)
+{   // Every value is decoded as (field - 32); values 5 and 10 straddle two words.
+    half dqh[16];
+    for (int i = 0; i < 5; i++) dqh[     i] = dq_ns(exb(     q_0, i * 6    , 0x3f), 32);
+                                dqh[ 5    ] = dq_ns(exb(q_1, q_0,        30, 0x3f), 32);
+    for (int i = 0; i < 4; i++) dqh[ 6 + i] = dq_ns(exb(     q_1, i * 6 + 4, 0x3f), 32);
+                                dqh[10    ] = dq_ns(exb(q_2, q_1,        28, 0x3f), 32);
+    for (int i = 0; i < 5; i++) dqh[11 + i] = dq_ns(exb(     q_2, i * 6 + 2, 0x3f), 32);
+    // Pair consecutive decoded values into the half2 outputs.
+    for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
+}
+
+#endif
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh
new file mode 100644
index 00000000..6e6bedbd
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh
@@ -0,0 +1,38 @@
+#ifndef _qdq_8_cuh
+#define _qdq_8_cuh
+
+#include "qdq_util.cuh"
+#include "../../config.h"
+
+#if QMODE_8BIT == 1
+
+  // Not implemented
+
+#else
+
+__forceinline__ __device__ void shuffle_8bit_4
+(
+    uint32_t* q,
+    int stride
+)
+{   // Intentionally a no-op: dequant_8bit_8 in this branch reads the bytes in natural order.
+}
+
+__forceinline__ __device__ void dequant_8bit_8
+(
+    const uint32_t q_0,   // bytes 0..3, lowest byte first
+    const uint32_t q_1,   // bytes 4..7, lowest byte first
+    half2 (&dq)[4],       // output: 8 decoded values as 4 half2 pairs
+    int stride            // not used by this function
+)
+{   // Every byte is decoded as (byte - 128).
+    half dqh[8];
+    for (int i = 0; i < 4; i++) dqh[i    ] = dq_ns(exb(q_0, i * 8, 0xff), 128);
+    for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), 128);
+    // Pair consecutive decoded values into the half2 outputs.
+    for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
+}
+
+#endif
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh
new file mode 100644
index 00000000..cac9df9c
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh
@@ -0,0 +1,53 @@
+#ifndef _qdq_util_cuh
+#define _qdq_util_cuh
+
+union half2_uint32
+{   // The same 32 bits viewed either as a raw integer or as a half2; the qdq_* headers build half2 values from bit patterns with it.
+    uint32_t as_uint32;
+    half2 as_half2;
+    __device__ half2_uint32(uint32_t val) : as_uint32(val) {}
+    __device__ half2_uint32(half2 val) : as_half2(val) {}
+    __device__ half2_uint32() : as_uint32(0) {}  // default: all bits zero
+};
+
+union half_uint16
+{   // The same 16 bits viewed either as a raw integer or as a half.
+    uint16_t as_uint16;
+    half as_half;
+    __device__ half_uint16(uint16_t val) : as_uint16(val) {}
+    __device__ half_uint16(half val) : as_half(val) {}
+    __device__ half_uint16() : as_uint16(0) {}  // default: all bits zero
+};
+
+// Returns (qs + 1)^2 * max_scale in half precision.
+// Per the original note, max_scale is expected to arrive premultiplied by 1/256.
+__forceinline__ __device__ half dq_scale(const int qs, const half max_scale)
+{
+    const int level = qs + 1;
+    const int level_sq = level * level;
+    const half level_sq_h = __int2half_rn(level_sq);
+    return __hmul(level_sq_h, max_scale);
+}
+
+__forceinline__ __device__ half dq(const int q, const int qzero, const half scale)
+{   // (q - qzero) is formed in integer arithmetic, converted to half, then multiplied by scale.
+    return __hmul(__int2half_rn(q - qzero), scale);
+}
+
+__forceinline__ __device__ half dq_ns(const int q, const int qzero)
+{   // "No scale" variant of dq: returns (q - qzero) converted to half.
+    //return __hsub(__int2half_rn(q), __int2half_rn(qzero));
+    return __int2half_rn(q - qzero);
+}
+
+__forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask)
+{   // Extract a bit field from one word: shift right by `shift`, then keep the bits selected by `mask`.
+    return (int)((q >> shift) & mask);
+}
+
+__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask)
+{   // Two-word variant: funnel-shifts the pair q1:q0 (q0 passed as the low word) right by `shift`, so a field may straddle both.
+    return (int)(__funnelshift_rc(q0, q1, shift) & mask);
+}
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
new file mode 100644
index 00000000..e167bc23
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
@@ -0,0 +1,54 @@
+#ifndef _util_cuh
+#define _util_cuh
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+#include <cstdio>
+#include <ATen/cuda/CUDAContext.h>
+
+#define DIVIDE(x, size) (((x) + (size) - 1) / (size))  // rounds the quotient up for non-negative integer x and positive size
+// printf debugging helpers; the suffix names the format: S string, I %i, X %x, F %f, H half printed via __half2float.
+#define DBGS(__x) printf("%s\n", __x)
+#define DBGI(__x) printf("%s: %i\n", #__x, __x)
+#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y)
+#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z)
+#define DBGX(__x) printf("%s: %x\n", #__x, __x)
+#define DBGX2(__x, __y) printf("%s, %s: %x, %x\n", #__x, #__y, __x, __y)
+#define DBGX3(__x, __y, __z) printf("%s, %s, %s: %x, %x, %x\n", #__x, #__y, #__z, __x, __y, __z)
+#define DBGF(__x) printf("%s: %f\n", #__x, __x)
+#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
+#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)
+#define DBGH(__x) printf("%s: %f\n", #__x, __half2float(__x))
+#define DBGH2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __half2float(__x), __half2float(__y))
+#define DBGH3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z))
+// Mixed helpers: one %i argument followed by one or two half values.
+#define DBGIH(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __half2float(__y))
+#define DBGIH2(__x, __y, __z) printf("%s, %s, %s: %i, %f, %f\n", #__x, #__y, #__z, __x, __half2float(__y), __half2float(__z))
+
+__forceinline__ __device__ half dq_scale_(const int qs, const half max_scale)
+{
+    // ((qs + 1) / 16)^2 * max_scale, with every product rounded to half precision.
+    const half level = __hmul(__int2half_rn(qs + 1), __float2half_rn(1.0f / 16.0f));
+    const half level_sq = __hmul(level, level);
+    return __hmul(level_sq, max_scale);
+}
+
+__forceinline__ __device__ float clamp(float x, float a, float b)
+{   // Limits x to at most b, then to at least a; the lower bound a wins if a > b.
+    return fmaxf(a, fminf(b, x));
+}
+
+#define cuda_check(ans) { gpu_assert((ans), __FILE__, __LINE__); }  // wraps a CUDA call so failures report the call site
+inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess)
+   {
+      fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);  // terminates the process, using the CUDA error code as the exit status
+   }
+}
+
+void print_global_mem(const half* ptr, int rows, int columns, int stride);
+
+#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
new file mode 100644
index 00000000..ff4e1851
--- /dev/null
+++ b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp
@@ -0,0 +1,139 @@
+#include <torch/extension.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+#include <cstdio>
+
+#include "config.h"
+
+#include "cuda/q_matrix.cuh"
+#include "cuda/q_gemm.cuh"
+
+#include "cpp/util.h"
+
+// Some decluttering macros. DTYPE checks x.dtype(); SHAPES checks x.size(dim_x) == y.size(dim_y) * scale_y.
+// The *_OPT variants pass unconditionally when x is on the "meta" device.
+#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
+#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
+#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
+#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
+
+
+// Quant matrix: validates the tensors, builds a heap-allocated QMatrix over their storage and returns
+// its address as an integer handle. Tensors on the "meta" device are treated as absent (NULL pointer).
+uintptr_t make_q_matrix
+(
+    torch::Tensor q_weight,
+    torch::Tensor q_perm,
+    torch::Tensor q_invperm,
+    torch::Tensor q_scale,
+    torch::Tensor q_scale_max,
+    torch::Tensor q_groups,
+    torch::Tensor q_group_map,
+    torch::Tensor gptq_qzeros,
+    torch::Tensor gptq_scales,
+    torch::Tensor gptq_g_idx,
+    torch::Tensor temp_dq
+)
+{
+    TORCH_CHECK_DTYPE(q_weight, kInt);
+    TORCH_CHECK_DTYPE_OPT(q_perm, kShort);
+    TORCH_CHECK_DTYPE_OPT(q_invperm, kShort);
+    TORCH_CHECK_DTYPE_OPT(q_scale, kInt);
+    TORCH_CHECK_DTYPE_OPT(q_scale_max, kHalf);
+    TORCH_CHECK_DTYPE_OPT(q_groups, kShort);
+    TORCH_CHECK_DTYPE_OPT(q_group_map, kShort);
+    TORCH_CHECK_DTYPE_OPT(gptq_qzeros, kInt);
+    TORCH_CHECK_DTYPE_OPT(gptq_scales, kHalf);
+    TORCH_CHECK_DTYPE_OPT(gptq_g_idx, kInt);
+
+    TORCH_CHECK_SHAPES(q_perm, 0, q_invperm, 0, 1);
+
+    int device = q_weight.device().index();
+    int width = q_weight.size(1);
+    int groups;
+    int height;
+    // A non-meta q_scale selects the q_scale/q_invperm layout; otherwise the gptq_* tensors describe the matrix.
+    if (!q_scale.device().is_meta())
+    {
+        TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
+        TORCH_CHECK_SHAPES(q_scale_max, 0, q_scale, 0, 1);
+        groups = q_scale.size(0);
+        height = q_invperm.size(0);
+    }
+    else
+    {
+        TORCH_CHECK_SHAPES(q_weight, 1, gptq_qzeros, 1, 8);
+        TORCH_CHECK_SHAPES(q_weight, 1, gptq_scales, 1, 1);
+        groups = gptq_qzeros.size(0);
+        height = q_weight.size(0) * 8;
+    }
+    // Compare in 64 bits: the int product width * height overflows once it exceeds INT_MAX.
+    TORCH_CHECK(temp_dq.size(0) >= (int64_t) width * (int64_t) height, "Insufficient size of temp_dq buffer")
+
+    QMatrix* m = new QMatrix
+    (
+        device,
+        height,
+        width,
+        groups,
+        (uint32_t*) q_weight.data_ptr(),
+        q_perm.device().is_meta() ? NULL : (uint16_t*) q_perm.data_ptr(),
+        q_invperm.device().is_meta() ? NULL : (uint16_t*) q_invperm.data_ptr(),
+        q_scale.device().is_meta() ? NULL : (uint32_t*) q_scale.data_ptr(),
+        q_scale_max.device().is_meta() ? NULL : (half*) q_scale_max.data_ptr(),
+        q_groups.device().is_meta() ? NULL : (uint16_t*) q_groups.data_ptr(),
+        q_group_map.device().is_meta() ? NULL : (uint16_t*) q_group_map.data_ptr(),
+        gptq_qzeros.device().is_meta() ? NULL : (uint32_t*) gptq_qzeros.data_ptr(),
+        gptq_scales.device().is_meta() ? NULL : (half*) gptq_scales.data_ptr(),
+        gptq_g_idx.device().is_meta() ? NULL : (uint32_t*) gptq_g_idx.data_ptr(),
+        (half*) temp_dq.data_ptr()
+    );
+
+    if (m->failed) throw std::runtime_error("CUDA out of memory");  // NOTE(review): m is not freed on this path - confirm whether `delete m` is safe
+
+    return reinterpret_cast<uintptr_t> (m);
+}
+
+void gemm_half_q_half  // half GEMM of a (m x k) with the quantized matrix behind handle b into c (m x n)
+(
+    torch::Tensor a,        // half input; a.size(1) must equal qm->height
+    uintptr_t b,            // QMatrix handle as returned by make_q_matrix
+    torch::Tensor c,        // half buffer passed to the kernel as a mutable pointer; c.size(1) must equal qm->width
+    bool force_cuda         // forwarded unchanged to gemm_half_q_half_cuda
+)
+{
+    QMatrix* qm = reinterpret_cast<QMatrix*> (b);
+    TORCH_CHECK(qm != NULL, "b is not a valid q_matrix handle");  // a zero handle would otherwise be dereferenced below
+    TORCH_CHECK_DTYPE(a, kHalf);
+    TORCH_CHECK_DTYPE(c, kHalf);
+    TORCH_CHECK_SHAPES(a, 0, c, 0, 1);
+    TORCH_CHECK(qm->height == a.size(1), "a and b have incompatible shapes")
+    TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")
+    // Make a's device current for the duration of the call.
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+    // `true` and NULL are passed positionally; their meaning is defined by gemm_half_q_half_cuda (cuda/q_gemm.cuh).
+    gemm_half_q_half_cuda
+    (
+        at::cuda::getCurrentCUDABlasHandle(),
+        (const half*) a.data_ptr(),
+        qm,
+        (half*) c.data_ptr(),
+        c.size(0), // m
+        c.size(1), // n
+        a.size(1), // k
+        true,
+        NULL,
+        force_cuda
+    );
+}
+
+// Bindings
+// Registers both entry points under their C++ names; the last m.def argument is the docstring text.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("make_q_matrix", &make_q_matrix, "make_q_matrix");
+    m.def("gemm_half_q_half", &gemm_half_q_half, "gemm_half_q_half");
+}
diff --git a/server/exllamav2_kernels/setup.py b/server/exllamav2_kernels/setup.py
new file mode 100644
index 00000000..4a16b546
--- /dev/null
+++ b/server/exllamav2_kernels/setup.py
@@ -0,0 +1,28 @@
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+import torch
+
+extra_cuda_cflags = ["-lineinfo", "-O3"]
+
+if torch.version.hip:
+    extra_cuda_cflags += ["-DHIPBLAS_USE_HIP_HALF"]
+
+extra_compile_args = {
+    "nvcc": extra_cuda_cflags,
+}
+
+setup(
+    name="exllamav2_kernels",
+    ext_modules=[
+        CUDAExtension(
+            name="exllamav2_kernels",
+            sources=[
+                "exllamav2_kernels/ext.cpp",
+                "exllamav2_kernels/cuda/q_matrix.cu",
+                "exllamav2_kernels/cuda/q_gemm.cu",
+            ],
+            extra_compile_args=extra_compile_args,
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/server/marlin/COPYRIGHT b/server/marlin/COPYRIGHT
new file mode 100644
index 00000000..69f3b8e6
--- /dev/null
+++ b/server/marlin/COPYRIGHT
@@ -0,0 +1,20 @@
+These kernels were vendored from vLLM. The Marlin kernels were developed
+by Elias Frantar and extended by Neural Magic.
+
+---
+
+Copyright (C) Marlin.2024 Elias Frantar
+Modified by Neural Magic
+Copyright 2024 The vLLM team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+         http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/server/marlin/marlin_kernels/__init__.pyi b/server/marlin/marlin_kernels/__init__.pyi
new file mode 100644
index 00000000..663984d0
--- /dev/null
+++ b/server/marlin/marlin_kernels/__init__.pyi
@@ -0,0 +1,61 @@
+import torch
+
+def gptq_marlin_gemm(
+    a: torch.Tensor,
+    b_q_weight: torch.Tensor,
+    b_scales: torch.Tensor,
+    g_idx: torch.Tensor,
+    perm: torch.Tensor,
+    workspace: torch.Tensor,
+    num_bits: int,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+    is_k_full: bool,
+) -> torch.Tensor:
+    """
+    Matrix multiplication using Marlin kernels with GPTQ compatibility. Extends
+    `marlin_gemm` with the `g_idx`, `perm`, `num_bits` and `is_k_full` arguments.
+    """
+    ...
+
+def gptq_marlin_24_gemm(
+    a: torch.Tensor,
+    b_q_weight: torch.Tensor,
+    b_meta: torch.Tensor,
+    b_scales: torch.Tensor,
+    workspace: torch.Tensor,
+    num_bits: int,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+) -> torch.Tensor:
+    """
+    Matrix multiplication using Marlin kernels. This is an extension of `marlin_gemm`
+    that supports 2:4 sparsity; it additionally takes `b_meta` and `num_bits`.
+    """
+    ...
+
+def gptq_marlin_repack(
+    b_q_weight: torch.Tensor,
+    perm: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
+    """Repack GPTQ parameters for Marlin kernels; stub for the `gptq_marlin_repack` binding in ext.cpp."""
+    ...
+
+def marlin_gemm(
+    a: torch.Tensor,
+    b_q_weight: torch.Tensor,
+    b_scales: torch.Tensor,
+    workspace: torch.Tensor,
+    size_m: int,
+    size_n: int,
+    size_k: int,
+) -> torch.Tensor:
+    """
+    Matrix multiplication using Marlin kernels; stub for the `marlin_gemm` ("Marlin gemm") binding in ext.cpp.
+    """
+    ...
diff --git a/server/marlin/marlin_kernels/ext.cpp b/server/marlin/marlin_kernels/ext.cpp
new file mode 100644
index 00000000..37eccef6
--- /dev/null
+++ b/server/marlin/marlin_kernels/ext.cpp
@@ -0,0 +1,12 @@
+#include <torch/extension.h>
+
+#include "ext.hh"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings for the four functions declared in ext.hh
+  m.def("gptq_marlin_gemm", &gptq_marlin_gemm,
+        "Marlin gemm with GPTQ compatibility");
+  m.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm, "Marlin sparse 2:4 gemm");
+  m.def("gptq_marlin_repack", &gptq_marlin_repack,
+        "Repack GPTQ parameters for Marlin");
+  m.def("marlin_gemm", &marlin_gemm, "Marlin gemm");
+}
diff --git a/server/marlin/marlin_kernels/ext.hh b/server/marlin/marlin_kernels/ext.hh
new file mode 100644
index 00000000..d1caaab7
--- /dev/null
+++ b/server/marlin/marlin_kernels/ext.hh
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <torch/library.h>
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+// No support for async
+#else
+
+torch::Tensor gptq_marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
+                               torch::Tensor &b_scales, torch::Tensor &g_idx,
+                               torch::Tensor &perm, torch::Tensor &workspace,
+                               int64_t num_bits, int64_t size_m, int64_t size_n,
+                               int64_t size_k, bool is_k_full);
+
+torch::Tensor gptq_marlin_24_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
+                                  torch::Tensor &b_meta,
+                                  torch::Tensor &b_scales,
+                                  torch::Tensor &workspace, int64_t num_bits,
+                                  int64_t size_m, int64_t size_n,
+                                  int64_t size_k);
+
+torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
+                                 int64_t size_k, int64_t size_n,
+                                 int64_t num_bits);
+
+torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight,
+                          torch::Tensor &b_scales, torch::Tensor &workspace,
+                          int64_t size_m, int64_t size_n, int64_t size_k);
+
+#endif
diff --git a/server/marlin/marlin_kernels/gptq_marlin.cu b/server/marlin/marlin_kernels/gptq_marlin.cu
new file mode 100644
index 00000000..0beb9de1
--- /dev/null
+++ b/server/marlin/marlin_kernels/gptq_marlin.cu
@@ -0,0 +1,1870 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adapted from https://github.com/IST-DASLab/marlin
+ */
+
+#include "gptq_marlin.cuh"
+#include "gptq_marlin_dtypes.cuh"
+
+#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t)               \
+  static_assert(std::is_same<scalar_t, half>::value ||          \
+                    std::is_same<scalar_t, nv_bfloat16>::value, \
+                "only float16 and bfloat16 is supported");
+
+template <typename T>
+inline std::string str(T x) {  // shorthand for std::to_string(x)
+  return std::to_string(x);
+}
+
+namespace gptq_marlin {
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
+                                    int const* __restrict__ perm_int_ptr,
+                                    int4* __restrict__ out_int4_ptr, int size_m,
+                                    int size_k, int block_rows) {}  // empty stub used when __CUDA_ARCH__ < 800
+
+template <typename scalar_t,          // compute dtype, half or nv_bfloat16
+          const int num_bits,         // number of bits used for weights
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const bool has_act_order,    // whether act_order is enabled
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void Marlin(
+    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,        // fp16 output buffer of shape mxn
+    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
+                                          // (k/groupsize)xn
+    const int* __restrict__ g_idx,        // int32 group indices of shape k
+    int num_groups,  // number of scale groups per output channel
+    int prob_m,      // batch dimension m
+    int prob_n,      // output dimension n
+    int prob_k,      // reduction dimension k
+    int* locks       // extra global storage for barrier synchronization
+) {}  // empty stub used when __CUDA_ARCH__ < 800
+
+}  // namespace gptq_marlin
+
+torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                               torch::Tensor& b_scales, torch::Tensor& g_idx,
+                               torch::Tensor& perm, torch::Tensor& workspace,
+                               int64_t num_bits, int64_t size_m, int64_t size_n,
+                               int64_t size_k, bool is_k_full) {  // fallback when __CUDA_ARCH__ < 800
+  TORCH_CHECK_NOT_IMPLEMENTED(false,  // always fails; NOTE(review): message names marlin_gemm, not gptq_marlin_gemm - confirm
+                              "marlin_gemm(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});  // placed after the always-failing check; keeps a return statement in the function
+}
+
+#else
+
+// m16n8k16 tensor core mma instruction with fp16 or bf16 inputs (chosen by
+// scalar_t) and fp32 output/accumulation; frag_c is updated in place.
+template <typename scalar_t>
+__device__ inline void mma(const typename ScalarType<scalar_t>::FragA& a_frag,
+                           const typename ScalarType<scalar_t>::FragB& frag_b,
+                           typename ScalarType<scalar_t>::FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);  // 4 x 32-bit registers
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);  // 2 x 32-bit registers
+  float* c = reinterpret_cast<float*>(&frag_c);  // 4 floats, both input and output operands
+  if constexpr (std::is_same<scalar_t, half>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    asm volatile(
+        "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 "
+        "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+  } else {
+    STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);  // any other scalar_t is rejected at compile time
+  }
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout (ldmatrix m8n8.x4 into 4 registers).
+template <typename scalar_t>
+__device__ inline void ldsm4(typename ScalarType<scalar_t>::FragA& frag_a,
+                             const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);  // destination viewed as 4 x 32-bit registers
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));  // shared-space address, narrowed to 32 bits
+  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+               : "r"(smem));
+}
+
+// Lookup-table based 3-input logical operation (PTX lop3.b32); `lut` is the
+// truth table, passed as an immediate. Explicitly used for dequantization as
+// the compiler does not seem to automatically recognize it in all cases.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(res)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return res;
+}
+
+// Constructs destination register by taking bytes from 2 sources, `a` and the
+// immediate `start_byte`, as selected by the immediate `mask` (PTX prmt.b32).
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t res;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(res)
+               : "r"(a), "n"(start_byte), "n"(mask));
+  return res;
+}
+
+// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 (or
+// bf16) values. We mostly follow the strategy in the link below, with some small
+// changes:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385
+template <typename scalar_t>
+__device__ inline typename ScalarType<scalar_t>::FragB dequant_4bit(int q) {
+  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);  // generic case only validates scalar_t; half and nv_bfloat16 are specialized
+}
+
+template <>
+__device__ inline typename ScalarType<half>::FragB dequant_4bit<half>(int q) {
+  const int LO = 0x000f000f;  // bits 0-3 of each 16-bit half
+  const int HI = 0x00f000f0;  // bits 4-7 of each 16-bit half
+  const int EX = 0x64006400;  // fp16 1024.0 in both halves
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);  // each half reads as fp16 1024 + v
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);  // each half reads as fp16 1024 + 16 * v
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64086408;  // fp16 1032.0 = 1024 + 8
+  const int MUL = 0x2c002c00;  // fp16 0.0625 = 1/16
+  const int ADD = 0xd480d480;  // fp16 -72.0 = -(1024 / 16 + 8)
+  typename ScalarType<half>::FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&SUB));
+  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&MUL),
+                      *reinterpret_cast<const half2*>(&ADD));
+  return frag_b;
+}
+
+template <>
+__device__ inline typename ScalarType<nv_bfloat16>::FragB
+dequant_4bit<nv_bfloat16>(int q) {
+  static constexpr uint32_t MASK = 0x000f000f;  // bits 0-3 of each 16-bit half
+  static constexpr uint32_t EX = 0x43004300;  // bf16 128.0 in both halves
+
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);  // each half reads as bf16 128 + v
+  q >>= 4;  // move the next 4 bits of each half under MASK
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
+
+  typename ScalarType<nv_bfloat16>::FragB frag_b;
+  static constexpr uint32_t MUL = 0x3F803F80;  // bf16 1.0
+  static constexpr uint32_t ADD = 0xC308C308;  // bf16 -136.0 = -(128 + 8)
+
+  frag_b[0] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&lo),
+                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  frag_b[1] = __hfma2(*reinterpret_cast<nv_bfloat162*>(&hi),
+                      *reinterpret_cast<const nv_bfloat162*>(&MUL),
+                      *reinterpret_cast<const nv_bfloat162*>(&ADD));
+  return frag_b;
+}
+
+// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or
+// bf16. Reference:
+// - FP16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85
+// - BF16:
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175
+template <typename scalar_t>
+__device__ inline typename ScalarType<scalar_t>::FragB dequant_8bit(int q) {
+  STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t);  // generic case only validates scalar_t; half and nv_bfloat16 are specialized
+}
+
+template <>
+__device__ inline typename ScalarType<half>::FragB dequant_8bit<half>(int q) {
+  static constexpr uint32_t mask_for_elt_01 = 0x5250;  // result bytes, low to high: q byte 0, 0x64, q byte 2, 0x64
+  static constexpr uint32_t mask_for_elt_23 = 0x5351;  // result bytes, low to high: q byte 1, 0x64, q byte 3, 0x64
+  static constexpr uint32_t start_byte_for_fp16 = 0x64646464;  // 0x64 is the high byte of fp16 1024.0
+  // Pair each selected byte of q with 0x64 so every 16-bit half reads as fp16 (1024 + byte).
+  uint32_t lo = prmt<start_byte_for_fp16, mask_for_elt_01>(q);
+  uint32_t hi = prmt<start_byte_for_fp16, mask_for_elt_23>(q);
+  // Subtracting 1152 (= 1024 + 128) leaves (byte - 128) in every half.
+  static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480;
+
+  typename ScalarType<half>::FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  frag_b[1] = __hsub2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&I8s_TO_F16s_MAGIC_NUM));
+  return frag_b;
+}
+
+template <>
+__device__ inline typename ScalarType<nv_bfloat16>::FragB
+dequant_8bit<nv_bfloat16>(int q) {
+  typename ScalarType<nv_bfloat16>::FragB frag_b;
+
+  float fp32_intermediates[4];
+  uint32_t* fp32_intermediates_casted =
+      reinterpret_cast<uint32_t*>(fp32_intermediates);
+  // Put one byte of q into the low mantissa byte of fp32 2^23, giving 8388608 + byte.
+  static constexpr uint32_t fp32_base = 0x4B000000;
+  fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650);  // byte 0 of q
+  fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652);  // byte 2 of q
+  fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651);  // byte 1 of q
+  fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653);  // byte 3 of q
+  // 8388736 = 2^23 + 128, so each float becomes (byte - 128).
+  fp32_intermediates[0] -= 8388736.f;
+  fp32_intermediates[1] -= 8388736.f;
+  fp32_intermediates[2] -= 8388736.f;
+  fp32_intermediates[3] -= 8388736.f;
+  // Selector 0x7632 keeps bytes 2-3 (the upper 16 bits) of both floats and packs them into one word.
+  uint32_t* bf16_result_ptr = reinterpret_cast<uint32_t*>(&frag_b);
+  bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0],
+                                   fp32_intermediates_casted[1], 0x7632);
+  bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2],
+                                   fp32_intermediates_casted[3], 0x7632);
+
+  return frag_b;
+}
+
+// Multiply dequantized values by the corresponding quantization scale; used
+// only for grouped quantization. Both entries of frag_b get the same factor.
+template <typename scalar_t>
+__device__ inline void scale(typename ScalarType<scalar_t>::FragB& frag_b,
+                             typename ScalarType<scalar_t>::FragS& frag_s,
+                             int i) {  // i: index of the scale inside frag_s, viewed as scalar_t[]
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s =  // presumably the scalar duplicated into both lanes - confirm against ScalarType::num2num2
+      ScalarType<scalar_t>::num2num2(reinterpret_cast<scalar_t*>(&frag_s)[i]);
+  frag_b[0] = __hmul2(frag_b[0], s);
+  frag_b[1] = __hmul2(frag_b[1], s);
+}
+
+// Like scale(), but for act_order (each K is multiplied individually): the four values in frag_b use element i of frag_s_1..frag_s_4.
+template <typename scalar_t>
+__device__ inline void scale4(typename ScalarType<scalar_t>::FragB& frag_b,
+                              typename ScalarType<scalar_t>::FragS& frag_s_1,
+                              typename ScalarType<scalar_t>::FragS& frag_s_2,
+                              typename ScalarType<scalar_t>::FragS& frag_s_3,
+                              typename ScalarType<scalar_t>::FragS& frag_s_4,
+                              int i) {
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  scalar_t2 s_val_1_2;  // scales for the two lanes of frag_b[0]
+  s_val_1_2.x = reinterpret_cast<scalar_t*>(&frag_s_1)[i];
+  s_val_1_2.y = reinterpret_cast<scalar_t*>(&frag_s_2)[i];
+
+  scalar_t2 s_val_3_4;  // scales for the two lanes of frag_b[1]
+  s_val_3_4.x = reinterpret_cast<scalar_t*>(&frag_s_3)[i];
+  s_val_3_4.y = reinterpret_cast<scalar_t*>(&frag_s_4)[i];
+
+  frag_b[0] = __hmul2(frag_b[0], s_val_1_2);
+  frag_b[1] = __hmul2(frag_b[1], s_val_3_4);
+}
+
+// Given 2 floats multiply by 2 scales (halves): c[0] and c[1] are scaled in place by the first two scalar_t values of s.
+template <typename scalar_t>
+__device__ inline void scale_float(float* c,
+                                   typename ScalarType<scalar_t>::FragS& s) {
+  scalar_t* s_ptr = reinterpret_cast<scalar_t*>(&s);  // view the fragment as scalar_t[]
+  c[0] = __fmul_rn(c[0], ScalarType<scalar_t>::num2float(s_ptr[0]));
+  c[1] = __fmul_rn(c[1], ScalarType<scalar_t>::num2float(s_ptr[1]));
+}
+
+// Wait until barrier reaches `count`, then lock for current threadblock. Thread 0 spins on *lock; the block then syncs.
+__device__ inline void barrier_acquire(int* lock, int count) {
+  if (threadIdx.x == 0) {  // only thread 0 polls the lock
+    int state = -1;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
+                   : "=r"(state)
+                   : "l"(lock));
+    while (state != count);
+  }
+  __syncthreads();  // the other threads wait here until thread 0 has observed `count`
+}
+
+// Release barrier and increment visitation count; with `reset` the lock is set back to 0 instead of incremented.
+__device__ inline void barrier_release(int* lock, bool reset = false) {
+  __syncthreads();  // every thread of the block reaches this point before thread 0 touches the lock
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;  // plain store; the fence and atomic add below are skipped
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
+                 :
+                 : "l"(lock), "r"(val));
+  }
+}
+
+// For a given "a" of size [M,K] performs a permutation of the K columns based
+// on the given "perm" indices: out[row][k] = a[row][perm[k]].
+//
+//   a_int4_ptr    input matrix; rows are addressed as half elements
+//   perm_int_ptr  source column index for each output column
+//   out_int4_ptr  output matrix, addressed with the same row offsets
+//   size_m        number of rows (M)
+//   size_k        number of columns (K)
+//   block_rows    consecutive rows handled by one thread block
+__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
+                                    int const* __restrict__ perm_int_ptr,
+                                    int4* __restrict__ out_int4_ptr, int size_m,
+                                    int size_k, int block_rows) {
+  // Rows [first_row, last_row) belong to this block. The range is cut off at
+  // size_m, so a block that starts past the last row does nothing.
+  const int first_row = block_rows * blockIdx.x;
+  int last_row = first_row + block_rows;
+  if (last_row > size_m) {
+    last_row = size_m;
+  }
+
+  // Distance between consecutive rows, in int4 (16-byte) units.
+  // (assumes a row is a whole number of int4s - TODO confirm)
+  const int row_stride = size_k * sizeof(half) / 16;
+
+  // Each thread moves one element per pass; `tail` columns remain after the
+  // full passes of default_threads columns each.
+  const int full_passes = size_k / default_threads;
+  const int tail = size_k % default_threads;
+
+  for (int row = first_row; row < last_row; row++) {
+    const int offset = row * row_stride;
+
+    half const* src_row =
+        reinterpret_cast<half const*>(a_int4_ptr + offset);
+    half* dst_row = reinterpret_cast<half*>(out_int4_ptr + offset);
+
+    // Column handled by this thread in the current pass.
+    int k = threadIdx.x;
+
+    for (int pass = 0; pass < full_passes; pass++) {
+      dst_row[k] = src_row[perm_int_ptr[k]];
+      k += default_threads;
+    }
+
+    // Remainder: only the first `tail` threads still have a column to move;
+    // k already points just past the last full pass.
+    if (threadIdx.x < tail) {
+      dst_row[k] = src_row[perm_int_ptr[k]];
+    }
+  }
+}
+
+template <typename scalar_t,          // compute dtype, half or nv_float16
+          const int num_bits,         // number of bits used for weights
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const bool has_act_order,    // whether act_order is enabled
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void Marlin(
+    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,        // fp16 output buffer of shape mxn
+    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
+                                          // (k/groupsize)xn
+    const int* __restrict__ g_idx,        // int32 group indices of shape k
+    int num_groups,  // number of scale groups per output channel
+    int prob_m,      // batch dimension m
+    int prob_n,      // output dimension n
+    int prob_k,      // reduction dimension k
+    int* locks       // extra global storage for barrier synchronization
+) {
+  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
+  // same size, which might involve multiple column "slices" (of width 16 *
+  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
+  // example:
+  //   0 1 3
+  //   0 2 3
+  //   1 2 4
+  // While this kind of partitioning makes things somewhat more complicated, it
+  // ensures good utilization of all SMs for many kinds of shape and GPU
+  // configurations, while requiring as few slow global cross-threadblock
+  // reductions as possible.
+  using Dtype = ScalarType<scalar_t>;
+  using scalar_t2 = typename ScalarType<scalar_t>::scalar_t2;
+  using FragA = typename ScalarType<scalar_t>::FragA;
+  using FragB = typename ScalarType<scalar_t>::FragB;
+  using FragC = typename ScalarType<scalar_t>::FragC;
+  using FragS = typename ScalarType<scalar_t>::FragS;
+
+  constexpr int pack_factor = 32 / num_bits;
+
+  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
+  // better partitioning with less reductions
+  int parallel = 1;
+  if (prob_m > 16 * thread_m_blocks) {
+    parallel = prob_m / (16 * thread_m_blocks);
+    prob_m = 16 * thread_m_blocks;
+  }
+
+  int k_tiles = prob_k / 16 / thread_k_blocks;
+  int n_tiles = prob_n / 16 / thread_n_blocks;
+  int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x);
+
+  if constexpr (!has_act_order && group_blocks != -1) {
+    if (group_blocks >= thread_k_blocks) {
+      // Ensure that the number of tiles in each stripe is a multiple of the
+      // groupsize; this avoids an annoying special case where a stripe starts
+      // in the middle of group.
+      iters = (group_blocks / thread_k_blocks) *
+              div_ceil(iters, (group_blocks / thread_k_blocks));
+    }
+  }
+
+  int slice_row = (iters * blockIdx.x) % k_tiles;
+  int slice_col_par = (iters * blockIdx.x) / k_tiles;
+  int slice_col = slice_col_par;
+  int slice_iters;  // number of threadblock tiles in the current slice
+  int slice_count =
+      0;          // total number of active threadblocks in the current slice
+  int slice_idx;  // index of threadblock in current slice; numbered bottom to
+                  // top
+
+  // We can easily implement parallel problem execution by just remapping
+  // indices and advancing global pointers
+  if (slice_col_par >= n_tiles) {
+    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
+    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
+    locks += (slice_col_par / n_tiles) * n_tiles;
+    slice_col = slice_col_par % n_tiles;
+  }
+
+  // Compute all information about the current slice which is required for
+  // synchronization.
+  auto init_slice = [&]() {
+    slice_iters =
+        iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
+    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
+    if (slice_iters == 0) return;
+    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
+    slice_count = 1;
+    slice_idx = 0;
+    int col_first = iters * div_ceil(k_tiles * slice_col_par, iters);
+    if (col_first <= k_tiles * (slice_col_par + 1)) {
+      int col_off = col_first - k_tiles * slice_col_par;
+      slice_count = div_ceil(k_tiles - col_off, iters);
+      if (col_off > 0) slice_count++;
+      int delta_first = iters * blockIdx.x - col_first;
+      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
+        slice_idx = slice_count - 1;
+      else {
+        slice_idx = slice_count - 1 - delta_first / iters;
+        if (col_off > 0) slice_idx--;
+      }
+    }
+    if (slice_col == n_tiles) {
+      A += 16 * thread_m_blocks * prob_k / 8;
+      C += 16 * thread_m_blocks * prob_n / 8;
+      locks += n_tiles;
+      slice_col = 0;
+    }
+  };
+  init_slice();
+
+  // A sizes/strides
+
+  // stride of the A matrix in global memory
+  int a_gl_stride = prob_k / 8;
+  // stride of an A matrix tile in shared memory
+  constexpr int a_sh_stride = 16 * thread_k_blocks / 8;
+  // delta between subsequent A tiles in global memory
+  constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8;
+  // between subsequent accesses within a tile
+  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory writes
+  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory tile reads
+  constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4));
+  // within a shared memory tile
+  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
+  // overall size of a tile
+  constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
+  // number of shared write iterations for a tile
+  constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta);
+
+  // B sizes/strides
+  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
+  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
+  constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
+  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
+
+  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
+  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
+  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
+  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
+
+  // Scale sizes/strides without act_order
+  int s_gl_stride = prob_n / 8;
+  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
+  constexpr int s_tb_groups =
+      !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks
+          ? thread_k_blocks / group_blocks
+          : 1;
+  constexpr int s_sh_stage = s_tb_groups * s_sh_stride;
+  int s_gl_rd_delta = s_gl_stride;
+
+  // Scale size/strides with act_order
+  constexpr int tb_k = 16 * thread_k_blocks;
+  constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0;
+  // constexpr int act_s_row_stride      = 1;
+  // int           act_s_col_stride      = act_s_row_stride * num_groups;
+  int act_s_col_stride = 1;
+  int act_s_col_warp_stride = act_s_col_stride * 8;
+  int tb_n_warps = thread_n_blocks / 4;
+  int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps;
+
+  // Global A read index of current thread.
+  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  a_gl_rd += a_gl_rd_delta_o * slice_row;
+  // Shared write index of current thread.
+  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  // Shared read index.
+  int a_sh_rd =
+      a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
+  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
+
+  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
+                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
+  b_gl_rd += b_sh_stride * slice_col;
+  b_gl_rd += b_gl_rd_delta_o * slice_row;
+  int b_sh_wr = threadIdx.x * b_thread_vecs;
+  int b_sh_rd = threadIdx.x * b_thread_vecs;
+
+  // For act_order
+  constexpr int k_iter_size = tb_k / b_sh_wr_iters;
+  int slice_k_start = tb_k * slice_row;
+  int slice_k_finish = slice_k_start + tb_k * slice_iters;
+  int slice_k_start_shared_fetch = slice_k_start;
+  int slice_n_offset = act_s_col_tb_stride * slice_col;
+
+  // No act_order
+  int s_gl_rd;
+  if constexpr (!has_act_order) {
+    if constexpr (group_blocks == -1) {
+      s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+    } else {
+      s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
+                s_sh_stride * slice_col + threadIdx.x;
+    }
+  }
+  int s_sh_wr = threadIdx.x;
+  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
+
+  // We use a different scale layout for grouped and column-wise quantization as
+  // we scale a `half2` tile in column-major layout in the former and in
+  // row-major in the latter case.
+  int s_sh_rd;
+  if constexpr (group_blocks != -1)
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+              (threadIdx.x % 32) / 4;
+  else
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+              (threadIdx.x % 32) % 4;
+
+  // Precompute which thread should not read memory in which iterations; this is
+  // needed if there are more threads than required for a certain tilesize or
+  // when the batchsize is not a multiple of 16.
+  bool a_sh_wr_pred[a_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
+
+  // To ensure that writing and reading A tiles to/from shared memory, the
+  // latter in fragment format, is fully bank conflict free, we need to use a
+  // rather fancy XOR-based layout. The key here is that neither reads nor
+  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
+  // same shared memory banks. Further, it seems (based on NSight-Compute) that
+  // each warp must also write a consecutive memory segment?
+  auto transform_a = [&](int i) {
+    int row = i / a_gl_rd_delta_o;
+    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
+  };
+  // Since the computation of this remapping is non-trivial and, due to our main
+  // loop unrolls, all shared memory accesses are static, we simply precompute
+  // both transformed reads and writes.
+  int a_sh_wr_trans[a_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
+  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
+  #pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++) {
+  #pragma unroll
+    for (int j = 0; j < thread_m_blocks; j++)
+      a_sh_rd_trans[i][j] =
+          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
+  }
+
+  // Since B-accesses have non-constant stride they have to be computed at
+  // runtime; we break dependencies between subsequent accesses within a tile
+  // by maintaining multiple pointers (we have enough registers), a tiny
+  // optimization.
+  const int4* B_ptr[b_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++)
+    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
+
+  extern __shared__ int4 sh[];
+  // Shared memory storage for global fetch pipelines.
+  int4* sh_a = sh;
+  int4* sh_b = sh_a + (stages * a_sh_stage);
+  int4* sh_g_idx = sh_b + (stages * b_sh_stage);
+  int4* sh_s = sh_g_idx + (stages * g_idx_stage);
+
+  // Register storage for double buffer of shared memory reads.
+  FragA frag_a[2][thread_m_blocks];
+  I4 frag_b_quant[2][b_thread_vecs];
+  FragC frag_c[thread_m_blocks][4][2];
+  FragS frag_s[2][4];         // No act-order
+  FragS act_frag_s[2][4][4];  // For act-order
+
+  // Zero accumulators.
+  auto zero_accums = [&]() {
+  #pragma unroll
+    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
+      reinterpret_cast<float*>(frag_c)[i] = 0;
+  };
+
+  int sh_first_group_id = -1;
+  int sh_num_groups = -1;
+  constexpr int sh_max_num_groups = 32;
+
+  auto fetch_scales_to_shared = [&](bool is_async, int first_group_id,
+                                    int last_group_id) {
+    sh_first_group_id = first_group_id;
+    sh_num_groups = last_group_id - first_group_id + 1;
+
+    if (sh_num_groups < sh_max_num_groups) {
+      sh_num_groups = sh_max_num_groups;
+    }
+
+    if (sh_first_group_id + sh_num_groups > num_groups) {
+      sh_num_groups = num_groups - sh_first_group_id;
+    }
+
+    int row_offset = first_group_id * s_gl_stride;
+
+    if (is_async) {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x],
+                         &scales_ptr[row_offset + (i * s_gl_stride) +
+                                     slice_n_offset + threadIdx.x]);
+        }
+      }
+    } else {
+      for (int i = 0; i < sh_num_groups; i++) {
+        if (threadIdx.x < s_sh_stride) {
+          sh_s[(i * s_sh_stride) + threadIdx.x] =
+              scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset +
+                         threadIdx.x];
+        }
+      }
+    }
+  };
+  // Asynchronously fetch the next A, B and s tile from global to the next
+  // shared memory pipeline location.
+  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
+    if (pred) {
+      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < a_sh_wr_iters; i++) {
+        cp_async4_pred(
+            &sh_a_stage[a_sh_wr_trans[i]],
+            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
+            a_sh_wr_pred[i]);
+      }
+      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < b_sh_wr_iters; i++) {
+  #pragma unroll
+        for (int j = 0; j < b_thread_vecs; j++) {
+          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
+        }
+
+        B_ptr[i] += b_gl_rd_delta_o;
+      }
+
+      if constexpr (has_act_order) {
+        // Fetch g_idx thread-block portion
+        int full_pipe = a_off;
+        int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe;
+        if (cur_k < prob_k && cur_k < slice_k_finish) {
+          int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+
+          int4 const* cur_g_idx_stage_ptr =
+              reinterpret_cast<int4 const*>(&g_idx[cur_k]);
+
+          if (threadIdx.x < g_idx_stage) {
+            cp_async4_pred(&sh_g_idx_stage[threadIdx.x],
+                           &cur_g_idx_stage_ptr[threadIdx.x]);
+          }
+        }
+      } else {
+        if constexpr (group_blocks != -1) {
+          int4* sh_s_stage = sh_s + s_sh_stage * pipe;
+
+          if constexpr (group_blocks >= thread_k_blocks) {
+            // Only fetch scales if this tile starts a new group
+            if (pipe % (group_blocks / thread_k_blocks) == 0) {
+              if (s_sh_wr_pred) {
+                cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
+              }
+              s_gl_rd += s_gl_rd_delta;
+            }
+          } else {
+            for (int i = 0; i < s_tb_groups; i++) {
+              if (s_sh_wr_pred) {
+                cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr],
+                          &scales_ptr[s_gl_rd]);
+              }
+              s_gl_rd += s_gl_rd_delta;
+            }
+          }
+        }
+      }
+    }
+    // Insert a fence even when we are winding down the pipeline to ensure that
+    // waiting is also correct at this point.
+    cp_async_fence();
+  };
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<stages - 2>();
+    __syncthreads();
+  };
+
+  // Load the next sub-tile from the current location in the shared memory pipe
+  // into the current register buffer.
+  auto fetch_to_registers = [&](int k, int pipe) {
+    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+  #pragma unroll
+    for (int i = 0; i < thread_m_blocks; i++)
+      ldsm4<scalar_t>(frag_a[k % 2][i],
+                      &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
+    int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+
+  #pragma unroll
+    for (int i = 0; i < b_thread_vecs; i++) {
+      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
+          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+    }
+  };
+
+  bool is_same_group[stages];
+  int same_group_id[stages];
+
+  auto init_same_group = [&](int pipe) {
+    if constexpr (!has_act_order) {
+      is_same_group[pipe] = false;
+      same_group_id[pipe] = 0;
+      return;
+    }
+
+    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);
+
+    int group_id_1 = sh_g_idx_int_ptr[0];
+    int group_id_2 = sh_g_idx_int_ptr[tb_k - 1];
+
+    is_same_group[pipe] = group_id_1 == group_id_2;
+    same_group_id[pipe] = group_id_1;
+  };
+
+  auto fetch_scales_to_registers = [&](int k, int full_pipe) {
+    int pipe = full_pipe % stages;
+
+    if constexpr (!has_act_order) {
+      // No act-order case
+      if constexpr (group_blocks != -1) {
+        if constexpr (group_blocks >= thread_k_blocks) {
+          int4* sh_s_stage =
+              sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
+                                   (pipe / (group_blocks / thread_k_blocks)));
+          reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
+        } else {
+          int warp_id = threadIdx.x / 32;
+          int n_warps = thread_n_blocks / 4;
+
+          int warp_row = warp_id / n_warps;
+
+          int cur_k = warp_row * 16;
+          cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+          int k_blocks = cur_k / 16;
+          int cur_group_id = k_blocks / group_blocks;
+
+          int4* sh_s_stage = sh_s + s_sh_stage * pipe;
+
+          reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
+              sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
+        }
+      }
+
+      return;
+    }
+
+    // Act-order case
+
+    // Determine K of the "current" thread-block
+    int cur_k = slice_k_start + tb_k * full_pipe;
+    if (cur_k >= prob_k || cur_k >= slice_k_finish) {
+      return;
+    }
+
+    // Reset (to current thread-block) since we read g_idx portion from the
+    // shared memory
+    cur_k = 0;
+
+    // Progress to current iteration
+    cur_k += k_iter_size * (k % b_sh_wr_iters);
+
+    // Determine "position" inside the thread-block (based on warp and
+    // thread-id)
+    int warp_id = threadIdx.x / 32;
+    int n_warps =
+        thread_n_blocks / 4;  // Each warp processes 4 16-size tiles over N
+
+    int warp_row = warp_id / n_warps;
+    int warp_col = warp_id % n_warps;
+
+    cur_k += warp_row * 16;
+
+    int th_id = threadIdx.x % 32;
+    cur_k += (th_id % 4) * 2;  // Due to tensor-core layout for fp16 B matrix
+
+    int s_col_shift =
+        /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) +
+        (th_id / 4) * act_s_col_stride;
+
+    if (is_same_group[pipe]) {
+      if (k % 2 == 0) {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
+            sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride +
+                 s_col_shift];
+      } else {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0]))) =
+            *(reinterpret_cast<int4*>(&(act_frag_s[(k - 1) % 2][0][0])));
+      }
+
+      for (int i = 1; i < 4; i++) {
+        *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
+            *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][0][0])));
+      }
+      return;
+    }
+
+    int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe;
+    int* sh_g_idx_int_ptr = reinterpret_cast<int*>(sh_g_idx_stage);
+
+    constexpr int k_frag_offsets[4] = {0, 1, 8,
+                                       9};  // Tensor core offsets per thread
+
+  #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      int actual_k = cur_k + k_frag_offsets[i];
+
+      int group_id = sh_g_idx_int_ptr[actual_k];
+      int rel_group_id = group_id - sh_first_group_id;
+
+      *(reinterpret_cast<int4*>(&(act_frag_s[k % 2][i][0]))) =
+          sh_s[rel_group_id * s_sh_stride + s_col_shift];
+    }
+  };
+
+  // Execute the actual tensor core matmul of a sub-tile.
+  auto matmul = [&](int k) {
+  // We have the m dimension as the inner loop in order to encourage overlapping
+  // dequantization and matmul operations.
+  #pragma unroll
+    for (int j = 0; j < 4; j++) {
+      FragB frag_b0;
+      FragB frag_b1;
+      if constexpr (num_bits == 4) {
+        int b_quant = frag_b_quant[k % 2][0][j];
+        int b_quant_shift = b_quant >> 8;
+
+        frag_b0 = dequant_4bit<scalar_t>(b_quant);
+        frag_b1 = dequant_4bit<scalar_t>(b_quant_shift);
+
+      } else {
+        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
+        int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
+        int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
+
+        frag_b0 = dequant_8bit<scalar_t>(b_quant_0);
+        frag_b1 = dequant_8bit<scalar_t>(b_quant_1);
+      }
+
+      // Apply scale to frag_b0
+      if constexpr (has_act_order) {
+        scale4<scalar_t>(frag_b0, act_frag_s[k % 2][0][j],
+                         act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
+                         act_frag_s[k % 2][3][j], 0);
+      } else {
+        if constexpr (group_blocks != -1) {
+          scale<scalar_t>(frag_b0, frag_s[k % 2][j], 0);
+        }
+      }
+
+      // Apply scale to frag_b1
+      if constexpr (has_act_order) {
+        scale4<scalar_t>(frag_b1, act_frag_s[k % 2][0][j],
+                         act_frag_s[k % 2][1][j], act_frag_s[k % 2][2][j],
+                         act_frag_s[k % 2][3][j], 1);
+
+      } else {
+        if constexpr (group_blocks != -1) {
+          scale<scalar_t>(frag_b1, frag_s[k % 2][j], 1);
+        }
+      }
+
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        mma<scalar_t>(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
+        mma<scalar_t>(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
+      }
+    }
+  };
+
+  // Since we slice across the k dimension of a tile in order to increase the
+  // number of warps while keeping the n dimension of a tile reasonable, we have
+  // multiple warps that accumulate their partial sums of the same output
+  // location; these must be reduced at the end, which we do in shared memory.
+  auto thread_block_reduce = [&]() {
+    constexpr int red_off = threads / b_sh_stride_threads / 2;
+    if (red_off >= 1) {
+      int red_idx = threadIdx.x / b_sh_stride_threads;
+      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride_threads;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
+                      (threadIdx.x % b_sh_stride_threads);
+
+      // Parallel logarithmic shared memory reduction. We make sure to avoid any
+      // unnecessary read or write iterations, e.g., for two warps we write only
+      // once by warp 1 and read only once by warp 0.
+
+  #pragma unroll
+      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
+  #pragma unroll
+        for (int i = red_off; i > 0; i /= 2) {
+          if (i <= red_idx && red_idx < 2 * i) {
+  #pragma unroll
+            for (int j = 0; j < 4 * 2; j++) {
+              int red_sh_wr =
+                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
+              if (i < red_off) {
+                float* c_rd =
+                    reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
+                float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
+  #pragma unroll
+                for (int k = 0; k < 4; k++)
+                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
+                      c_rd[k] + c_wr[k];
+              }
+              sh[red_sh_wr] =
+                  reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
+            }
+          }
+          __syncthreads();
+        }
+        if (red_idx == 0) {
+  #pragma unroll
+          for (int i = 0; i < 4 * 2; i++) {
+            float* c_rd =
+                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
+  #pragma unroll
+            for (int j = 0; j < 4; j++)
+              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
+                  c_rd[j];
+          }
+        }
+        __syncthreads();
+      }
+    }
+  };
+
+  // Since multiple threadblocks may process parts of the same column slice, we
+  // finally have to globally reduce over the results. As the striped
+  // partitioning minimizes the number of such reductions and our outputs are
+  // usually rather small, we perform this reduction serially in L2 cache.
+  auto global_reduce = [&](bool first = false, bool last = false) {
+    // We are very careful here to reduce directly in the output buffer to
+    // maximize L2 cache utilization in this step. To do this, we write out
+    // results in FP16 (but still reduce with FP32 compute).
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    if (threadIdx.x < active_threads) {
+      int c_gl_stride = prob_n / 8;
+      int c_gl_wr_delta_o = 8 * c_gl_stride;
+      int c_gl_wr_delta_i = 4 * (active_threads / 32);
+      int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
+                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
+      c_gl_wr += (2 * thread_n_blocks) * slice_col;
+      constexpr int c_sh_wr_delta = active_threads;
+      int c_sh_wr = threadIdx.x;
+
+      int row = (threadIdx.x % 32) / 4;
+
+      if (!first) {
+  // Interestingly, doing direct global accesses here really seems to mess up
+  // the compiler and lead to slowdowns, hence we also use async-copies even
+  // though these fetches are not actually asynchronous.
+  #pragma unroll
+        for (int i = 0; i < thread_m_blocks * 4; i++) {
+          cp_async4_pred(
+              &sh[c_sh_wr + c_sh_wr_delta * i],
+              &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
+                 c_gl_wr_delta_i * (i % 2)],
+              i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
+        }
+        cp_async_fence();
+        cp_async_wait<0>();
+      }
+
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks * 4; i++) {
+        if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
+          if (!first) {
+            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
+  #pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<float*>(
+                  &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
+                  Dtype::num2float(reinterpret_cast<scalar_t*>(&c_red)[j]);
+            }
+          }
+          if (!last) {
+            int4 c;
+  #pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<scalar_t*>(&c)[j] =
+                  Dtype::float2num(reinterpret_cast<float*>(
+                      &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
+            }
+            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
+                c;
+          }
+        }
+      }
+    }
+  };
+
+  // Write out the reduced final result in the correct layout. We only actually
+  // reshuffle matrix fragments in this step, the reduction above is performed
+  // in fragment layout.
+  auto write_result = [&]() {
+    int c_gl_stride = prob_n / 8;
+    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
+    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
+    constexpr int c_sh_rd_delta =
+        c_sh_stride * (threads / (2 * thread_n_blocks));
+
+    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+    c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    int c_sh_wr =
+        (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
+    c_sh_wr += 32 * (threadIdx.x / 32);
+    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+
+    int c_gl_wr_end = c_gl_stride * prob_m;
+
+    // We first reorder in shared memory to guarantee the most efficient final
+    // global write patterns
+    auto write = [&](int idx, float c0, float c1, FragS& s) {
+      scalar_t2 res =
+          Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));
+
+      // For per-column quantization we finally apply the scale here (only for
+      // 4-bit)
+      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 4) {
+        res = __hmul2(res, s[0]);
+      }
+
+      ((scalar_t2*)sh)[idx] = res;
+    };
+
+    if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+  #pragma unroll
+        for (int j = 0; j < 4; j++) {
+          int wr = c_sh_wr + 8 * j;
+          write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
+                frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
+                frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
+                frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+          write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
+                frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+        }
+        c_sh_wr += 16 * (4 * c_sh_stride);
+      }
+    }
+    __syncthreads();
+
+  #pragma unroll
+    for (int i = 0;
+         i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
+         i++) {
+      if (c_gl_wr < c_gl_wr_end) {
+        C[c_gl_wr] = sh[c_sh_rd];
+        c_gl_wr += c_gl_wr_delta;
+        c_sh_rd += c_sh_rd_delta;
+      }
+    }
+  };
+
+  // Start global fetch and register load pipelines.
+  auto start_pipes = [&]() {
+
+  #pragma unroll
+    for (int i = 0; i < stages - 1; i++) {
+      if (has_act_order && i == 0) {
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]);
+      }
+      fetch_to_shared(i, i, i < slice_iters);
+    }
+
+    zero_accums();
+    wait_for_stage();
+    init_same_group(0);
+    fetch_to_registers(0, 0);
+    fetch_scales_to_registers(0, 0);
+    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
+    slice_k_start_shared_fetch += tb_k * (stages - 1);
+  };
+  if (slice_iters) {
+    start_pipes();
+  }
+
+  // Main loop.
+  while (slice_iters) {
+    // We unroll over both the global fetch and the register load pipeline to
+    // ensure all shared memory accesses are static. Note that both pipelines
+    // have even length meaning that the next iteration will always start at
+    // index 0.
+
+  #pragma unroll
+    for (int pipe = 0; pipe < stages;) {
+  #pragma unroll
+      for (int k = 0; k < b_sh_wr_iters; k++) {
+        fetch_to_registers(k + 1, pipe % stages);
+        fetch_scales_to_registers(k + 1, pipe);
+        if (k == b_sh_wr_iters - 2) {
+          fetch_to_shared((pipe + stages - 1) % stages, pipe,
+                          slice_iters >= stages);
+          pipe++;
+          wait_for_stage();
+          init_same_group(pipe % stages);
+        }
+        matmul(k);
+      }
+      slice_iters--;
+      if (slice_iters == 0) {
+        break;
+      }
+    }
+
+    a_gl_rd += a_gl_rd_delta_o * stages;
+    slice_k_start += tb_k * stages;
+    slice_k_start_shared_fetch += tb_k * stages;
+
+    if constexpr (has_act_order) {
+      int first_group_id = g_idx[slice_k_start];
+      int last_g_idx = slice_k_start + stages * tb_k * 2;
+      if (last_g_idx >= prob_k) {
+        last_g_idx = prob_k - 1;
+      }
+      int last_group_id = g_idx[last_g_idx];
+      if (last_group_id >= sh_first_group_id + sh_num_groups) {
+        fetch_scales_to_shared(false, first_group_id, last_group_id);
+        __syncthreads();
+      }
+    }
+
+    // Process results and, if necessary, proceed to the next column slice.
+    // While this pattern may not be the most readable, other ways of writing
+    // the loop seemed to yield noticeably worse performance after compilation.
+    if (slice_iters == 0) {
+      cp_async_wait<0>();
+      bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
+      if constexpr (!has_act_order && group_blocks == -1) {
+        if constexpr (num_bits == 8) {
+          if (s_sh_wr_pred) {
+            cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+          }
+          cp_async_fence();
+        } else {
+          if (last) {
+            if (s_sh_wr_pred) {
+              cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]);
+            }
+            cp_async_fence();
+          }
+        }
+      }
+
+      thread_block_reduce();
+      if constexpr (!has_act_order && group_blocks == -1) {
+        if constexpr (num_bits == 8) {
+          cp_async_wait<0>();
+          __syncthreads();
+          if (threadIdx.x / 32 < thread_n_blocks / 4) {
+            reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+            reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+          }
+
+        } else {
+          if (last) {
+            cp_async_wait<0>();
+            __syncthreads();
+            if (threadIdx.x / 32 < thread_n_blocks / 4) {
+              reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+              reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+            }
+          }
+        }
+      }
+
+      // For 8-bit channelwise, we apply the scale before the global reduction
+      // that converts the fp32 results to fp16 (so that we avoid possible
+      // overflow in fp16)
+      if constexpr (!has_act_order && group_blocks == -1 && num_bits == 8) {
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+          for (int i = 0; i < thread_m_blocks; i++) {
+  #pragma unroll
+            for (int j = 0; j < 4; j++) {
+              scale_float<scalar_t>(
+                  reinterpret_cast<float*>(&frag_c[i][j][0][0]),
+                  frag_s[j / 2][2 * (j % 2) + 0]);
+              scale_float<scalar_t>(
+                  reinterpret_cast<float*>(&frag_c[i][j][0][2]),
+                  frag_s[j / 2][2 * (j % 2) + 0]);
+
+              scale_float<scalar_t>(
+                  reinterpret_cast<float*>(&frag_c[i][j][1][0]),
+                  frag_s[j / 2][2 * (j % 2) + 1]);
+              scale_float<scalar_t>(
+                  reinterpret_cast<float*>(&frag_c[i][j][1][2]),
+                  frag_s[j / 2][2 * (j % 2) + 1]);
+            }
+          }
+        }
+      }
+
+      if (slice_count > 1) {  // only globally reduce if there is more than one
+                              // block in a slice
+        barrier_acquire(&locks[slice_col], slice_idx);
+        global_reduce(slice_idx == 0, last);
+        barrier_release(&locks[slice_col], last);
+      }
+      if (last)  // only the last block in a slice actually writes the result
+        write_result();
+      slice_row = 0;
+      slice_col_par++;
+      slice_col++;
+      init_slice();
+      if (slice_iters) {
+        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                  (threadIdx.x % a_gl_rd_delta_o);
+  #pragma unroll
+        for (int i = 0; i < b_sh_wr_iters; i++)
+          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
+        if (slice_col == 0) {
+  #pragma unroll
+          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
+        }
+
+        // Update slice k/n for scales loading
+        if constexpr (has_act_order) {
+          slice_k_start = tb_k * slice_row;
+          slice_k_finish = slice_k_start + tb_k * slice_iters;
+          slice_k_start_shared_fetch = slice_k_start;
+          slice_n_offset = act_s_col_tb_stride * slice_col;
+
+        } else {
+          s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+        }
+
+        start_pipes();
+      }
+    }
+  }
+}
+  // One dispatch arm (expands to `else if`, so it must follow an `if`): on an exact config match, raise the kernel's dynamic shared-memory limit to max_shared_mem and launch that Marlin instantiation.
+  #define __CALL_IF(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,                \
+                    THREAD_K_BLOCKS, HAS_ACT_ORDER, GROUP_BLOCKS, NUM_THREADS) \
+    else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&     \
+             thread_n_blocks == THREAD_N_BLOCKS &&                             \
+             thread_k_blocks == THREAD_K_BLOCKS &&                             \
+             has_act_order == HAS_ACT_ORDER && group_blocks == GROUP_BLOCKS && \
+             num_threads == NUM_THREADS) {                                     \
+      cudaFuncSetAttribute(                                                    \
+          Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,             \
+                 THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \
+                 GROUP_BLOCKS>,                                                \
+          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
+      Marlin<scalar_t, NUM_BITS, NUM_THREADS, THREAD_M_BLOCKS,                 \
+             THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER,     \
+             GROUP_BLOCKS><<<blocks, NUM_THREADS, max_shared_mem, stream>>>(   \
+          A_ptr, B_ptr, C_ptr, s_ptr, g_idx_ptr, num_groups, prob_m, prob_n,   \
+          prob_k, locks);                                                      \
+    }
+
+struct thread_config_t {
+  int thread_k;     // tile extent along K; -1 marks "not set"
+  int thread_n;     // tile extent along N; -1 marks "not set"
+  int num_threads;  // threads per CUDA block at launch; -1 marks "not set"
+};
+
+struct exec_config_t {
+  int max_m_blocks;        // cap on 16-row M blocks per kernel call; 0 = none
+  thread_config_t tb_cfg;  // thread-block tile shape and block size
+};
+
+thread_config_t small_batch_thread_configs[] = {
+    // Candidates tried when prob_m <= 16 (see determine_thread_config).
+    // Ordered by priority: the first entry that validates is used.
+    // thread_k, thread_n, num_threads
+    {128, 128, 256},
+    {64, 128, 128},
+    {128, 64, 128},
+};
+
+thread_config_t large_batch_thread_configs[] = {
+    // Candidates tried when prob_m > 16 (see determine_thread_config).
+    // Ordered by priority: the first entry that validates is used.
+    // thread_k, thread_n, num_threads
+    {64, 256, 256},
+    {64, 128, 128},
+    {128, 64, 128},
+
+};
+
+int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
+                          int prob_n, int prob_k, int num_bits, int group_size,
+                          bool has_act_order, bool is_k_full) {
+  // Shared-memory budget for scales of one thread-block tile; the result is
+  // later subtracted from max_shared_mem. prob_m, prob_n, prob_k and num_bits
+  // are accepted but not used here.
+  const int n_extent = th_config.thread_n;
+  const int k_extent = th_config.thread_k;
+
+  // Max scale groups per tile. group_size == -1 gives a single group;
+  // group_size == 0 has no fixed size, so the worst case of 32 is assumed.
+  const int k_per_group = (group_size == 0) ? 32 : group_size;
+  const int groups_per_tile =
+      (group_size == -1) ? 1 : div_ceil(k_extent, k_per_group);
+
+  // Without chunked loading every pipeline stage carries its own scales.
+  const bool chunked = has_act_order && !is_k_full;
+  if (!chunked) {
+    return groups_per_tile * n_extent * 2 * pipe_stages;
+  }
+
+  // Chunked loading: the chunk is 2x the pipeline depth over dim K, and at
+  // least 32 scale groups are loaded.
+  int load_groups = groups_per_tile * pipe_stages * 2;
+  if (load_groups < 32) {
+    load_groups = 32;
+  }
+
+  return load_groups * n_extent * 2;
+}
+
+bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
+                         int prob_m, int prob_n, int prob_k, int num_bits,
+                         int scales_cache_size, int max_shared_mem) {
+  // True when the A and B pipeline buffers for this tile shape fit in 95% of
+  // the shared memory left once the scales cache is set aside.
+  // prob_n and prob_k are accepted but not used here.
+  const int k_extent = th_config.thread_k;
+  const int n_extent = th_config.thread_n;
+
+  // One stage of B: packed weights, 32 / num_bits values per 4-byte word.
+  const int vals_per_word = 32 / num_bits;
+  const int b_stage = (k_extent * n_extent / vals_per_word) * 4;
+
+  // Lower max_m_blocks until it no longer exceeds the 16-row blocks that
+  // prob_m needs; hitting zero means that block count was not positive.
+  const int m_blocks = div_ceil(prob_m, 16);
+  while (m_blocks < max_m_blocks) {
+    --max_m_blocks;
+    if (max_m_blocks == 0) {
+      TORCH_CHECK(false, "Unexpected m_blocks = ", m_blocks);
+    }
+  }
+  const int m_extent = 16 * max_m_blocks;
+
+  // One stage of A: m_extent x k_extent values, scaled by 2 (16-bit inputs).
+  const int a_stage = (m_extent * k_extent) * 2;
+
+  // Kept as float to mirror the comparison against the 0.95f-scaled budget.
+  const float pipe_size = (a_stage + b_stage) * pipe_stages;
+
+  TORCH_CHECK(max_shared_mem / 2 > scales_cache_size);  // Sanity
+
+  // Leave 5% headroom on top of the scales cache.
+  return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
+}
+
+bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,
+                     int prob_m, int prob_n, int prob_k, int num_bits,
+                     int group_size, bool has_act_order, bool is_k_full,
+                     int max_shared_mem) {
+  // True when th_config is fully set, divides K and N, meets the minimum tile
+  // and block sizes, and its pipeline plus scales fit in shared memory.
+  const int tile_k = th_config.thread_k;
+  const int tile_n = th_config.thread_n;
+  const int block_threads = th_config.num_threads;
+
+  // A -1 in any field marks an unset config.
+  const bool fully_set = tile_k != -1 && tile_n != -1 && block_threads != -1;
+  if (!fully_set) {
+    return false;
+  }
+
+  // K and N must be whole multiples of the tile extents.
+  const bool tiles_divide = prob_k % tile_k == 0 && prob_n % tile_n == 0;
+  if (!tiles_divide) {
+    return false;
+  }
+
+  // Reject tiles below min_thread_n / min_thread_k.
+  if (tile_n < min_thread_n || tile_k < min_thread_k) {
+    return false;
+  }
+
+  // A block needs at least 128 threads (= 4 warps).
+  if (block_threads < 128) {
+    return false;
+  }
+
+  // Size the scales cache first; the pipeline must fit in what remains.
+  const int scales_cache_size =
+      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
+                            group_size, has_act_order, is_k_full);
+  return is_valid_cache_size(th_config, max_m_blocks, prob_m, prob_n, prob_k,
+                             num_bits, scales_cache_size, max_shared_mem);
+}
+
+exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k,
+                                      int num_bits, int group_size,
+                                      bool has_act_order, bool is_k_full,
+                                      int max_shared_mem) {
+  // Returns the first candidate accepted by is_valid_config, trying 4 M blocks
+  // first and fewer when nothing fits; {0, {-1, -1, -1}} signals failure.
+  auto first_valid = [&](auto const& table, int m_blocks,
+                         thread_config_t& picked) {
+    for (auto const& candidate : table) {
+      if (is_valid_config(candidate, m_blocks, prob_m, prob_n, prob_k, num_bits,
+                          group_size, has_act_order, is_k_full,
+                          max_shared_mem)) {
+        picked = candidate;
+        return true;
+      }
+    }
+    return false;
+  };
+  const bool small_batch = prob_m <= 16;  // selects the candidate table
+  for (int m_blocks = 4; m_blocks > 0; --m_blocks) {
+    thread_config_t picked;
+    const bool found =
+        small_batch ? first_valid(small_batch_thread_configs, m_blocks, picked)
+                    : first_valid(large_batch_thread_configs, m_blocks, picked);
+    if (found) {
+      return exec_config_t{m_blocks, picked};
+    }
+  }
+  return exec_config_t{0, {-1, -1, -1}};
+}
+  // Emits every __CALL_IF arm for one (bits, N blocks, K blocks, threads) tile: act_order (group_blocks 0) with 1-4 M blocks, then no act_order with 1-4 M blocks and group_blocks in {-1, 2, 4, 8}.
+  #define CALL_IF(NUM_BITS, N_BLOCKS, K_BLOCKS, NUM_THREADS)           \
+    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS)   \
+                                                                       \
+    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+    __CALL_IF(NUM_BITS, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                                       \
+    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+    __CALL_IF(NUM_BITS, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                                       \
+    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+    __CALL_IF(NUM_BITS, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)  \
+                                                                       \
+    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS) \
+    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS)  \
+    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS)  \
+    __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS)
+
+// Host-side launcher: validates the problem, picks a tile config, optionally
+// permutes A's columns (act_order), then walks M in chunks and dispatches the
+// matching Marlin kernel instantiation on `stream`.
+template <typename scalar_t>
+void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
+                     void* g_idx, void* perm, void* a_tmp, int prob_m,
+                     int prob_n, int prob_k, void* workspace, int num_bits,
+                     bool has_act_order, bool is_k_full, int num_groups,
+                     int group_size, int dev, cudaStream_t stream, int thread_k,
+                     int thread_n, int sms, int max_par) {
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
+  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
+              ", ", prob_n, ", ", prob_k, "]");
+  int tot_m = prob_m;
+  int tot_m_blocks = div_ceil(tot_m, 16);
+  int pad = 16 * tot_m_blocks - tot_m;  // rows short of a full last 16-row block
+
+  if (sms == -1) {
+    cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
+  }
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+  // Set thread config
+  exec_config_t exec_cfg;
+  if (thread_k != -1 && thread_n != -1) {
+    // User-defined config
+    exec_cfg =
+        exec_config_t{4, thread_config_t{thread_k, thread_n, default_threads}};
+  } else {
+    // Auto config
+    exec_cfg =
+        determine_thread_config(prob_m, prob_n, prob_k, num_bits, group_size,
+                                has_act_order, is_k_full, max_shared_mem);
+  }
+
+  TORCH_CHECK(exec_cfg.max_m_blocks > 0 &&
+                  is_valid_config(exec_cfg.tb_cfg, exec_cfg.max_m_blocks,
+                                  prob_m, prob_n, prob_k, num_bits, group_size,
+                                  has_act_order, is_k_full, max_shared_mem),
+              "Invalid thread config: max_m_blocks = ", exec_cfg.max_m_blocks,
+              ", thread_k = ", exec_cfg.tb_cfg.thread_k,
+              ", thread_n = ", exec_cfg.tb_cfg.thread_n,
+              ", num_threads = ", exec_cfg.tb_cfg.num_threads, " for MKN = [",
+              prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
+              ", group_size = ", group_size,
+              ", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
+              ", max_shared_mem = ", max_shared_mem);
+
+  int num_threads = exec_cfg.tb_cfg.num_threads;
+  thread_k = exec_cfg.tb_cfg.thread_k;
+  thread_n = exec_cfg.tb_cfg.thread_n;
+
+  int thread_k_blocks = thread_k / 16;
+  int thread_n_blocks = thread_n / 16;
+
+  int blocks = sms;  // grid size for both kernel launches below
+
+  TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
+              " is not divisible by thread_n = ", thread_n);
+  TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
+              " is not divisible by thread_k = ", thread_k);
+  // group_blocks = group size in 16-row units. It is tested for > 0 before the
+  // modulo so a group_size below 16 raises instead of dividing by zero.
+  int group_blocks = 0;
+  if (has_act_order) {
+    if (is_k_full) {
+      TORCH_CHECK(group_size != -1);
+      group_blocks = group_size / 16;
+      TORCH_CHECK(group_blocks > 0 && prob_k % group_blocks == 0, "prob_k = ",
+                  prob_k, " is not divisible by group_blocks = ", group_blocks);
+    } else {
+      TORCH_CHECK(group_size == 0);
+      group_blocks = 0;
+    }
+  } else {
+    if (group_size == -1) {
+      group_blocks = -1;
+    } else {
+      group_blocks = group_size / 16;
+      TORCH_CHECK(group_blocks > 0 && prob_k % group_blocks == 0, "prob_k = ",
+                  prob_k, " is not divisible by group_blocks = ", group_blocks);
+    }
+  }
+
+  const int4* A_ptr = (const int4*)A;
+  const int4* B_ptr = (const int4*)B;
+  int4* C_ptr = (int4*)C;
+  const int4* s_ptr = (const int4*)s;
+  const int* g_idx_ptr = (const int*)g_idx;
+  const int* perm_ptr = (const int*)perm;
+  int4* a_tmp_ptr = (int4*)a_tmp;
+
+  int* locks = (int*)workspace;  // handed to the kernel as its lock array
+
+  if (has_act_order) {
+    // Permute A columns
+    int block_rows = div_ceil(prob_m, blocks);
+    permute_cols_kernel<<<blocks, default_threads, 0, stream>>>(
+        A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, block_rows);
+    A_ptr = a_tmp_ptr;
+  }
+
+  // If we have a full K, then we can run the non-act-order version of Marlin
+  // (since the weight rows are reordered by increasing group ids, and by having
+  // a full K, we have full original groups)
+  if (is_k_full) {
+    has_act_order = false;
+  }
+
+  // Main loop
+  for (int i = 0; i < tot_m_blocks; i += exec_cfg.max_m_blocks) {
+    int thread_m_blocks = tot_m_blocks - i;
+    prob_m = tot_m - 16 * i;
+    int par = 1;
+    if (thread_m_blocks > exec_cfg.max_m_blocks) {
+      // Note that parallel > 1 currently only works for inputs without any
+      // padding
+      par = (16 * thread_m_blocks - pad) / (16 * exec_cfg.max_m_blocks);
+      if (par > max_par) par = max_par;
+      prob_m = (16 * exec_cfg.max_m_blocks) * par;
+      i += exec_cfg.max_m_blocks * (par - 1);
+      thread_m_blocks = exec_cfg.max_m_blocks;
+    }
+
+    // Define kernel configurations
+    if (false) {
+    }
+    CALL_IF(4, 32, 2, 256)
+    CALL_IF(4, 16, 4, 256)
+    CALL_IF(4, 8, 8, 256)
+    CALL_IF(4, 8, 4, 128)
+    CALL_IF(4, 4, 8, 128)
+    CALL_IF(8, 32, 2, 256)
+    CALL_IF(8, 16, 4, 256)
+    CALL_IF(8, 8, 8, 256)
+    CALL_IF(8, 8, 4, 128)
+    CALL_IF(8, 4, 8, 128)
+    else {
+      TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
+                             str(prob_n) + ", " + str(prob_k) + "]" +
+                             ", has_act_order = " + str(has_act_order) +
+                             ", num_groups = " + str(num_groups) +
+                             ", group_size = " + str(group_size) +
+                             ", thread_m_blocks = " + str(thread_m_blocks) +
+                             ", thread_n_blocks = " + str(thread_n_blocks) +
+                             ", thread_k_blocks = " + str(thread_k_blocks));
+    }
+
+    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
+    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
+  }
+}
+
+}  // namespace gptq_marlin
+
+// GPTQ-Marlin GEMM entry point: checks shapes/devices, derives group_size and
+// act_order from b_scales and g_idx, allocates the output (and a scratch copy
+// of A used when columns are permuted), and runs marlin_mm_f16i4 for fp16 or
+// bf16 inputs. Returns c with shape {size_m, size_n}.
+torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                               torch::Tensor& b_scales, torch::Tensor& g_idx,
+                               torch::Tensor& perm, torch::Tensor& workspace,
+                               int64_t num_bits, int64_t size_m, int64_t size_n,
+                               int64_t size_k, bool is_k_full) {
+  // Verify num_bits
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
+  int pack_factor = 32 / num_bits;
+
+  // Verify A
+  TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
+              ", size_m = ", size_m);
+  TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
+              ", size_k = ", size_k);
+
+  // Verify B
+  TORCH_CHECK(size_k % gptq_marlin::tile_size == 0, "size_k = ", size_k,
+              " is not divisible by tile_size = ", gptq_marlin::tile_size);
+  TORCH_CHECK((size_k / gptq_marlin::tile_size) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
+              ", size_k = ", size_k, ", tile_size = ", gptq_marlin::tile_size);
+  TORCH_CHECK(b_q_weight.size(1) % gptq_marlin::tile_size == 0,
+              "b_q_weight.size(1) = ", b_q_weight.size(1),
+              " is not divisible by tile_size = ", gptq_marlin::tile_size);
+  int actual_size_n =
+      (b_q_weight.size(1) / gptq_marlin::tile_size) * pack_factor;
+  TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
+              ", actual_size_n = ", actual_size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+  TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
+  TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
+  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
+  torch::Tensor c = torch::empty({size_m, size_n}, options);
+  torch::Tensor a_tmp = torch::empty({size_m, size_k}, options);
+
+  // thread_k: `k` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_k = -1;
+  // thread_n: `n` size of a thread_tile in `weights` (can usually be left as
+  // auto -1)
+  int thread_n = -1;
+  // sms: number of SMs to use for the kernel (can usually be left as auto -1)
+  int sms = -1;
+
+  // Verify g_idx and perm
+  TORCH_CHECK((g_idx.size(0) == 0 && perm.size(0) == 0) ||
+                  (g_idx.size(0) == size_k && perm.size(0) == size_k),
+              "Unexpected g_idx.size(0) = ", g_idx.size(0),
+              " and perm.size(0) = ", perm.size(0),
+              ", where size_k = ", size_k);
+
+  // Detect groupsize and act_order
+  int num_groups = -1;
+  int group_size = -1;
+  bool has_act_order = g_idx.size(0) != 0;
+
+  int b_rank = b_scales.sizes().size();
+  TORCH_CHECK(b_rank == 2, "b_scales rank = ", b_rank, " is not 2");
+  TORCH_CHECK(b_scales.size(1) == size_n, "b_scales dim 1 = ", b_scales.size(1),
+              " is not size_n = ", size_n);
+  num_groups = b_scales.size(0);
+
+  if (has_act_order) {
+    if (is_k_full) {
+      TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
+      TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
+                  ", is not divisible by num_groups = ", num_groups);
+      group_size = size_k / num_groups;
+    } else {
+      group_size = 0;
+    }
+
+  } else {
+    if (num_groups > 1) {
+      TORCH_CHECK(
+          size_k % num_groups == 0, "size_k = ", size_k,
+          ", is not divisible by b_scales.size(0) = ", b_scales.size(0));
+      group_size = size_k / num_groups;
+    } else {
+      group_size = -1;
+    }
+  }
+
+  // Verify workspace size
+  TORCH_CHECK(
+      size_n % gptq_marlin::min_thread_n == 0, "size_n = ", size_n,
+      ", is not divisible by min_thread_n = ", gptq_marlin::min_thread_n);
+  int min_workspace_size =
+      (size_n / gptq_marlin::min_thread_n) * gptq_marlin::max_par;
+  TORCH_CHECK(workspace.numel() >= min_workspace_size,
+              "workspace.numel = ", workspace.numel(),
+              " is below min_workspace_size = ", min_workspace_size);
+  TORCH_CHECK(workspace.device().is_cuda(), "workspace is not on GPU");
+  int dev = a.get_device();
+  if (a.scalar_type() == at::ScalarType::Half) {
+    gptq_marlin::marlin_mm_f16i4<half>(
+        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
+        b_scales.data_ptr<at::Half>(), g_idx.data_ptr(), perm.data_ptr(),
+        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
+        workspace.data_ptr(), num_bits, has_act_order, is_k_full, num_groups,
+        group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
+        thread_n, sms, gptq_marlin::max_par);
+  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
+    gptq_marlin::marlin_mm_f16i4<nv_bfloat16>(
+        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
+        c.data_ptr<at::BFloat16>(), b_scales.data_ptr<at::BFloat16>(),
+        g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
+        size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order,
+        is_k_full, num_groups, group_size, dev,
+        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
+        gptq_marlin::max_par);
+  } else {
+    TORCH_CHECK(false, "gptq_marlin_gemm only supports bfloat16 and float16");
+  }
+
+  return c;
+}
+
+#endif
diff --git a/server/marlin/marlin_kernels/gptq_marlin.cuh b/server/marlin/marlin_kernels/gptq_marlin.cuh
new file mode 100644
index 00000000..42af4495
--- /dev/null
+++ b/server/marlin/marlin_kernels/gptq_marlin.cuh
@@ -0,0 +1,76 @@
+#pragma once
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <iostream>
+
+namespace gptq_marlin {
+
+// 8 warps are a good choice since every SM has 4 schedulers and having more
+// than 1 warp per scheduler allows some more latency hiding. At the same time,
+// we want relatively few warps to have many registers per warp and small tiles.
+static constexpr int default_threads = 256;
+
+static constexpr int pipe_stages =
+    4;  // 4 pipeline stages fit into shared memory
+
+static constexpr int min_thread_n = 64;  // smallest accepted thread_n
+static constexpr int min_thread_k = 64;  // smallest accepted thread_k
+
+static constexpr int tile_size = 16;  // tile edge used in weight shape checks
+static constexpr int max_par = 16;    // cap on `par`; also sizes the workspace
+
+template <typename T, int n>
+struct Vec {  // fixed-size array of n elements with device-side indexing
+  T elems[n];
+  __device__ T& operator[](int i) { return elems[i]; }  // no bounds check
+};
+
+using I4 = Vec<int, 4>;  // four ints
+
+constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }  // ceil(a / b) for a >= 0, b > 0
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+// No support for async
+#else
+
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {  // copy only if pred
+  const int BYTES = 16;  // bytes moved by one call
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"  // p = (pred != 0)
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"  // issued only if p
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;  // bytes moved by one call
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"  // unconditional copy
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+}
+
+__device__ inline void cp_async_fence() {  // closes the current cp.async group
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+template <int n>  // n is passed as the immediate operand of wait_group
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
+}
+
+#endif
+
+}  // namespace gptq_marlin
diff --git a/server/marlin/marlin_kernels/gptq_marlin_dtypes.cuh b/server/marlin/marlin_kernels/gptq_marlin_dtypes.cuh
new file mode 100644
index 00000000..ca1b7099
--- /dev/null
+++ b/server/marlin/marlin_kernels/gptq_marlin_dtypes.cuh
@@ -0,0 +1,77 @@
+
+#ifndef _data_types_cuh
+#define _data_types_cuh
+#include "gptq_marlin.cuh"
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+namespace gptq_marlin {
+
+template <typename scalar_t>
+class ScalarType {};  // empty primary template; members live in the half / nv_bfloat16 specializations
+
+template <>
+class ScalarType<half> {  // fp16 type aliases and conversion helpers
+ public:
+  using scalar_t = half;
+  using scalar_t2 = half2;  // packed pair of scalar_t
+
+  // Matrix fragments for tensor core instructions; their precise layout is
+  // documented here:
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+  using FragA = Vec<half2, 4>;
+  using FragB = Vec<half2, 2>;
+  using FragC = Vec<float, 4>;
+  using FragS = Vec<half2, 1>;
+
+  static __device__ float inline num2float(const half x) {  // half -> float
+    return __half2float(x);
+  }
+
+  static __device__ half2 inline num2num2(const half x) {  // x in both lanes
+    return __half2half2(x);
+  }
+
+  static __device__ half2 inline nums2num2(const half x1, const half x2) {
+    return __halves2half2(x1, x2);  // packs the pair (x1, x2)
+  }
+
+  static __host__ __device__ half inline float2num(const float x) {
+    return __float2half(x);  // float -> half
+  }
+};
+
+template <>
+class ScalarType<nv_bfloat16> {  // bf16 type aliases and conversion helpers
+ public:
+  using scalar_t = nv_bfloat16;
+  using scalar_t2 = nv_bfloat162;  // packed pair of scalar_t
+
+  using FragA = Vec<nv_bfloat162, 4>;
+  using FragB = Vec<nv_bfloat162, 2>;
+  using FragC = Vec<float, 4>;
+  using FragS = Vec<nv_bfloat162, 1>;
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800  // helpers exist only under this guard
+  static __device__ float inline num2float(const nv_bfloat16 x) {
+    return __bfloat162float(x);  // bf16 -> float
+  }
+
+  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
+    return __bfloat162bfloat162(x);  // x in both lanes
+  }
+
+  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
+                                                  const nv_bfloat16 x2) {
+    return __halves2bfloat162(x1, x2);  // packs the pair (x1, x2)
+  }
+
+  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
+    return __float2bfloat16(x);  // NOTE(review): marked __host__ but absent when __CUDA_ARCH__ is undefined — confirm no host caller
+  }
+#endif
+};
+
+}  // namespace gptq_marlin
+
+#endif
diff --git a/server/marlin/marlin_kernels/gptq_marlin_repack.cu b/server/marlin/marlin_kernels/gptq_marlin_repack.cu
new file mode 100644
index 00000000..4adc158e
--- /dev/null
+++ b/server/marlin/marlin_kernels/gptq_marlin_repack.cu
@@ -0,0 +1,350 @@
+#include "gptq_marlin.cuh"
+
+namespace gptq_marlin {
+
+static constexpr int repack_stages = 8;  // shared-memory pipeline depth
+
+static constexpr int repack_threads = 256;  // passed as the kernel's num_threads
+
+static constexpr int tile_k_size = tile_size;        // K extent of one tile
+static constexpr int tile_n_size = tile_k_size * 4;  // N extent of one tile
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+template <int const num_threads, int const num_bits, bool const has_perm>
+__global__ void marlin_repack_kernel(
+    uint32_t const* __restrict__ b_q_weight_ptr,
+    uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
+    int size_k, int size_n) {}  // empty body used when __CUDA_ARCH__ < 800
+
+}  // namespace gptq_marlin
+
+torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
+                                 int64_t size_k, int64_t size_n,
+                                 int64_t num_bits) {  // stub for arch < 8.0
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "gptq_marlin_repack(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});  // not reached: the check above always raises
+}
+
+#else
+
+// Repacks packed quantized weights (b_q_weight_ptr, optionally row-permuted
+// via perm_ptr) into the tile-interleaved layout written to out_ptr. Each
+// block handles a contiguous range of K tiles and stages N tiles through smem.
+template <int const num_threads, int const num_bits, bool const has_perm>
+__global__ void marlin_repack_kernel(
+    uint32_t const* __restrict__ b_q_weight_ptr,
+    uint32_t const* __restrict__ perm_ptr, uint32_t* __restrict__ out_ptr,
+    int size_k, int size_n) {
+  constexpr int pack_factor = 32 / num_bits;  // values per 32-bit word
+  int k_tiles = size_k / tile_k_size;
+  int n_tiles = size_n / tile_n_size;
+  int block_k_tiles = div_ceil(k_tiles, gridDim.x);
+  int start_k_tile = blockIdx.x * block_k_tiles;
+  if (start_k_tile >= k_tiles) {  // this block has no K tiles to process
+    return;
+  }
+
+  int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<repack_stages - 2>();
+    __syncthreads();
+  };
+
+  extern __shared__ int4 sh[];
+
+  constexpr int perm_size = tile_k_size / 4;  // int4 slots for one tile's perm
+
+  int4* sh_perm_ptr = sh;
+  int4* sh_pipe_ptr = sh_perm_ptr;
+  if constexpr (has_perm) {
+    sh_pipe_ptr += perm_size;  // pipeline stages start after the perm slots
+  }
+
+  constexpr int tile_ints = tile_k_size / pack_factor;  // packed words along K
+
+  constexpr int stage_n_threads = tile_n_size / 4;
+  constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints;
+  constexpr int stage_size = stage_k_threads * stage_n_threads;
+
+  auto load_perm_to_shared = [&](int k_tile_id) {
+    int first_k_int4 = (k_tile_id * tile_k_size) / 4;
+    // NOTE(review): perm_ptr is read through int4 loads — presumably requires 16-byte alignment; confirm.
+    int4 const* perm_int4_ptr = reinterpret_cast<int4 const*>(perm_ptr);
+
+    if (threadIdx.x < perm_size) {
+      sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
+    }
+    __syncthreads();
+  };
+
+  auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      cp_async_fence();  // presumably keeps the group count in step with wait_for_stage
+      return;
+    }
+
+    int first_n = n_tile_id * tile_n_size;
+
+    int4* sh_ptr = sh_pipe_ptr + stage_size * pipe;
+
+    if constexpr (has_perm) {
+      if (threadIdx.x < stage_size) {
+        int k_id = threadIdx.x / stage_n_threads;
+        int n_id = threadIdx.x % stage_n_threads;
+
+        uint32_t const* sh_perm_int_ptr =
+            reinterpret_cast<uint32_t const*>(sh_perm_ptr);
+
+        int src_k = sh_perm_int_ptr[k_id];  // source row chosen by the perm
+        int src_k_packed = src_k / pack_factor;
+
+        cp_async4(
+            &sh_ptr[k_id * stage_n_threads + n_id],
+            reinterpret_cast<int4 const*>(&(
+                b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
+      }
+
+    } else {
+      if (threadIdx.x < stage_size) {
+        int k_id = threadIdx.x / stage_n_threads;
+        int n_id = threadIdx.x % stage_n_threads;
+
+        int first_k = k_tile_id * tile_k_size;
+        int first_k_packed = first_k / pack_factor;
+
+        cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
+                  reinterpret_cast<int4 const*>(
+                      &(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
+                                       first_n + (n_id * 4)])));
+      }
+    }
+
+    cp_async_fence();
+  };
+
+  auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
+    if (n_tile_id >= n_tiles) {
+      return;
+    }
+
+    int warp_id = threadIdx.x / 32;
+    int th_id = threadIdx.x % 32;
+
+    if (warp_id >= 4) {  // only the first 4 warps repack
+      return;
+    }
+
+    int tc_col = th_id / 4;
+    int tc_row = (th_id % 4) * 2;
+
+    constexpr int tc_offsets[4] = {0, 1, 8, 9};
+
+    int cur_n = warp_id * 16 + tc_col;
+
+    constexpr int sh_stride = 64;  // 32-bit words per staged row
+    constexpr uint32_t mask = (1 << num_bits) - 1;
+
+    int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
+    uint32_t* sh_stage_int_ptr = reinterpret_cast<uint32_t*>(sh_stage_ptr);
+
+    uint32_t* sh_perm_int_ptr = reinterpret_cast<uint32_t*>(sh_perm_ptr);
+
+    uint32_t vals[8];  // 4 values from column cur_n, then 4 from cur_n + 8
+
+    if constexpr (has_perm) {
+      for (int i = 0; i < 4; i++) {
+        int k_idx = tc_row + tc_offsets[i];
+
+        uint32_t src_k = sh_perm_int_ptr[k_idx];
+        uint32_t src_k_pos = src_k % pack_factor;  // slot inside the word
+
+        uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
+        uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;
+
+        uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
+        uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;
+
+        vals[i] = b1_cur_val;
+        vals[4 + i] = b2_cur_val;
+      }
+
+    } else {
+      uint32_t b1_vals[tile_ints];
+      uint32_t b2_vals[tile_ints];
+
+  #pragma unroll
+      for (int i = 0; i < tile_ints; i++) {
+        b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
+        b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
+      }
+
+  #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        int cur_elem = tc_row + tc_offsets[i];
+        int cur_int = cur_elem / pack_factor;
+        int cur_pos = cur_elem % pack_factor;
+
+        vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
+        vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
+      }
+    }
+
+    constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;  // words per output tile; shadows gptq_marlin::tile_size
+    int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
+
+    // Result of:
+    // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+    if constexpr (num_bits == 4) {
+      constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+
+      uint32_t res = 0;
+  #pragma unroll
+      for (int i = 0; i < 8; i++) {
+        res |= vals[pack_idx[i]] << (i * 4);
+      }
+
+      out_ptr[out_offset + th_id * 4 + warp_id] = res;
+
+    } else {
+      constexpr int pack_idx[4] = {0, 2, 1, 3};
+
+      uint32_t res1 = 0;
+      uint32_t res2 = 0;
+  #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        res1 |= vals[pack_idx[i]] << (i * 8);
+        res2 |= vals[4 + pack_idx[i]] << (i * 8);
+      }
+
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
+      out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
+    }
+  };
+
+  auto start_pipes = [&](int k_tile_id, int n_tile_id) {
+  #pragma unroll
+    for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
+      fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
+    }
+
+    wait_for_stage();
+  };
+  #pragma unroll
+  for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
+    int n_tile_id = 0;
+
+    if constexpr (has_perm) {
+      load_perm_to_shared(k_tile_id);
+    }
+
+    start_pipes(k_tile_id, n_tile_id);
+
+    while (n_tile_id < n_tiles) {
+  #pragma unroll
+      for (int pipe = 0; pipe < repack_stages; pipe++) {
+        fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
+                        n_tile_id + pipe + repack_stages - 1);
+        repack_tile(pipe, k_tile_id, n_tile_id + pipe);
+        wait_for_stage();
+      }
+      n_tile_id += repack_stages;
+    }
+  }
+}
+
+}  // namespace gptq_marlin
+// `else if` arm: set the dynamic shared-memory limit, then launch one variant.
+  #define CALL_IF(NUM_BITS, HAS_PERM)                                          \
+    else if (num_bits == NUM_BITS && has_perm == HAS_PERM) {                   \
+      cudaFuncSetAttribute(                                                    \
+          gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads,       \
+                                            NUM_BITS, HAS_PERM>,               \
+          cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);        \
+      gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, NUM_BITS, \
+                                        HAS_PERM>                              \
+          <<<blocks, gptq_marlin::repack_threads, max_shared_mem, stream>>>(   \
+              b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n);              \
+    }
+// Repacks `b_q_weight` (int32, [size_k / pack_factor, size_n]) by launching
+// marlin_repack_kernel; pack_factor = 32 / num_bits. An empty `perm` disables
+// the act_order path. Returns a new tensor [size_k / tile_size, size_n *
+// tile_size / pack_factor]; TORCH_CHECK throws on invalid input or CUDA error.
+torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
+                                 int64_t size_k, int64_t size_n,
+                                 int64_t num_bits) {
+  // Verify compatibility with marlin tile of 16x64
+  TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
+              " is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
+  TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
+              " is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
+  int const pack_factor = 32 / num_bits;
+
+  // Verify B
+  TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
+              ", size_k = ", size_k, ", pack_factor = ", pack_factor);
+  TORCH_CHECK(b_q_weight.size(1) == size_n,
+              "b_q_weight.size(1) = ", b_q_weight.size(1),
+              " is not size_n = ", size_n);
+
+  // Verify device and strides
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+  TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
+  TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
+  TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
+  TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");
+
+  // Alloc buffers
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
+  auto options = torch::TensorOptions()
+                     .dtype(b_q_weight.dtype())
+                     .device(b_q_weight.device());
+  torch::Tensor out =
+      torch::empty({size_k / gptq_marlin::tile_size,
+                    size_n * gptq_marlin::tile_size / pack_factor},
+                   options);
+
+  // Detect if there is act_order
+  bool has_perm = perm.size(0) != 0;
+  // Get ptrs
+  uint32_t const* b_q_weight_ptr =
+      reinterpret_cast<uint32_t const*>(b_q_weight.data_ptr());
+  uint32_t const* perm_ptr = reinterpret_cast<uint32_t const*>(perm.data_ptr());
+  uint32_t* out_ptr = reinterpret_cast<uint32_t*>(out.data_ptr());
+
+  // Get dev info; both queries start from 0 so a failed call trips its check
+  int dev = b_q_weight.get_device();
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
+  int blocks = 0;
+  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
+  TORCH_CHECK(blocks > 0, "Failed to query the SM count of device ", dev);
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  if (false) {}  // no-op head so each CALL_IF can expand to an `else if` arm
+  CALL_IF(4, false)
+  CALL_IF(4, true)
+  CALL_IF(8, false)
+  CALL_IF(8, true)
+  else {
+    TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
+                ", has_perm = ", has_perm);
+  }
+  cudaError_t err = cudaGetLastError();  // launches return no status themselves
+  TORCH_CHECK(err == cudaSuccess, "repack failed: ", cudaGetErrorString(err));
+  return out;
+}
+
+#endif
diff --git a/server/marlin/marlin_kernels/marlin_cuda_kernel.cu b/server/marlin/marlin_kernels/marlin_cuda_kernel.cu
new file mode 100644
index 00000000..d124c014
--- /dev/null
+++ b/server/marlin/marlin_kernels/marlin_cuda_kernel.cu
@@ -0,0 +1,1136 @@
+/*
+ * Modified by Neural Magic
+ * Copyright (C) Marlin.2024 Elias Frantar
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+template <typename T>  // T must be accepted by an std::to_string overload
+inline std::string str(T x) {
+  return std::to_string(x);  // textual form of x
+}
+
+namespace marlin {
+// Integer division rounded up; exact ceiling for a >= 0 and b > 0.
+constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+
+// Instances of `Vec` are used to organize groups of >>registers<<, as needed
+// for instance as inputs to tensor core operations. Consequently, all
+// corresponding index accesses must be compile-time constants, which is why we
+// extensively use `#pragma unroll` throughout the kernel code to guarantee
+// this.
+template <typename T, int n>
+struct Vec {
+  T elems[n];  // the n elements, stored contiguously
+  __device__ T& operator[](int i) { return elems[i]; }  // no bounds check
+};
+
+using I4 = Vec<int, 4>;  // four ints; type of the `frag_b_quant` buffers
+
+// Matrix fragments for tensor core instructions; their precise layout is
+// documented here:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+using FragA = Vec<half2, 4>;  // A operand of mma()
+using FragB = Vec<half2, 2>;  // B operand of mma()
+using FragC = Vec<float, 4>;  // fp32 accumulator of mma()
+using FragS = Vec<half2, 1>;  // quantization scales
+
+// Predicated asynchronous global->shared copy; used for inputs A where we apply
+// predication to handle batchsizes that are not multiples of 16.
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {
+  const int BYTES = 16;  // copy size; passed to the asm as immediate %3
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"  // braces keep the predicate register `p` local to this snippet
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"  // p = (pred != 0)
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"  // copy only if p
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+// Unconditional asynchronous global->shared copy of 16 bytes (see BYTES).
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;  // copy size; passed to the asm as immediate %2
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"  // [smem] <- [glob_ptr]
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+}
+
+// Async copy fence; emits the PTX `cp.async.commit_group` instruction.
+__device__ inline void cp_async_fence() {
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+// Wait until at most `n` async copy stages are still pending.
+template <int n>
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));  // n is an immediate
+}
+
+// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 accumulation;
+// frag_c is both the accumulator input and the output (frag_c += A * B).
+__device__ inline void mma(const FragA& a_frag, const FragB& frag_b,
+                           FragC& frag_c) {
+  const uint32_t* a = reinterpret_cast<const uint32_t*>(&a_frag);  // a[0..3]
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);  // b[0..1]
+  float* c = reinterpret_cast<float*>(&frag_c);  // c[0..3], read and written
+  asm volatile(
+      "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
+      "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+      : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+      : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]),
+        "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);  // 4 output registers
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+               : "r"(smem));  // shared-space address derived from smem_ptr
+}
+
+// 3-input bitwise operation on (a, b, c) selected by the compile-time lookup
+// table `lut`; used explicitly for dequantization as the compiler does not seem
+// to recognize it automatically in all cases.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int res;  // written by the asm output operand %0
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(res)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));  // lut is immediate %4
+  return res;
+}
+
+// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16
+// values. We mostly follow the strategy in the link below, with some small
+// changes:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant(int q) {
+  const int LO = 0x000f000f;  // selects bits 0-3 of each 16-bit half
+  const int HI = 0x00f000f0;  // selects bits 4-7 of each 16-bit half
+  const int EX = 0x64006400;  // fp16 1024.0 in both halves
+  // Guarantee that the `(a & b) | c` operations are LOP3s.
+  int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);  // fp16 bits of 1024 + v
+  int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);  // fp16 bits of 1024 + 16 * v
+  // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
+  // directly into `SUB` and `ADD`.
+  const int SUB = 0x64086408;  // fp16 1032 = 1024 + 8
+  const int MUL = 0x2c002c00;  // fp16 1/16
+  const int ADD = 0xd480d480;  // fp16 -72 = -(1024 / 16 + 8)
+  FragB frag_b;
+  frag_b[0] = __hsub2(*reinterpret_cast<half2*>(&lo),
+                      *reinterpret_cast<const half2*>(&SUB));  // v - 8
+  frag_b[1] = __hfma2(*reinterpret_cast<half2*>(&hi),
+                      *reinterpret_cast<const half2*>(&MUL),
+                      *reinterpret_cast<const half2*>(&ADD));  // v - 8
+  return frag_b;
+}
+
+// Multiply frag_b by the i-th __half of frag_s, broadcast to both half2 lanes;
+// used only for grouped quantization.
+__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
+  half2 s = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]);
+  frag_b[0] = __hmul2(frag_b[0], s);  // scaled in place
+  frag_b[1] = __hmul2(frag_b[1], s);
+}
+
+// Wait until barrier reaches `count`, then lock for current threadblock.
+__device__ inline void barrier_acquire(int* lock, int count) {
+  if (threadIdx.x == 0) {  // only thread 0 polls the lock
+    int state = -1;
+    do
+      // Guarantee that subsequent writes by this threadblock will be visible
+      // globally.
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
+                   : "=r"(state)
+                   : "l"(lock));
+    while (state != count);  // spin until *lock equals count
+  }
+  __syncthreads();  // the remaining threads wait here for thread 0
+}
+
+// Release barrier and increment visitation count.
+__device__ inline void barrier_release(int* lock, bool reset = false) {
+  __syncthreads();  // every thread of the block arrives before the release
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;  // reset path: plain store of 0 instead of an increment
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"  // *lock += val
+                 :
+                 : "l"(lock), "r"(val));
+  }
+}
+
+template <const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void Marlin(
+    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,        // fp16 output buffer of shape mxn
+    const int4* __restrict__ s,  // fp16 quantization scales of shape
+                                 // (k/groupsize)xn
+    int prob_m,                  // batch dimension m
+    int prob_n,                  // output dimension n
+    int prob_k,                  // reduction dimension k
+    int* locks  // extra global storage for barrier synchronization
+) {
+  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
+  // same size, which might involve multiple column "slices" (of width 16 *
+  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
+  // example:
+  //   0 1 3
+  //   0 2 3
+  //   1 2 4
+  // While this kind of partitioning makes things somewhat more complicated, it
+  // ensures good utilization of all SMs for many kinds of shape and GPU
+  // configurations, while requiring as few slow global cross-threadblock
+  // reductions as possible.
+
+  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
+  // better partitioning with fewer reductions
+  int parallel = 1;
+  if (prob_m > 16 * thread_m_blocks) {
+    parallel = prob_m / (16 * thread_m_blocks);
+    prob_m = 16 * thread_m_blocks;
+  }
+
+  int k_tiles = prob_k / 16 / thread_k_blocks;
+  int n_tiles = prob_n / 16 / thread_n_blocks;
+  int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x);
+  // Ensure that the number of tiles in each stripe is a multiple of the
+  // groupsize; this avoids an annoying special case where a stripe starts in
+  // the middle of a group.
+  if (group_blocks != -1)
+    iters = (group_blocks / thread_k_blocks) *
+            ceildiv(iters, (group_blocks / thread_k_blocks));
+
+  int slice_row = (iters * blockIdx.x) % k_tiles;
+  int slice_col_par = (iters * blockIdx.x) / k_tiles;
+  int slice_col = slice_col_par;
+  int slice_iters;  // number of threadblock tiles in the current slice
+  int slice_count =
+      0;          // total number of active threadblocks in the current slice
+  int slice_idx;  // index of threadblock in current slice; numbered bottom to
+                  // top
+
+  // We can easily implement parallel problem execution by just remapping
+  // indices and advancing global pointers
+  if (slice_col_par >= n_tiles) {
+    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
+    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
+    locks += (slice_col_par / n_tiles) * n_tiles;
+    slice_col = slice_col_par % n_tiles;
+  }
+
+  // Compute all information about the current slice which is required for
+  // synchronization.
+  auto init_slice = [&]() {
+    slice_iters =
+        iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
+    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
+    if (slice_iters == 0) return;
+    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
+    slice_count = 1;
+    slice_idx = 0;
+    int col_first = iters * ceildiv(k_tiles * slice_col_par, iters);
+    if (col_first <= k_tiles * (slice_col_par + 1)) {
+      int col_off = col_first - k_tiles * slice_col_par;
+      slice_count = ceildiv(k_tiles - col_off, iters);
+      if (col_off > 0) slice_count++;
+      int delta_first = iters * blockIdx.x - col_first;
+      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
+        slice_idx = slice_count - 1;
+      else {
+        slice_idx = slice_count - 1 - delta_first / iters;
+        if (col_off > 0) slice_idx--;
+      }
+    }
+    if (slice_col == n_tiles) {
+      A += 16 * thread_m_blocks * prob_k / 8;
+      C += 16 * thread_m_blocks * prob_n / 8;
+      locks += n_tiles;
+      slice_col = 0;
+    }
+  };
+  init_slice();
+
+  int a_gl_stride = prob_k / 8;  // stride of the A matrix in global memory
+  // We typically use `constexpr` to indicate that this value is a compile-time
+  // constant
+  constexpr int a_sh_stride =
+      16 * thread_k_blocks / 8;  // stride of an A matrix tile in shared memory
+  constexpr int a_gl_rd_delta_o =
+      16 * thread_k_blocks /
+      8;  // delta between subsequent A tiles in global memory
+  int a_gl_rd_delta_i =
+      a_gl_stride *
+      (threads / a_gl_rd_delta_o);  // between subsequent accesses within a tile
+  constexpr int a_sh_wr_delta =
+      a_sh_stride *
+      (threads / a_gl_rd_delta_o);  // between shared memory writes
+  constexpr int a_sh_rd_delta_o =
+      2 * ((threads / 32) /
+           (thread_n_blocks / 4));  // between shared memory tile reads
+  constexpr int a_sh_rd_delta_i =
+      a_sh_stride * 16;  // within a shared memory tile
+  constexpr int a_sh_stage =
+      a_sh_stride * (16 * thread_m_blocks);  // overall size of a tile
+  constexpr int a_sh_wr_iters =
+      ceildiv(a_sh_stage,
+              a_sh_wr_delta);  // number of shared write iterations for a tile
+
+  int b_gl_stride = 16 * prob_n / 32;
+  constexpr int b_sh_stride = 32 * thread_n_blocks / 4;
+  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride);
+  constexpr int b_sh_wr_delta = threads;
+  constexpr int b_sh_rd_delta = threads;
+  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
+  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
+
+  int s_gl_stride = prob_n / 8;
+  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
+  constexpr int s_sh_stage = s_sh_stride;
+  int s_gl_rd_delta = s_gl_stride;
+
+  // Global A read index of current thread.
+  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  a_gl_rd += a_gl_rd_delta_o * slice_row;
+  // Shared write index of current thread.
+  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  // Shared read index.
+  int a_sh_rd =
+      a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
+  a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
+
+  int b_gl_rd =
+      b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride);
+  b_gl_rd += b_sh_stride * slice_col;
+  b_gl_rd += b_gl_rd_delta_o * slice_row;
+  int b_sh_wr = threadIdx.x;
+  int b_sh_rd = threadIdx.x;
+
+  int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
+                s_sh_stride * slice_col + threadIdx.x;
+  int s_sh_wr = threadIdx.x;
+  int s_sh_rd;
+  // We use a different scale layout for grouped and column-wise quantization as
+  // we scale a `half2` tile in column-major layout in the former and in
+  // row-major in the latter case.
+  if (group_blocks != -1)
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+              (threadIdx.x % 32) / 4;
+  else
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+              (threadIdx.x % 32) % 4;
+
+  // Precompute which thread should not read memory in which iterations; this is
+  // needed if there are more threads than required for a certain tilesize or
+  // when the batchsize is not a multiple of 16.
+  bool a_sh_wr_pred[a_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
+  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
+
+  // To ensure that writing and reading A tiles to/from shared memory, the
+  // latter in fragment format, is fully bank conflict free, we need to use a
+  // rather fancy XOR-based layout. The key here is that neither reads nor
+  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
+  // same shared memory banks. Further, it seems (based on NSight-Compute) that
+  // each warp must also write a consecutive memory segment?
+  auto transform_a = [&](int i) {
+    int row = i / a_gl_rd_delta_o;
+    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
+  };
+  // Since the computation of this remapping is non-trivial and, due to our main
+  // loop unrolls, all shared memory accesses are static, we simply precompute
+  // both transformed reads and writes.
+  int a_sh_wr_trans[a_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
+  int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks];
+  #pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++) {
+  #pragma unroll
+    for (int j = 0; j < thread_m_blocks; j++)
+      a_sh_rd_trans[i][j] =
+          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
+  }
+
+  // Since B-accesses have non-constant stride they have to be computed at
+  // runtime; we break dependencies between subsequent accesses within a tile
+  // by maintaining multiple pointers (we have enough registers), a tiny
+  // optimization.
+  const int4* B_ptr[b_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++)
+    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
+
+  extern __shared__ int4 sh[];
+  // Shared memory storage for global fetch pipelines.
+  int4* sh_a = sh;
+  int4* sh_b = sh_a + (stages * a_sh_stage);
+  int4* sh_s = sh_b + (stages * b_sh_stage);
+  // Register storage for double buffer of shared memory reads.
+  FragA frag_a[2][thread_m_blocks];
+  I4 frag_b_quant[2];
+  FragC frag_c[thread_m_blocks][4][2];
+  FragS frag_s[2][4];
+
+  // Zero accumulators.
+  auto zero_accums = [&]() {
+  #pragma unroll
+    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
+      reinterpret_cast<float*>(frag_c)[i] = 0;
+  };
+
+  // Asynchronously fetch the next A, B and s tile from global to the next
+  // shared memory pipeline location.
+  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
+    if (pred) {
+      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < a_sh_wr_iters; i++) {
+        cp_async4_pred(
+            &sh_a_stage[a_sh_wr_trans[i]],
+            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
+            a_sh_wr_pred[i]);
+      }
+      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < b_sh_wr_iters; i++) {
+        cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
+        B_ptr[i] += b_gl_rd_delta_o;
+      }
+      // Only fetch scales if this tile starts a new group
+      if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
+        int4* sh_s_stage = sh_s + s_sh_stage * pipe;
+        if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
+        s_gl_rd += s_gl_rd_delta;
+      }
+    }
+    // Insert a fence even when we are winding down the pipeline to ensure that
+    // waiting is also correct at this point.
+    cp_async_fence();
+  };
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<stages - 2>();
+    __syncthreads();
+  };
+
+  // Load the next sub-tile from the current location in the shared memory pipe
+  // into the current register buffer.
+  auto fetch_to_registers = [&](int k, int pipe) {
+    // It may seem inefficient that we reload the groups for every sub-tile;
+    // however, this does not seem to be a significant bottleneck, while some
+    // theoretically better attempts have led to bad instruction ordering by
+    // the compiler and correspondingly a noticeable drop in performance.
+    if (group_blocks != -1) {
+      int4* sh_s_stage =
+          sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
+                               (pipe / (group_blocks / thread_k_blocks)));
+      reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
+    }
+    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+  #pragma unroll
+    for (int i = 0; i < thread_m_blocks; i++)
+      ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]);
+    int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+    frag_b_quant[k % 2] = *reinterpret_cast<I4*>(
+        &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]);
+  };
+
+  // Execute the actual tensor core matmul of a sub-tile.
+  auto matmul = [&](int k) {
+  // We have the m dimension as the inner loop in order to encourage overlapping
+  // dequantization and matmul operations.
+  #pragma unroll
+    for (int j = 0; j < 4; j++) {
+      int b_quant = frag_b_quant[k % 2][j];
+      int b_quant_shift = b_quant >> 8;
+      FragB frag_b0 = dequant(b_quant);
+      // If there are no groups, we can just scale the final output once and can
+      // avoid doing so for each weight.
+      if (group_blocks != -1) scale(frag_b0, frag_s[k % 2][j], 0);
+      FragB frag_b1 = dequant(b_quant_shift);
+      if (group_blocks != -1) scale(frag_b1, frag_s[k % 2][j], 1);
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]);
+        mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]);
+      }
+    }
+  };
+
+  // Since we slice across the k dimension of a tile in order to increase the
+  // number of warps while keeping the n dimension of a tile reasonable, we have
+  // multiple warps that accumulate their partial sums of the same output
+  // location, which we have to reduce at the end. We do so in shared memory.
+  auto thread_block_reduce = [&]() {
+    constexpr int red_off = threads / b_sh_stride / 2;
+    if (red_off >= 1) {
+      int red_idx = threadIdx.x / b_sh_stride;
+      constexpr int red_sh_stride = b_sh_stride * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) +
+                      (threadIdx.x % b_sh_stride);
+
+      // Parallel logarithmic shared memory reduction. We make sure to avoid any
+      // unnecessary read or write iterations, e.g., for two warps we write only
+      // once by warp 1 and read only once by warp 0.
+
+  #pragma unroll
+      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
+  #pragma unroll
+        for (int i = red_off; i > 0; i /= 2) {
+          if (i <= red_idx && red_idx < 2 * i) {
+  #pragma unroll
+            for (int j = 0; j < 4 * 2; j++) {
+              int red_sh_wr =
+                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
+              if (i < red_off) {
+                float* c_rd =
+                    reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
+                float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
+  #pragma unroll
+                for (int k = 0; k < 4; k++)
+                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
+                      c_rd[k] + c_wr[k];
+              }
+              sh[red_sh_wr] =
+                  reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
+            }
+          }
+          __syncthreads();
+        }
+        if (red_idx == 0) {
+  #pragma unroll
+          for (int i = 0; i < 4 * 2; i++) {
+            float* c_rd =
+                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
+  #pragma unroll
+            for (int j = 0; j < 4; j++)
+              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
+                  c_rd[j];
+          }
+        }
+        __syncthreads();
+      }
+    }
+  };
+
+  // Since multiple threadblocks may process parts of the same column slice, we
+  // finally have to globally reduce over the results. As the striped
+  // partitioning minimizes the number of such reductions and our outputs are
+  // usually rather small, we perform this reduction serially in L2 cache.
+  auto global_reduce = [&](bool first = false, bool last = false) {
+    // We are very careful here to reduce directly in the output buffer to
+    // maximize L2 cache utilization in this step. To do this, we write out
+    // results in FP16 (but still reduce with FP32 compute).
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    if (threadIdx.x < active_threads) {
+      int c_gl_stride = prob_n / 8;
+      int c_gl_wr_delta_o = 8 * c_gl_stride;
+      int c_gl_wr_delta_i = 4 * (active_threads / 32);
+      int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) +
+                    4 * (threadIdx.x / 32) + threadIdx.x % 4;
+      c_gl_wr += (2 * thread_n_blocks) * slice_col;
+      constexpr int c_sh_wr_delta = active_threads;
+      int c_sh_wr = threadIdx.x;
+
+      int row = (threadIdx.x % 32) / 4;
+
+      if (!first) {
+  // Interestingly, doing direct global accesses here really seems to mess up
+  // the compiler and lead to slowdowns, hence we also use async-copies even
+  // though these fetches are not actually asynchronous.
+  #pragma unroll
+        for (int i = 0; i < thread_m_blocks * 4; i++) {
+          cp_async4_pred(
+              &sh[c_sh_wr + c_sh_wr_delta * i],
+              &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
+                 c_gl_wr_delta_i * (i % 2)],
+              i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
+        }
+        cp_async_fence();
+        cp_async_wait<0>();
+      }
+
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks * 4; i++) {
+        if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
+          if (!first) {
+            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
+  #pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<float*>(
+                  &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] +=
+                  __half2float(reinterpret_cast<__half*>(&c_red)[j]);
+            }
+          }
+          if (!last) {
+            int4 c;
+  #pragma unroll
+            for (int j = 0; j < 2 * 4; j++) {
+              reinterpret_cast<__half*>(&c)[j] =
+                  __float2half(reinterpret_cast<float*>(
+                      &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]);
+            }
+            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
+                c;
+          }
+        }
+      }
+    }
+  };
+
+  // Write out the reduced final result in the correct layout. We only actually
+  // reshuffle matrix fragments in this step; the reduction above is performed
+  // in fragment layout.
+  auto write_result = [&]() {
+    int c_gl_stride = prob_n / 8;
+    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
+    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
+    constexpr int c_sh_rd_delta =
+        c_sh_stride * (threads / (2 * thread_n_blocks));
+
+    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+    c_gl_wr += (2 * thread_n_blocks) * slice_col;
+    int c_sh_wr =
+        (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4;
+    c_sh_wr += 32 * (threadIdx.x / 32);
+    int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+
+    int c_gl_wr_end = c_gl_stride * prob_m;
+
+    // We first reorder in shared memory to guarantee the most efficient final
+    // global write patterns
+    auto write = [&](int idx, float c0, float c1, FragS& s) {
+      half2 res = __halves2half2(__float2half(c0), __float2half(c1));
+      if (group_blocks ==
+          -1)  // for per-column quantization we finally apply the scale here
+        res = __hmul2(res, s[0]);
+      ((half2*)sh)[idx] = res;
+    };
+    if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+  #pragma unroll
+        for (int j = 0; j < 4; j++) {
+          int wr = c_sh_wr + 8 * j;
+          write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
+                frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
+                frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+          write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
+                frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+          write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
+                frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+        }
+        c_sh_wr += 16 * (4 * c_sh_stride);
+      }
+    }
+    __syncthreads();
+
+  #pragma unroll
+    for (int i = 0;
+         i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
+         i++) {
+      if (c_gl_wr < c_gl_wr_end) {
+        C[c_gl_wr] = sh[c_sh_rd];
+        c_gl_wr += c_gl_wr_delta;
+        c_sh_rd += c_sh_rd_delta;
+      }
+    }
+  };
+
+  // Start global fetch and register load pipelines.
+  auto start_pipes = [&]() {
+  #pragma unroll
+    for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters);
+    zero_accums();
+    wait_for_stage();
+    fetch_to_registers(0, 0);
+    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
+  };
+  start_pipes();
+
+  // Main loop.
+  while (slice_iters) {
+  // We unroll over both the global fetch and the register load pipeline to
+  // ensure all shared memory accesses are static. Note that both pipelines have
+  // even length meaning that the next iteration will always start at index 0.
+  #pragma unroll
+    for (int pipe = 0; pipe < stages;) {
+  #pragma unroll
+      for (int k = 0; k < b_sh_wr_iters; k++) {
+        fetch_to_registers(k + 1, pipe % stages);
+        if (k == b_sh_wr_iters - 2) {
+          fetch_to_shared((pipe + stages - 1) % stages, pipe,
+                          slice_iters >= stages);
+          pipe++;
+          wait_for_stage();
+        }
+        matmul(k);
+      }
+      slice_iters--;
+      if (slice_iters == 0) break;
+    }
+    a_gl_rd += a_gl_rd_delta_o * stages;
+
+    // Process results and, if necessary, proceed to the next column slice.
+    // While this pattern may not be the most readable, other ways of writing
+    // the loop seemed to give noticeably worse performance after compilation.
+    if (slice_iters == 0) {
+      cp_async_wait<0>();
+      bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
+      if (group_blocks == -1 && last) {
+        if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]);
+        cp_async_fence();
+      }
+      thread_block_reduce();
+      if (group_blocks == -1 && last) {
+        cp_async_wait<0>();
+        __syncthreads();
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+          reinterpret_cast<int4*>(&frag_s)[0] = sh_s[s_sh_rd + 0];
+          reinterpret_cast<int4*>(&frag_s)[1] = sh_s[s_sh_rd + 4];
+        }
+      }
+      if (slice_count > 1) {  // only globally reduce if there is more than one
+                              // block in a slice
+        barrier_acquire(&locks[slice_col], slice_idx);
+        global_reduce(slice_idx == 0, last);
+        barrier_release(&locks[slice_col], last);
+      }
+      if (last)  // only the last block in a slice actually writes the result
+        write_result();
+      slice_row = 0;
+      slice_col_par++;
+      slice_col++;
+      init_slice();
+      if (slice_iters) {
+        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                  (threadIdx.x % a_gl_rd_delta_o);
+  #pragma unroll
+        for (int i = 0; i < b_sh_wr_iters; i++)
+          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
+        if (slice_col == 0) {
+  #pragma unroll
+          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
+        }
+        s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+        start_pipes();
+      }
+    }
+  }
+}
+
+#else
+
+// `#else` variant of the kernel: same template/argument list the launcher
+// uses, but no implementation (see the note in the body).
+template <const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) per threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of async global->shared fetch stages
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void Marlin(
+    const int4* __restrict__ A,  // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
+    int4* __restrict__ C,        // fp16 output buffer of shape mxn
+    const int4* __restrict__ s,  // fp16 quantization scales, (k/groupsize)xn
+    int prob_m,                  // batch dimension m
+    int prob_n,                  // output dimension n
+    int prob_k,                  // reduction dimension k
+    int* locks  // extra global storage for barrier synchronization
+) {
+  // Marlin is not implemented yet for SM < 8.0, so fail loudly. NOTE(review):
+  // with NDEBUG defined the assert is a no-op and C is left unwritten.
+  assert(false);
+  return;
+}
+
+#endif
+
+// 8 warps are a good choice since every SM has 4 schedulers and having more
+// than 1 warp per scheduler allows some more latency hiding. At the same time,
+// we want relatively few warps to have many registers per warp and small tiles.
+const int USER_THREADS =
+    256;               // Note: This is only used with user-provided thread_k/n
+const int STAGES = 4;  // 4 pipeline stages fit into shared memory
+const int SHARED_MEM =
+    96 * 1024;  // max shared memory on compute capability 8.6 (8.0 has more)
+
+static constexpr int min_thread_n = 64;
+static constexpr int min_thread_k = 64;
+
+static constexpr int tile_size = 16;
+static constexpr int max_par = 16;
+
+static constexpr int pack_factor_4bit =
+    8;  // We have 8 4-bit vals inside a 32 bit
+
+#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,           \
+                  GROUP_BLOCKS, NUM_THREADS)                                   \
+  else if (thread_m_blocks == THREAD_M_BLOCKS &&                               \
+           thread_n_blocks == THREAD_N_BLOCKS &&                               \
+           thread_k_blocks == THREAD_K_BLOCKS &&                               \
+           group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) {       \
+    cudaFuncSetAttribute(Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, \
+                                THREAD_K_BLOCKS, STAGES, GROUP_BLOCKS>,        \
+                         cudaFuncAttributeMaxDynamicSharedMemorySize,          \
+                         SHARED_MEM);                                          \
+    Marlin<NUM_THREADS, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,     \
+           STAGES, GROUP_BLOCKS><<<blocks, NUM_THREADS, SHARED_MEM, stream>>>( \
+        A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks);            \
+  }
+
+typedef struct {
+  int thread_k;
+  int thread_n;
+  int num_threads;
+} thread_config_t;
+
+static constexpr thread_config_t small_batch_thread_configs[] = {
+    // Candidates for prob_m <= 16, ordered by priority: the first entry that
+    // passes is_valid_config() is the one used.
+    // thread_k, thread_n, num_threads
+    {128, 128, 256},  // Default
+    {128, 64, 128},   // Reduce N 2X, same K
+    {64, 256, 256},   // Reduce K 2X, increase N 2X
+    {64, 128, 128},   // Reduce K 2X, same N
+};
+
+static constexpr thread_config_t large_batch_thread_configs[] = {
+    // Candidates for prob_m > 16, ordered by priority: the first entry that
+    // passes is_valid_config() is the one used.
+    // thread_k, thread_n, num_threads
+    {64, 256, 256},   // Default
+    {128, 128, 256},  // Reduce N 2X, increase K 2X
+    {64, 128, 128},   // Reduce N 2X, same K
+    {128, 64, 128},   // Reduce N 4X, increase K 2X
+};
+
+// Returns true iff `th_config` can be used for the given problem shape
+// (`prob_m` is currently not inspected). Range checks run before the
+// divisibility test so that a zero tile size (e.g. user-supplied thread_k or
+// thread_n of 0) is rejected rather than used as a modulus.
+bool is_valid_config(thread_config_t const& th_config, int prob_m, int prob_n,
+                     int prob_k) {
+  // Sanity: -1 marks an unset / "no config found" entry
+  if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
+      th_config.num_threads == -1) {
+    return false;
+  }
+
+  // thread_k can only be 128 or 64 (it must not exceed the groupsize, which
+  // is 128)
+  if (th_config.thread_k != 128 && th_config.thread_k != 64) {
+    return false;
+  }
+
+  // Verify min for thread K/N (this also guarantees both are non-zero)
+  if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
+    return false;
+  }
+
+  // num_threads must be at least 128 (= 4 warps)
+  if (th_config.num_threads < 128) {
+    return false;
+  }
+
+  // Verify K/N are divisible by thread K/N (divisors are known positive here)
+  return prob_k % th_config.thread_k == 0 && prob_n % th_config.thread_n == 0;
+}
+
+// Picks the first valid entry from the priority-ordered table for the batch
+// size (small table for prob_m <= 16, large table otherwise). Yields
+// {-1, -1, -1} when no entry of the chosen table is valid.
+thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) {
+  // Scan one table front to back and return the first usable config.
+  auto first_valid = [&](auto const& candidates) {
+    for (auto const& candidate : candidates) {
+      if (is_valid_config(candidate, prob_m, prob_n, prob_k)) {
+        return candidate;
+      }
+    }
+    return thread_config_t{-1, -1, -1};
+  };
+
+  const bool small_batch = prob_m <= 16;
+  return small_batch ? first_valid(small_batch_thread_configs)
+                     : first_valid(large_batch_thread_configs);
+}
+
+// Expands to the `else if` dispatch branches for thread_m_blocks = 1..4, each
+// with per-column scales (group_blocks = -1) and with group_blocks = 8.
+#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS)    \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
+  __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)  \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
+  __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)  \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
+  __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)  \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \
+  __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS)
+
+// Host-side launcher: picks a thread config, validates it against the problem
+// shape and launches the matching Marlin kernel instantiation once per chunk
+// of rows of A. Throws std::runtime_error on an invalid/unsupported config.
+void marlin_cuda(const void* A, const void* B, void* C, void* s, int prob_m,
+                 int prob_n, int prob_k, void* workspace, int groupsize = -1,
+                 int dev = 0, cudaStream_t stream = 0, int thread_k = -1,
+                 int thread_n = -1, int sms = -1, int max_par = 16) {
+  int tot_m = prob_m;
+  int tot_m_blocks = ceildiv(tot_m, 16);
+  int pad = 16 * tot_m_blocks - tot_m;
+
+  if (sms == -1)
+    cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
+
+  // Thread config: user-supplied if both thread_k/n are set, otherwise auto
+  thread_config_t th_config;
+  if (thread_k != -1 && thread_n != -1) {
+    th_config = thread_config_t{thread_k, thread_n, USER_THREADS};
+  } else {
+    th_config = determine_thread_config(prob_m, prob_n, prob_k);
+  }
+
+  if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) {
+    throw std::runtime_error(
+        "Invalid thread config: thread_k = " + str(th_config.thread_k) +
+        ", thread_n = " + str(th_config.thread_n) +
+        ", num_threads = " + str(th_config.num_threads) + " for MKN = [" +
+        str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]");
+  }
+
+  // Uncomment for debug
+  // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) +
+  //                  ", thread_n = " + str(th_config.thread_n) +
+  //                  ", num_threads = " + str(th_config.num_threads) +
+  //                  " for MKN = [" + str(prob_m) + ", " + str(prob_k) +
+  //                  ", " + str(prob_n) + "]\n";
+
+  int num_threads = th_config.num_threads;
+  thread_k = th_config.thread_k;
+  thread_n = th_config.thread_n;
+
+  int thread_k_blocks = thread_k / 16;
+  int thread_n_blocks = thread_n / 16;
+  int group_blocks = (groupsize == -1) ? -1 : groupsize / 16;
+  int blocks = sms;
+
+  if (prob_m == 0 || prob_n == 0 || prob_k == 0) {
+    return;
+  }
+
+  TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
+              " is not divisible by thread_n = ", thread_n);
+  TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
+              " is not divisible by thread_k = ", thread_k);
+  if (group_blocks != -1) {
+    TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
+                " is not divisible by group_blocks = ", group_blocks);
+  }
+
+  const int4* A_ptr = (const int4*)A;
+  const int4* B_ptr = (const int4*)B;
+  int4* C_ptr = (int4*)C;
+  const int4* s_ptr = (const int4*)s;
+
+  int* locks = (int*)workspace;
+
+  for (int i = 0; i < tot_m_blocks; i += 4) {
+    int thread_m_blocks = tot_m_blocks - i;
+    prob_m = tot_m - 16 * i;
+    int par = 1;
+    if (thread_m_blocks > 4) {
+      // Note: par > 1 currently only works for inputs without padding
+      par = (16 * thread_m_blocks - pad) / 64;
+      if (par > max_par) par = max_par;
+      prob_m = 64 * par;
+      i += 4 * (par - 1);
+      thread_m_blocks = 4;
+    }
+
+    // For compilation speed, we only define the kernel configurations that have
+    // seemed useful (in terms of performance) in our testing, however many more
+    // are, in principle, possible.
+    if (false) {
+    }
+    CALL_IF(8, 8, 256)
+    CALL_IF(16, 4, 256)
+    CALL_IF(8, 4, 128)
+    CALL_IF(4, 8, 128)
+    else {
+      throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) +
+                               ", " + str(prob_k) + ", " + str(prob_n) + "]" +
+                               ", groupsize = " + str(groupsize) +
+                               ", thread_m_blocks = " + str(thread_m_blocks) +
+                               ", thread_n_blocks = " + str(thread_n_blocks) +
+                               ", thread_k_blocks = " + str(thread_k_blocks));
+    }
+
+    A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
+    C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par;
+  }
+}
+
+}  // namespace marlin
+
+// Validates the operands, allocates the output `c` ([size_m, size_n], same
+// dtype/device as `a`) and runs Marlin on the current CUDA stream.
+torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                          torch::Tensor& b_scales, torch::Tensor& workspace,
+                          int64_t size_m, int64_t size_n, int64_t size_k) {
+  // Verify M
+  TORCH_CHECK(size_m == a.size(0),
+              "Shape mismatch: a.size(0) = " + str(a.size(0)) +
+                  ", size_m = " + str(size_m));
+
+  // Verify K
+  TORCH_CHECK(size_k == a.size(1),
+              "Shape mismatch: a.size(1) = " + str(a.size(1)) +
+                  ", size_k = " + str(size_k));
+  TORCH_CHECK(size_k % marlin::tile_size == 0,
+              "size_k = " + str(size_k) +
+                  " is not divisible by tile_size = " + str(marlin::tile_size));
+  TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = " +
+                  str(b_q_weight.size(0)) + ", size_k = " + str(size_k) +
+                  ", tile_size = " + str(marlin::tile_size));
+
+  // Verify N
+  TORCH_CHECK(b_scales.size(1) == size_n,
+              "b_scales.size(1) = " + str(b_scales.size(1)) +
+                  ", size_n = " + str(size_n));
+  TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0,
+              "b_q_weight.size(1) = " + str(b_q_weight.size(1)) +
+                  " is not divisible by tile_size = " + str(marlin::tile_size));
+
+  int actual_size_n =
+      (b_q_weight.size(1) / marlin::tile_size) * marlin::pack_factor_4bit;
+  TORCH_CHECK(
+      size_n == actual_size_n,
+      "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n));
+
+  // Verify A device, dtype (the kernel reads A as fp16) and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.scalar_type() == at::ScalarType::Half, "A is not fp16");
+  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+
+  // Verify B device and strides
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+
+  // Verify scales device, dtype (the kernel reads them as fp16) and strides
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.scalar_type() == at::ScalarType::Half,
+              "b_scales is not fp16");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+
+  // Alloc C matrix
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
+  torch::Tensor c = torch::empty({size_m, size_n}, options);
+
+  // Launch tuning knobs; -1 lets marlin_cuda choose automatically
+  int thread_k = -1;  // `k` size of a thread_tile in `weights`
+  int thread_n = -1;  // `n` size of a thread_tile in `weights`
+  int sms = -1;       // number of SMs to use for the kernel
+
+  // Detect groupsize (reject an empty scales tensor before dividing by it)
+  const int64_t num_groups = b_scales.size(0);
+  TORCH_CHECK(num_groups > 0, "b_scales.size(0) must be positive");
+  TORCH_CHECK(size_k % num_groups == 0,
+              "size_k = " + str(size_k) +
+                  ", is not divisible by b_scales.size(0) = " +
+                  str(num_groups));
+  int groupsize = num_groups == 1 ? -1 : size_k / num_groups;
+
+  // Verify groupsize
+  TORCH_CHECK(groupsize == -1 || groupsize == 128,
+              "Unexpected groupsize = " + str(groupsize));
+
+  // Verify workspace size
+  TORCH_CHECK(
+      size_n % marlin::min_thread_n == 0,
+      "size_n = " + str(size_n) +
+          ", is not divisible by min_thread_n = " + str(marlin::min_thread_n));
+  int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par;
+  TORCH_CHECK(workspace.numel() >= min_workspace_size,
+              "workspace.numel = " + str(workspace.numel()) +
+                  " is below min_workspace_size = " + str(min_workspace_size));
+
+  int dev = a.get_device();
+  marlin::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(),
+                      b_scales.data_ptr(), size_m, size_n, size_k,
+                      workspace.data_ptr(), groupsize, dev,
+                      at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n,
+                      sms, marlin::max_par);
+  return c;
+}
diff --git a/server/marlin/marlin_kernels/py.typed b/server/marlin/marlin_kernels/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/server/marlin/marlin_kernels/sparse/common/base.h b/server/marlin/marlin_kernels/sparse/common/base.h
new file mode 100644
index 00000000..16018d33
--- /dev/null
+++ b/server/marlin/marlin_kernels/sparse/common/base.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace marlin_24 {
+
+// Integer division rounding up (for non-negative `a` and positive `b`).
+constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; }
+
+// `Vec` organizes groups of >>registers<<, e.g. as inputs to tensor core
+// operations. All index accesses must therefore be compile-time constants,
+// which is why the kernel code uses `#pragma unroll` extensively.
+template <typename T, int n>
+struct Vec {
+  T elems[n];
+  __device__ T& operator[](int i) { return elems[i]; }
+};
+
+// Compile-time M/N/K triple.
+template <int M_, int N_, int K_>
+struct ShapeBase {
+  static constexpr int M = M_, N = N_, K = K_;
+};
+
+using I4 = Vec<int, 4>;
+
+// Matrix fragments for tensor core instructions; their precise layout is
+// documented here:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
+using FragA = Vec<half2, 4>;
+using FragB = Vec<half2, 2>;
+using FragM = Vec<uint, 1>;
+using FragC = Vec<float, 4>;
+using FragS = Vec<half2, 1>;  // quantization scales
+
+}  // namespace marlin_24
diff --git a/server/marlin/marlin_kernels/sparse/common/mem.h b/server/marlin/marlin_kernels/sparse/common/mem.h
new file mode 100644
index 00000000..83e3578d
--- /dev/null
+++ b/server/marlin/marlin_kernels/sparse/common/mem.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "base.h"
+
+namespace marlin_24 {
+// Predicated async 16-byte global->shared copy. With `zfill` set, src-size is
+// 0, so nothing is read from `glob_ptr` and the destination is zero-filled.
+__device__ inline void cp_async4_pred_zfill(void* smem_ptr,
+                                            const void* glob_ptr,
+                                            bool pred = true,
+                                            const bool zfill = false) {
+  const int BYTES = 16;
+  int src_in_bytes = (zfill ? 0 : BYTES);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3, %4;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES), "r"(src_in_bytes));
+}
+
+__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
+                                      bool pred = true) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   .reg .pred p;\n"
+      "   setp.ne.b32 p, %0, 0;\n"
+      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
+      "}\n" ::"r"((int)pred),
+      "r"(smem), "l"(glob_ptr), "n"(BYTES));
+}
+
+// Asynchronous global->shared copy
+__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+}
+
+// Async copy fence.
+__device__ inline void cp_async_fence() {
+  asm volatile("cp.async.commit_group;\n" ::);
+}
+
+// Wait until at most `n` async copy stages are still pending.
+template <int n>
+__device__ inline void cp_async_wait() {
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
+}
+
+// Instruction for loading a full 16x16 matrix fragment of operand A from shared
+// memory, directly in tensor core layout.
+__device__ inline void ldsm4(FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+               : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+               : "r"(smem));
+}
+
+__device__ inline void ldsm4_m(FragM& frag_m, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_m);  // FragM holds 1 elem
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n"
+               : "=r"(a[0]), "=r"(a[1])  // NOTE(review): a[1] is past frag_m;
+               : "r"(smem));             // callers must own the next word
+}
+
+// Like `ldsm4`, but with the `.trans` qualifier on `ldmatrix`, i.e. the 8x8
+// matrices are loaded transposed from shared memory.
+__device__ inline void ldsm4_t(FragA& frag_a, const void* smem_ptr) {
+  uint32_t* a = reinterpret_cast<uint32_t*>(&frag_a);
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];\n"
+      : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3])
+      : "r"(smem));
+}
+
+// Wait until barrier reaches `count`, then lock for current threadblock.
+__device__ inline void barrier_acquire(int* lock, int count) {
+  if (threadIdx.x == 0) {  // only thread 0 spins on the lock word
+    int state = -1;
+    do
+      // GPU-scope acquire load: re-read `*lock` until it equals `count`
+      // (presumably paired with the fence + add in barrier_release).
+      asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n"
+                   : "=r"(state)
+                   : "l"(lock));
+    while (state != count);
+  }
+  __syncthreads();  // the rest of the block waits here for thread 0
+}
+
+// Release barrier and increment visitation count (or zero it if `reset`).
+__device__ inline void barrier_release(int* lock, bool reset = false) {
+  __syncthreads();  // wait for every thread of the block first
+  if (threadIdx.x == 0) {
+    if (reset) {
+      lock[0] = 0;  // plain store; the fence below is skipped on this path
+      return;
+    }
+    int val = 1;
+    // Make sure that all writes since acquiring this barrier are visible
+    // globally, while releasing the barrier.
+    asm volatile("fence.acq_rel.gpu;\n");
+    asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n"
+                 :
+                 : "l"(lock), "r"(val));
+  }
+}
+}  // namespace marlin_24
diff --git a/server/marlin/marlin_kernels/sparse/common/mma.h b/server/marlin/marlin_kernels/sparse/common/mma.h
new file mode 100644
index 00000000..b26505f7
--- /dev/null
+++ b/server/marlin/marlin_kernels/sparse/common/mma.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "base.h"
+#include <cudaTypedefs.h>
+
+namespace marlin_24 {
+
+// The `.sp::ordered_metadata` form of the sparse mma instruction is not
+// supported before CUDA 12.5. On later toolkits, using the plain `.sp` form
+// instead results in the following warning:
+//  | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction
+//  | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially
+//  | reduced performance on some future architectures
+#if defined CUDA_VERSION && CUDA_VERSION >= 12050
+  #define MMA_SP_INST \
+    "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+#else
+  #define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+#endif
+
+// m16n8k32 sparse tensor core mma with fp16 inputs and fp32 accumulation,
+// issued twice: c[0..3] use the even words of `frag_b`, c[4..7] the odd ones.
+__device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
+                              const FragA& frag_b, FragC& frag_c, FragM& frag_m,
+                              const int psel) {
+  const uint32_t* a0 = reinterpret_cast<const uint32_t*>(&a_frag0);
+  const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1);
+  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
+  const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m);
+
+  float* c = reinterpret_cast<float*>(&frag_c);
+  if (psel == 0) {  // psel chooses the last immediate operand: 0x0 or 0x1
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x0;\n"
+                 : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
+                   "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
+                   "f"(c[2]), "f"(c[3]), "r"(e[0]));
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x0;\n"
+                 : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
+                   "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
+                   "f"(c[6]), "f"(c[7]), "r"(e[0]));
+  } else {  // same two instructions, but with the immediate set to 0x1
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x1;\n"
+                 : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
+                   "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
+                   "f"(c[2]), "f"(c[3]), "r"(e[0]));
+    asm volatile(MMA_SP_INST
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x1;\n"
+                 : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
+                   "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
+                   "f"(c[6]), "f"(c[7]), "r"(e[0]));
+  }
+}
+
+// Three-input bitwise operation selected by the lookup table `lut`; emitted
+// explicitly for dequantization because the compiler does not seem to
+// recognize such expressions automatically in all cases.
+template <int lut>
+__device__ inline int lop3(int a, int b, int c) {
+  int out;
+  asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+               : "=r"(out)
+               : "r"(a), "r"(b), "r"(c), "n"(lut));
+  return out;
+}
+
+// Rounds four floats to fp16 (`cvt.rn`) and packs {c0, c1} / {c2, c3}.
+__device__ __forceinline__ uint2 to_half4(float c0, float c1, float c2,
+                                          float c3) {
+  uint2 packed;
+  asm("{\n\t"
+      ".reg .f16 a, b, c, d; \n\t"
+      "cvt.rn.f16.f32 a, %2; \n\t"
+      "cvt.rn.f16.f32 b, %3; \n\t"
+      "cvt.rn.f16.f32 c, %4; \n\t"
+      "cvt.rn.f16.f32 d, %5; \n\t"
+      "mov.b32 %0, {a, b};   \n\t"
+      "mov.b32 %1, {c, d};   \n\t" "}"
+      : "=r"(packed.x), "=r"(packed.y)
+      : "f"(c0), "f"(c1), "f"(c2), "f"(c3));
+  return packed;
+}
+
+// Byte permute: assembles the result from bytes of `a` and of the
+// compile-time word `start_byte`, as selected by the immediate `mask`.
+template <int start_byte, int mask>
+__device__ inline uint32_t prmt(uint32_t a) {
+  uint32_t out;
+  asm volatile("prmt.b32 %0, %1, %2, %3;\n"
+               : "=r"(out)
+               : "r"(a), "n"(start_byte), "n"(mask));
+  return out;
+}
+
+// Expands one packed 32-bit word of 4-bit weights into a B-fragment holding
+// 4 fp16 values. The approach largely mirrors the converter linked below,
+// with a few small modifications:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant_4bit(int q) {
+  const int kLoMask = 0x000f000f;
+  const int kHiMask = 0x00f000f0;
+  const int kExpBits = 0x64006400;
+  // Spell out `(q & mask) | bits` as an explicit LOP3 instruction.
+  int lo_bits = lop3<(0xf0 & 0xcc) | 0xaa>(q, kLoMask, kExpBits);
+  int hi_bits = lop3<(0xf0 & 0xcc) | 0xaa>(q, kHiMask, kExpBits);
+  // Outputs are signed int4, so the symmetric `-8` zero point is folded
+  // straight into the `kSub` and `kAdd` constants.
+  const int kSub = 0x64086408;
+  const int kMul = 0x2c002c00;
+  const int kAdd = 0xd480d480;
+
+  FragB result;
+  result[0] = __hsub2(*reinterpret_cast<half2*>(&lo_bits),
+                      *reinterpret_cast<const half2*>(&kSub));
+  result[1] = __hfma2(*reinterpret_cast<half2*>(&hi_bits),
+                      *reinterpret_cast<const half2*>(&kMul),
+                      *reinterpret_cast<const half2*>(&kAdd));
+  return result;
+}
+
+// Expands one packed 32-bit word of 8-bit weights into a B-fragment holding
+// 4 fp16 values. The approach largely mirrors the converter linked below,
+// with a few small modifications:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+__device__ inline FragB dequant_8bit(int q) {
+  static constexpr uint32_t fp16_top_bytes = 0x64646464;
+  static constexpr uint32_t select_elt_01 = 0x5250;
+  static constexpr uint32_t select_elt_23 = 0x5351;
+
+  uint32_t pair_lo = prmt<fp16_top_bytes, select_elt_01>(q);
+  uint32_t pair_hi = prmt<fp16_top_bytes, select_elt_23>(q);
+
+  static constexpr uint32_t magic_bias = 0x64806480;
+
+  FragB result;
+  result[0] = __hsub2(*reinterpret_cast<half2*>(&pair_lo),
+                      *reinterpret_cast<const half2*>(&magic_bias));
+  result[1] = __hsub2(*reinterpret_cast<half2*>(&pair_hi),
+                      *reinterpret_cast<const half2*>(&magic_bias));
+  return result;
+}
+
+// Scales both halves of a dequantized B-fragment by the i-th fp16 entry of
+// `frag_s`; needed only when quantization is grouped.
+__device__ inline void scale(FragB& frag_b, FragS& frag_s, int i) {
+  const half2 factor = __half2half2(reinterpret_cast<__half*>(&frag_s)[i]);
+  frag_b[0] = __hmul2(frag_b[0], factor);
+  frag_b[1] = __hmul2(frag_b[1], factor);
+}
+
+__device__ inline void scale_floats(float* c0, float* c1, float* c2, float* c3,
+                                    FragS& s0, float* c4, float* c5, float* c6,
+                                    float* c7, FragS& s1) {
+  auto mul = [](float* v, __half s) { *v = __fmul_rn(*v, __half2float(s)); };
+  mul(c0, s0[0].x);  // c0..c3 take the four scales held in s0
+  mul(c1, s0[0].y);
+  mul(c2, s0[1].x);
+  mul(c3, s0[1].y);
+  mul(c4, s1[0].x);  // c4..c7 take the four scales held in s1
+  mul(c5, s1[0].y);
+  mul(c6, s1[1].x);
+  mul(c7, s1[1].y);
+}
+
+}  // namespace marlin_24
diff --git a/server/marlin/marlin_kernels/sparse/marlin_24_cuda_kernel.cu b/server/marlin/marlin_kernels/sparse/marlin_24_cuda_kernel.cu
new file mode 100644
index 00000000..b5effc30
--- /dev/null
+++ b/server/marlin/marlin_kernels/sparse/marlin_24_cuda_kernel.cu
@@ -0,0 +1,1125 @@
+/*
+ * Notice: This file was modified by Neuralmagic inc to include 8-bit support
+ *
+ * Copyright (C) 2024 Roberto Lopez Castro (roberto.lopez.castro@udc.es). All
+ * Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+#include "common/base.h"
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+#else
+
+  #include "common/mem.h"
+  #include "common/mma.h"
+
+#endif
+
+// Renders `x` as text by forwarding to std::to_string, so `T` must be one of
+// the arithmetic types that std::to_string is overloaded for.
+template <class T>
+inline std::string str(T x) { return std::to_string(x); }
+
+namespace marlin_24 {
+
+// 8 warps are a good choice since every SM has 4 schedulers and having more
+// than 1 warp per scheduler allows some more latency hiding. At the same time,
+// we want relatively few warps to have many registers per warp and small tiles.
+static constexpr int THREADS = 256;  // 8 warps of 32 threads
+static constexpr int STAGES = 4;
+
+static constexpr int min_thread_n = 128;
+
+static constexpr int tile_size = 16;
+static constexpr int max_par = 64;
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+
+template <const int num_bits,         // bits per quantized weight
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void Marlin_24(
+    const int4* __restrict__ A,     // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,     // quantized weight matrix of shape kxn
+    const int4* __restrict__ meta,  // 2bit metadata information about 2:4
+                                    // format on B
+    int4* __restrict__ C,           // fp16 output buffer of shape mxn
+    const int4* __restrict__ s,     // fp16 quantization scales of shape
+                                    // (k/groupsize)xn
+    int prob_m,                     // batch dimension m
+    int prob_n,                     // output dimension n
+    int prob_k,                     // reduction dimension k
+    int* locks  // extra global storage for barrier synchronization
+) {}  // intentionally empty: stub used when __CUDA_ARCH__ < 800
+
+torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                                  torch::Tensor& b_meta,
+                                  torch::Tensor& b_scales,
+                                  torch::Tensor& workspace, int64_t num_bits,
+                                  int64_t size_m, int64_t size_n,
+                                  int64_t size_k) {  // stub: arch < 800 build
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false, "gptq_marlin_24_gemm(..) requires CUDA_ARCH >= 8.0");
+  return torch::empty({1, 1});  // placeholder result; arguments are unused
+}
+
+#else
+
+template <const int num_bits,         // weight bits
+          const int threads,          // number of threads in a threadblock
+          const int thread_m_blocks,  // number of 16x16 blocks in the m
+                                      // dimension (batchsize) of the
+                                      // threadblock
+          const int thread_n_blocks,  // same for n dimension (output)
+          const int thread_k_blocks,  // same for k dimension (reduction)
+          const int stages,  // number of stages for the async global->shared
+                             // fetch pipeline
+          const int group_blocks = -1  // number of consecutive 16x16 blocks
+                                       // with a separate quantization scale
+          >
+__global__ void Marlin_24(
+    const int4* __restrict__ A,     // fp16 input matrix of shape mxk
+    const int4* __restrict__ B,     // 4bit quantized weight matrix of shape kxn
+    const int4* __restrict__ meta,  // 2bit metadata information about 2:4
+                                    // format on B
+    int4* __restrict__ C,           // fp16 output buffer of shape mxn
+    const int4* __restrict__ s,     // fp16 quantization scales of shape
+                                    // (k/groupsize)xn
+    int prob_m,                     // batch dimension m
+    int prob_n,                     // output dimension n
+    int prob_k,                     // reduction dimension k
+    int* locks  // extra global storage for barrier synchronization
+) {
+  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
+  // same size, which might involve multiple column "slices" (of width 16 *
+  // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM
+  // example:
+  //   0 1 3
+  //   0 2 3
+  //   1 2 4
+  // While this kind of partitioning makes things somewhat more complicated, it
+  // ensures good utilization of all SMs for many kinds of shape and GPU
+  // configurations, while requiring as few slow global cross-threadblock
+  // reductions as possible.
+
+  // For larger GEMMs we run multiple batchsize 64 versions in parallel for a
+  // better partitioning with less reductions
+  int parallel = 1;
+  if (prob_m > 16 * thread_m_blocks) {
+    parallel = prob_m / (16 * thread_m_blocks);
+    prob_m = 16 * thread_m_blocks;
+  }
+
+  // number of thread_k_blocks in k-dim
+  int k_tiles = prob_k / 32 / thread_k_blocks;
+  // number of thread_n_blocks in n-dim
+  int n_tiles = prob_n / 16 / thread_n_blocks;
+  // iters needed to cover all slices
+  int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x);
+
+  // Ensure that the number of tiles in each stripe is a multiple of the
+  // groupsize; this avoids an annoying special case where a stripe starts in
+  // the middle of group.
+  if (group_blocks != -1)
+    iters = (group_blocks / thread_k_blocks) *
+            ceildiv(iters, (group_blocks / thread_k_blocks));
+
+  int slice_row = (iters * blockIdx.x) % k_tiles;
+  int slice_col_par = (iters * blockIdx.x) / k_tiles;
+  int slice_col = slice_col_par;
+  // number of threadblock tiles in the current slice
+  int slice_iters;
+  // total number of active threadblocks in the current slice
+  int slice_count = 0;
+  // index of threadblock in current slice; numbered bottom to top
+  int slice_idx;
+
+  // We can easily implement parallel problem execution by just remapping
+  // indices and advancing global pointers
+  if (slice_col_par >= n_tiles) {
+    A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8;
+    C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8;
+    locks += (slice_col_par / n_tiles) * n_tiles;
+    slice_col = slice_col_par % n_tiles;
+  }
+
+  // Compute all information about the current slice which is required for
+  // synchronization.
+  auto init_slice = [&]() {
+    slice_iters =
+        iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row);
+    if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0;
+    if (slice_iters == 0) return;
+    if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row;
+    slice_count = 1;
+    slice_idx = 0;
+    int col_first = iters * ceildiv(k_tiles * slice_col_par, iters);
+    if (col_first <= k_tiles * (slice_col_par + 1)) {
+      int col_off = col_first - k_tiles * slice_col_par;
+      slice_count = ceildiv(k_tiles - col_off, iters);
+      if (col_off > 0) slice_count++;
+      int delta_first = iters * blockIdx.x - col_first;
+      if (delta_first < 0 || (col_off == 0 && delta_first == 0))
+        slice_idx = slice_count - 1;
+      else {
+        slice_idx = slice_count - 1 - delta_first / iters;
+        if (col_off > 0) slice_idx--;
+      }
+    }
+    if (slice_col == n_tiles) {
+      A += 16 * thread_m_blocks * prob_k / 8;
+      C += 16 * thread_m_blocks * prob_n / 8;
+      locks += n_tiles;
+      slice_col = 0;
+    }
+  };
+  init_slice();
+
+  // RLC: 8 is vec_size -> 128-bit instructions, 8 fp16 elements
+  int a_gl_stride = prob_k / 8;  // stride of the A matrix in global memory
+
+  // stride of an A matrix tile in shared memory
+  constexpr int a_sh_stride = 32 * thread_k_blocks / 8;
+  // delta between subsequent A tiles in global memory
+  constexpr int a_gl_rd_delta_o = 32 * thread_k_blocks / 8;
+  // between subsequent accesses within a tile
+  int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory writes
+  constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o);
+  // between shared memory tile reads //RLC: 2 * #warps k-dim
+  constexpr int a_sh_rd_delta_o = 4 * ((threads / 32) / (thread_n_blocks / 4));
+  // within a shared memory tile
+  constexpr int a_sh_rd_delta_i = a_sh_stride * 16;
+  // overall size of a tile
+  constexpr int a_sh_stage = a_sh_stride * (16 * thread_m_blocks);
+  // number of shared write iterations for a tile
+  constexpr int a_sh_wr_iters = ceildiv(a_sh_stage, a_sh_wr_delta);
+
+  constexpr int pack_factor = 32 / num_bits;
+
+  int b_gl_stride = 16 * prob_n / (pack_factor * 4);
+  constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4;
+  constexpr int b_thread_vecs = num_bits == 4 ? 1 : 2;
+  constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs;
+  int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks;
+  int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads);
+  constexpr int b_sh_wr_delta = threads * b_thread_vecs;
+  constexpr int b_sh_rd_delta = threads * b_thread_vecs;
+  constexpr int b_sh_stage = b_sh_stride * thread_k_blocks;
+  constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta;
+
+  int m_gl_stride = 2 * prob_n / 8;  // (16*2*4 / 8) = 16
+  constexpr int m_sh_stride =
+      (16 * thread_n_blocks) / 4;  // #warps n-dim * threads/warp
+  int m_gl_rd_delta_o = m_gl_stride * thread_k_blocks;
+  int m_gl_rd_delta_i = m_gl_stride * (threads / m_sh_stride);
+  constexpr int m_sh_wr_delta = threads / 2;
+  constexpr int m_sh_rd_delta = threads / 2;
+  constexpr int m_sh_stage = m_sh_stride * thread_k_blocks;
+  constexpr int m_sh_iters = ceildiv(m_sh_stage, m_sh_wr_delta);
+
+  int s_gl_stride = prob_n / 8;
+  constexpr int s_sh_stride = 16 * thread_n_blocks / 8;
+  constexpr int s_sh_stage = s_sh_stride;
+  int s_gl_rd_delta = s_gl_stride;
+
+  // Global A read index of current thread.
+  int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  a_gl_rd += a_gl_rd_delta_o * slice_row;
+  // Shared write index of current thread.
+  int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                (threadIdx.x % a_gl_rd_delta_o);
+  // Shared read index.
+  int a_sh_rd =
+      a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16;
+  a_sh_rd += 4 * ((threadIdx.x / 32) / (thread_n_blocks / 4));
+
+  int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) +
+                (threadIdx.x % b_sh_stride_threads) * b_thread_vecs;
+  b_gl_rd += b_sh_stride * slice_col;
+  b_gl_rd += b_gl_rd_delta_o * slice_row;
+  int b_sh_wr = threadIdx.x * b_thread_vecs;
+  int b_sh_rd = threadIdx.x * b_thread_vecs;
+
+  int m_gl_rd = m_gl_stride * (threadIdx.x / (m_sh_stride)) +
+                (threadIdx.x % (m_sh_stride));
+  m_gl_rd += (m_sh_stride)*slice_col;
+  m_gl_rd += m_gl_rd_delta_o * slice_row;
+  int m_sh_wr = threadIdx.x;
+  int m_sh_rd = threadIdx.x % 16 + (threadIdx.x / 32) * 16;
+
+  int s_gl_rd;
+  if constexpr (group_blocks == -1) {
+    s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+  } else {
+    s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) +
+              s_sh_stride * slice_col + threadIdx.x;
+  }
+
+  int s_sh_wr = threadIdx.x;
+  int s_sh_rd;
+  // We use a different scale layout for grouped and column-wise quantization as
+  // we scale a `half2` tile in column-major layout in the former and in
+  // row-major in the latter case.
+  if (group_blocks != -1) {
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+              (threadIdx.x % 32) / 4;
+  } else {
+    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+              (threadIdx.x % 32) / 4;
+  }
+
+  // Precompute which thread should not read memory in which iterations; this is
+  // needed if there are more threads than required for a certain tilesize or
+  // when the batchsize is not a multiple of 16.
+  bool a_sh_wr_pred[a_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++) {
+    a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m;
+  }
+  bool s_sh_wr_pred = threadIdx.x < s_sh_stride;
+
+  // To ensure that writing and reading A tiles to/from shared memory, the
+  // latter in fragment format, is fully bank conflict free, we need to use a
+  // rather fancy XOR-based layout. The key here is that neither reads nor
+  // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the
+  // same shared memory banks. Further, it seems (based on NSight-Compute) that
+  // each warp must also write a consecutive memory segment?
+  auto transform_a = [&](int i) {
+    int row = i / a_gl_rd_delta_o;
+    return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row;
+  };
+  // Since the computation of this remapping is non-trivial and, due to our main
+  // loop unrolls, all shared memory accesses are static, we simply precompute
+  // both transformed reads and writes.
+  int a_sh_wr_trans[a_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < a_sh_wr_iters; i++)
+    a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr);
+  int a_sh_rd_trans[2][b_sh_wr_iters][thread_m_blocks];
+  #pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++) {
+  #pragma unroll
+    for (int j = 0; j < thread_m_blocks; j++) {
+      a_sh_rd_trans[0][i][j] =
+          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd);
+      a_sh_rd_trans[1][i][j] =
+          transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd + 2);
+    }
+  }
+
+  // Since B-accesses have non-constant stride they have to be computed at
+  // runtime; we break dependencies between subsequent accesses within a tile
+  // by maintaining multiple pointers (we have enough registers), a tiny
+  // optimization.
+  const int4* B_ptr[b_sh_wr_iters];
+  #pragma unroll
+  for (int i = 0; i < b_sh_wr_iters; i++)
+    B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd;
+
+  bool m_sh_wr_pred = threadIdx.x < m_sh_wr_delta;
+  const int4* meta_ptr[m_sh_iters];
+  #pragma unroll
+  for (int i = 0; i < m_sh_iters; i++)
+    meta_ptr[i] = meta + m_gl_rd_delta_i * i + m_gl_rd;
+
+  extern __shared__ int4 sh[];
+  // Shared memory storage for global fetch pipelines.
+  int4* sh_a = sh;
+  int4* sh_b = sh_a + (stages * a_sh_stage);
+  int4* sh_s = sh_b + (stages * b_sh_stage);
+  int4* sh_m = sh_s + (stages * s_sh_stage);
+  // Register storage for double buffer of shared memory reads.
+  FragA frag_a[2][thread_m_blocks][2];
+  I4 frag_b_quant[2][b_thread_vecs];
+  FragM frag_m[2][2];
+  FragC frag_c[thread_m_blocks][4][2];
+  FragS frag_s[2][4];
+
+  // Zero accumulators.
+  auto zero_accums = [&]() {
+  #pragma unroll
+    for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++)
+      reinterpret_cast<float*>(frag_c)[i] = 0;
+  };
+
+  // Asynchronously fetch the next A, B and s tile from global to the next
+  // shared memory pipeline location.
+  auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) {
+    if (pred) {
+      int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < a_sh_wr_iters; i++) {
+        cp_async4_pred(
+            &sh_a_stage[a_sh_wr_trans[i]],
+            &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off],
+            a_sh_wr_pred[i]);
+      }
+      int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < b_sh_wr_iters; i++) {
+  #pragma unroll
+        for (int j = 0; j < b_thread_vecs; j++) {
+          cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j);
+        }
+        B_ptr[i] += b_gl_rd_delta_o;
+      }
+      int4* sh_meta_stage = sh_m + m_sh_stage * pipe;
+  #pragma unroll
+      for (int i = 0; i < m_sh_iters; i++) {
+        if (m_sh_wr_pred)
+          cp_async4(&sh_meta_stage[m_sh_wr_delta * i + m_sh_wr], meta_ptr[i]);
+        meta_ptr[i] += m_gl_rd_delta_o;
+      }
+      // Only fetch scales if this tile starts a new group
+      if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
+        int4* sh_s_stage = sh_s + s_sh_stage * pipe;
+        if (s_sh_wr_pred) cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
+        s_gl_rd += s_gl_rd_delta;
+      }
+    }
+    // Insert a fence even when we are winding down the pipeline to ensure that
+    // waiting is also correct at this point.
+    cp_async_fence();
+  };
+
+  // Wait until the next thread tile has been loaded to shared memory.
+  auto wait_for_stage = [&]() {
+    // We only have `stages - 2` active fetches since we are double buffering
+    // and can only issue the next fetch when it is guaranteed that the previous
+    // shared memory load is fully complete (as it may otherwise be
+    // overwritten).
+    cp_async_wait<stages - 2>();
+    __syncthreads();
+  };
+
+  // Load the next sub-tile from the current location in the shared memory pipe
+  // into the current register buffer.
+  auto fetch_to_registers = [&](int k, int pipe) {
+    // It may seem inefficient that we reload the groups for every sub-tile;
+    // however, this does not seem to be a significant bottleneck, while some
+    // theoretically better attempts have led to bad instruction ordering by
+    // the compiler and correspondingly a noticeable drop in performance.
+    if (group_blocks != -1) {
+      int4* sh_s_stage =
+          sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
+                               (pipe / (group_blocks / thread_k_blocks)));
+      reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
+    }
+    int4* sh_a_stage = sh_a + a_sh_stage * pipe;
+  #pragma unroll
+    for (int i = 0; i < thread_m_blocks; i++) {
+      ldsm4(frag_a[k % 2][i][0],
+            &sh_a_stage[a_sh_rd_trans[0][k % b_sh_wr_iters][i]]);
+      ldsm4(frag_a[k % 2][i][1],
+            &sh_a_stage[a_sh_rd_trans[1][k % b_sh_wr_iters][i]]);
+    }
+
+    int4* sh_b_stage = sh_b + b_sh_stage * pipe;
+  #pragma unroll
+    for (int i = 0; i < b_thread_vecs; i++) {
+      frag_b_quant[k % 2][i] = *reinterpret_cast<I4*>(
+          &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]);
+    }
+
+    // Load meta with ldsm4
+    int4* sh_m_stage = sh_m + m_sh_stage * pipe;
+    ldsm4_m(frag_m[k % 2][0],
+            &sh_m_stage[m_sh_rd_delta * (k % m_sh_iters) + m_sh_rd]);
+  };
+
+  // Execute the actual tensor core matmul of a sub-tile.
+  auto matmul = [&](int k) {
+  // We have the m dimension as the inner loop in order to encourage overlapping
+  // dequantization and matmul operations.
+  #pragma unroll
+    for (int j = 0; j < 4; j++) {
+      FragB frag_b0;
+      FragB frag_b1;
+
+      if constexpr (num_bits == 4) {
+        int b_quant = frag_b_quant[k % 2][0][j];
+        int b_quant_shift = b_quant >> 8;
+
+        frag_b0 = dequant_4bit(b_quant);
+        frag_b1 = dequant_4bit(b_quant_shift);
+
+      } else {
+        int* frag_b_quant_ptr = reinterpret_cast<int*>(frag_b_quant[k % 2]);
+        int b_quant_0 = frag_b_quant_ptr[j * 2 + 0];
+        int b_quant_1 = frag_b_quant_ptr[j * 2 + 1];
+
+        frag_b0 = dequant_8bit(b_quant_0);
+        frag_b1 = dequant_8bit(b_quant_1);
+      }
+
+      // If there are no groups, we can just scale the final output once and can
+      // avoid doing so for each weight.
+      if constexpr (group_blocks != -1) {
+        scale(frag_b0, frag_s[k % 2][j], 0);
+      }
+      if constexpr (group_blocks != -1) {
+        scale(frag_b1, frag_s[k % 2][j], 1);
+      }
+
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        mma_sp(frag_b0, frag_b1, frag_a[k % 2][i][0], frag_c[i][j][0],
+               frag_m[k % 2][j / 2], j % 2);
+      }
+    }
+  };
+
+  // Since we slice across the k dimension of a tile in order to increase the
+  // number of warps while keeping the n dimension of a tile reasonable, we have
+  // multiple warps that accumulate their partial sums of the same output
+  // location, which we must reduce over in the end. We do so in shared memory.
+  auto thread_block_reduce = [&]() {
+    constexpr int red_off = threads / b_sh_stride_threads / 2;
+    if (red_off >= 1) {
+      int red_idx = threadIdx.x / b_sh_stride_threads;
+      constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2;
+      constexpr int red_sh_delta = b_sh_stride_threads;
+      int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) +
+                      (threadIdx.x % b_sh_stride_threads);
+
+  // Parallel logarithmic shared memory reduction. We make sure to avoid any
+  // unnecessary read or write iterations, e.g., for two warps we write only
+  // once by warp 1 and read only once by warp 0.
+  #pragma unroll
+      for (int m_block = 0; m_block < thread_m_blocks; m_block++) {
+  #pragma unroll
+        for (int i = red_off; i > 0; i /= 2) {
+          if (i <= red_idx && red_idx < 2 * i) {
+  #pragma unroll
+            for (int j = 0; j < 4 * 2; j++) {
+              int red_sh_wr =
+                  red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
+              if (i < red_off) {
+                float* c_rd =
+                    reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
+                float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
+  #pragma unroll
+                for (int k = 0; k < 4; k++)
+                  reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
+                      c_rd[k] + c_wr[k];
+              }
+              sh[red_sh_wr] =
+                  reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
+            }
+          }
+          __syncthreads();
+        }
+        if (red_idx == 0) {
+  #pragma unroll
+          for (int i = 0; i < 4 * 2; i++) {
+            float* c_rd =
+                reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
+  #pragma unroll
+            for (int j = 0; j < 4; j++)
+              reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
+                  c_rd[j];
+          }
+        }
+        __syncthreads();
+      }
+    }
+  };
+
+  // Since multiple threadblocks may process parts of the same column slice, we
+  // finally have to globally reduce over the results. As the striped
+  // partitioning minimizes the number of such reductions and our outputs are
+  // usually rather small, we perform this reduction serially in L2 cache.
+  auto global_reduce = [&](bool first = false, bool last = false) {
+    // We are very careful here to reduce directly in the output buffer to
+    // maximize L2 cache utilization in this step. To do this, we write out
+    // results in FP16 (but still reduce with FP32 compute).
+    constexpr int active_threads = 32 * thread_n_blocks / 4;
+    if (threadIdx.x < active_threads) {
+      int c_gl_stride = prob_n / 8;
+      int c_gl_wr_delta_o = 2 * 4 * c_gl_stride;
+      int c_gl_wr_delta_i =
+          c_gl_stride;  // 8 threads (e.g., 0,4,8,12,16,20,24,28)
+      int c_gl_wr = 2 * c_gl_stride * (threadIdx.x % 4) +
+                    8 * (threadIdx.x / 32) + (threadIdx.x % 32) / 4;
+      c_gl_wr += (2 * thread_n_blocks) * slice_col;
+      constexpr int c_sh_wr_delta = active_threads;
+      int c_sh_wr = threadIdx.x;
+
+      int col = 2 * ((threadIdx.x % 32) % 4);
+
+      if (!first) {
+  // Interestingly, doing direct global accesses here really seems to mess up
+  // the compiler and lead to slowdowns, hence we also use async-copies even
+  // though these fetches are not actually asynchronous.
+  #pragma unroll
+        for (int i = 0; i < thread_m_blocks * 4; i++) {
+          cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i],
+                         &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
+                            c_gl_wr_delta_i * (i % 2)],
+                         i < (thread_m_blocks - 1) * 4 ||
+                             8 * (i / 2) + col + (i % 2) < prob_m);
+        }
+        cp_async_fence();
+        cp_async_wait<0>();
+      }
+
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks * 4; i++) {
+        if (i < (thread_m_blocks - 1) * 4 ||
+            8 * (i / 2) + col + (i % 2) < prob_m) {
+          if (!first) {
+            int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
+  #pragma unroll
+            for (int j2 = 0; j2 < 2; j2++) {
+  #pragma unroll
+              for (int j1 = 0; j1 < 4; j1++) {
+                reinterpret_cast<float*>(
+                    &frag_c)[4 * 2 * 4 * (i / 4) + 8 * j1 + 2 * j2 +
+                             4 * ((i % 4) / 2) + i % 2] +=
+                    __half2float(
+                        reinterpret_cast<__half*>(&c_red)[(j2 * 4 + j1)]);
+              }
+            }
+          }
+          if (!last) {
+            int4 c;
+  #pragma unroll
+            for (int j2 = 0; j2 < 2; j2++) {
+  #pragma unroll
+              for (int j1 = 0; j1 < 4; j1++) {
+                reinterpret_cast<__half*>(&c)[(j2 * 4 + j1)] =
+                    __float2half(reinterpret_cast<float*>(
+                        &frag_c)[4 * 2 * 4 * (i / 4) + 8 * j1 + 2 * j2 +
+                                 4 * ((i % 4) / 2) + i % 2]);
+              }
+            }
+            C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] =
+                c;
+          }
+        }
+      }
+    }
+  };
+
+  // Write out the reduced final result in the correct layout. We only actually
+  // reshuffle matrix fragments in this step, the reduction above is performed
+  // in fragment layout.
+  auto write_result = [&]() {
+    int c_gl_stride = prob_n / 8;
+
+    constexpr int c_sh_stride = 2 * thread_n_blocks;              // RLC:
+    constexpr int c_sh_stride_2 = 2 * c_sh_stride + 2;            // RLC:
+    constexpr int c_sh_stride_3 = 2 * (2 * thread_n_blocks) + 2;  // RLC:
+
+    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
+
+    int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * thread_n_blocks));
+    c_gl_wr += (2 * thread_n_blocks) * slice_col;
+
+    int c_sh_wr = c_sh_stride_2 * ((threadIdx.x % 32) % 4) +
+                  ((threadIdx.x % 32) / 4);  // RLC:
+    c_sh_wr += 8 * (threadIdx.x / 32);       // 128/4(half4)
+
+    constexpr int c_sh_rd_delta =
+        c_sh_stride_3 * (threads / (2 * 2 * thread_n_blocks));  // RLC:
+    int c_sh_rd = c_sh_stride_3 * (threadIdx.x / (2 * 2 * thread_n_blocks)) +
+                  (threadIdx.x % (2 * 2 * thread_n_blocks));
+
+    int c_gl_wr_end = c_gl_stride * prob_m;
+
+    auto write = [&](int idx, float c0, float c1, float c2, float c3, FragS& s0,
+                     float c4, float c5, float c6, float c7, FragS& s1) {
+      uint2 res[2];
+      res[0] = to_half4(c0, c1, c2, c3);
+      res[1] = to_half4(c4, c5, c6, c7);
+      half2* tmp = (half2*)&res;
+      // for per-column quantization we finally apply the scale here
+      if constexpr (group_blocks == -1 && num_bits == 4) {
+        tmp[0] = __hmul2(tmp[0], s0[0]);
+        tmp[1] = __hmul2(tmp[1], s0[1]);
+        tmp[2] = __hmul2(tmp[2], s1[0]);
+        tmp[3] = __hmul2(tmp[3], s1[1]);
+      }
+      ((int4*)sh)[idx] = *((int4*)&res[0]);
+    };
+
+    // RLC:  only warp 0 and 1 baseline example
+    if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+      for (int i = 0; i < thread_m_blocks; i++) {
+        int wr = c_sh_wr;
+        write(wr, frag_c[i][0][0][0], frag_c[i][1][0][0], frag_c[i][2][0][0],
+              frag_c[i][3][0][0], frag_s[0][0], frag_c[i][0][0][2],
+              frag_c[i][1][0][2], frag_c[i][2][0][2], frag_c[i][3][0][2],
+              frag_s[0][2]);
+        write(wr + c_sh_stride, frag_c[i][0][0][1], frag_c[i][1][0][1],
+              frag_c[i][2][0][1], frag_c[i][3][0][1], frag_s[0][0],
+              frag_c[i][0][0][3], frag_c[i][1][0][3], frag_c[i][2][0][3],
+              frag_c[i][3][0][3], frag_s[0][2]);
+        write(wr + 4 * c_sh_stride_2, frag_c[i][0][1][0], frag_c[i][1][1][0],
+              frag_c[i][2][1][0], frag_c[i][3][1][0], frag_s[0][0],
+              frag_c[i][0][1][2], frag_c[i][1][1][2], frag_c[i][2][1][2],
+              frag_c[i][3][1][2], frag_s[0][2]);
+        write(wr + 4 * c_sh_stride_2 + c_sh_stride, frag_c[i][0][1][1],
+              frag_c[i][1][1][1], frag_c[i][2][1][1], frag_c[i][3][1][1],
+              frag_s[0][0], frag_c[i][0][1][3], frag_c[i][1][1][3],
+              frag_c[i][2][1][3], frag_c[i][3][1][3], frag_s[0][2]);
+
+        c_sh_wr += 8 * c_sh_stride_2;
+      }
+    }
+    __syncthreads();
+
+  #pragma unroll
+    for (int i = 0;
+         i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
+         i++) {
+      if (c_gl_wr < c_gl_wr_end) {
+        C[c_gl_wr] = sh[c_sh_rd];
+        c_gl_wr += c_gl_wr_delta;
+        c_sh_rd += c_sh_rd_delta;
+      }
+    }
+  };
+
+  // Start global fetch and register load pipelines.
+  auto start_pipes = [&]() {
+  #pragma unroll
+    for (int i = 0; i < stages - 1; i++) fetch_to_shared(i, i, i < slice_iters);
+    zero_accums();
+    wait_for_stage();
+    fetch_to_registers(0, 0);
+    a_gl_rd += a_gl_rd_delta_o * (stages - 1);
+  };
+  start_pipes();
+
+  // Main loop.
+  while (slice_iters) {
+  // We unroll over both the global fetch and the register load pipeline to
+  // ensure all shared memory accesses are static. Note that both pipelines have
+  // even length meaning that the next iteration will always start at index 0.
+  #pragma unroll
+    for (int pipe = 0; pipe < stages;) {
+      fetch_to_shared((pipe + stages - 1) % stages, pipe,
+                      slice_iters >= stages);
+      matmul(pipe);
+      wait_for_stage();
+
+      fetch_to_registers(pipe + 1, (pipe + 1) % stages);
+
+      pipe++;
+      slice_iters--;
+      if (slice_iters == 0) break;
+    }
+    a_gl_rd += a_gl_rd_delta_o * stages;
+
+    // Process results and, if necessary, proceed to the next column slice.
+    // While this pattern may not be the most readable, other ways of writing
+    // the loop seemed to give noticeably worse performance after compilation.
+    if (slice_iters == 0) {
+      cp_async_wait<0>();
+      bool last = slice_idx == slice_count - 1;
+      // For per-column scales, we only fetch them here in the final step before
+      // write-out
+      if constexpr (group_blocks == -1) {
+        if constexpr (num_bits == 8) {
+          if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]);
+          cp_async_fence();
+        } else {
+          if (last) {
+            if (s_sh_wr_pred) cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]);
+            cp_async_fence();
+          }
+        }
+      }
+      thread_block_reduce();
+
+      if constexpr (group_blocks == -1) {
+        if constexpr (num_bits == 8) {
+          cp_async_wait<0>();
+          __syncthreads();
+          if (threadIdx.x / 32 < thread_n_blocks / 4) {
+            *(float4*)(frag_s) = *(float4*)(&sh_s[s_sh_rd]);
+          }
+        } else {
+          if (last) {
+            cp_async_wait<0>();
+            __syncthreads();
+            if (threadIdx.x / 32 < thread_n_blocks / 4) {
+              *(float4*)(frag_s) = *(float4*)(&sh_s[s_sh_rd]);
+            }
+          }
+        }
+      }
+
+      // For 8-bit channelwise, we apply the scale before the global reduction
+      // that converts the fp32 results to fp16 (so that we avoid possible
+      // overflow in fp16)
+      if constexpr (group_blocks == -1 && num_bits == 8) {
+        if (threadIdx.x / 32 < thread_n_blocks / 4) {
+  #pragma unroll
+          for (int i = 0; i < thread_m_blocks; i++) {
+            scale_floats(&frag_c[i][0][0][0], &frag_c[i][1][0][0],
+                         &frag_c[i][2][0][0], &frag_c[i][3][0][0], frag_s[0][0],
+                         &frag_c[i][0][0][2], &frag_c[i][1][0][2],
+                         &frag_c[i][2][0][2], &frag_c[i][3][0][2],
+                         frag_s[0][2]);
+
+            scale_floats(&frag_c[i][0][0][1], &frag_c[i][1][0][1],
+                         &frag_c[i][2][0][1], &frag_c[i][3][0][1], frag_s[0][0],
+                         &frag_c[i][0][0][3], &frag_c[i][1][0][3],
+                         &frag_c[i][2][0][3], &frag_c[i][3][0][3],
+                         frag_s[0][2]);
+
+            scale_floats(&frag_c[i][0][1][0], &frag_c[i][1][1][0],
+                         &frag_c[i][2][1][0], &frag_c[i][3][1][0], frag_s[0][0],
+                         &frag_c[i][0][1][2], &frag_c[i][1][1][2],
+                         &frag_c[i][2][1][2], &frag_c[i][3][1][2],
+                         frag_s[0][2]);
+
+            scale_floats(&frag_c[i][0][1][1], &frag_c[i][1][1][1],
+                         &frag_c[i][2][1][1], &frag_c[i][3][1][1], frag_s[0][0],
+                         &frag_c[i][0][1][3], &frag_c[i][1][1][3],
+                         &frag_c[i][2][1][3], &frag_c[i][3][1][3],
+                         frag_s[0][2]);
+          }
+        }
+      }
+
+      if (slice_count > 1) {  // only globally reduce if there is more than one
+                              // block in a slice
+        barrier_acquire(&locks[slice_col], slice_idx);
+        global_reduce(slice_idx == 0, last);
+        barrier_release(&locks[slice_col], last);
+      }
+      if (last)  // only the last block in a slice actually writes the result
+        write_result();
+
+      slice_row = 0;
+      slice_col_par++;
+      slice_col++;
+      init_slice();
+      if (slice_iters) {
+        a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) +
+                  (threadIdx.x % a_gl_rd_delta_o);
+  #pragma unroll
+        for (int i = 0; i < b_sh_wr_iters; i++)
+          B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles;
+  #pragma unroll
+        for (int i = 0; i < m_sh_iters; i++)
+          meta_ptr[i] += (m_sh_stride)-m_gl_rd_delta_o * k_tiles;
+        if (slice_col == 0) {
+  #pragma unroll
+          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
+  #pragma unroll
+          for (int i = 0; i < m_sh_iters; i++) meta_ptr[i] -= m_gl_stride;
+        }
+        s_gl_rd = s_sh_stride * slice_col + threadIdx.x;
+        start_pipes();
+      }
+    }
+  }
+}
+
+#endif
+// One `else if` dispatch arm: configure smem and launch this Marlin_24 variant.
+#define CALL_IF_2_4(NUM_BITS, THREAD_M_BLOCKS, THREAD_N_BLOCKS,               \
+                    THREAD_K_BLOCKS, GROUP_BLOCKS)                            \
+  else if (num_bits == NUM_BITS && thread_m_blocks == THREAD_M_BLOCKS &&      \
+           thread_n_blocks == THREAD_N_BLOCKS &&                              \
+           thread_k_blocks == THREAD_K_BLOCKS &&                              \
+           group_blocks == GROUP_BLOCKS) {                                    \
+    cudaFuncSetAttribute(                                                     \
+        Marlin_24<NUM_BITS, THREADS, THREAD_N_BLOCKS, THREAD_M_BLOCKS,        \
+                  THREAD_K_BLOCKS, STAGES, GROUP_BLOCKS>,                     \
+        cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem);         \
+    Marlin_24<NUM_BITS, THREADS, THREAD_N_BLOCKS, THREAD_M_BLOCKS,            \
+              THREAD_K_BLOCKS, STAGES, GROUP_BLOCKS>                          \
+        <<<blocks, THREADS, max_shared_mem, stream>>>(A_ptr, B_ptr, meta_ptr, \
+                                                      C_ptr, s_ptr, prob_n,   \
+                                                      prob_m, prob_k, locks); \
+  }
+// Host-side launcher: picks tile sizes and runs Marlin_24 per prob_n chunk.
+void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C,
+                     void* s, int prob_m, int prob_n, int prob_k,
+                     void* workspace, int num_bits, int groupsize = -1,
+                     int dev = 0, cudaStream_t stream = 0, int thread_k = -1,
+                     int thread_m = -1, int sms = -1, int max_par = 16) {
+  int tot_n = prob_n;
+  int tot_n_blocks = ceildiv(tot_n, 16);
+  int pad = 16 * tot_n_blocks - tot_n;
+
+  if (sms == -1) {  // -1: use the multiprocessor count of device `dev`
+    cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev);
+  }
+  TORCH_CHECK(sms > 0);
+
+  int max_shared_mem = 0;
+  cudaDeviceGetAttribute(&max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  TORCH_CHECK(max_shared_mem > 0);
+
+  if (thread_k == -1 || thread_m == -1) {  // no explicit tiling: derive it
+    if (prob_n <= 16) {
+      // For small batch sizes, better partitioning is slightly more important
+      // than better compute utilization
+      thread_k = 128;
+      thread_m = 128;
+    } else if (prob_n <= 256) {
+      thread_k = 64;
+      thread_m = 256;
+    } else {
+      thread_k = 32;
+      thread_m = 512;
+    }
+  }
+
+  int thread_k_blocks = thread_k / 32;  // 2:4 version with m16n8k32 instruction
+  int thread_m_blocks = thread_m / 16;
+  int group_blocks = (groupsize == -1) ? -1 : groupsize / 16;
+  int blocks = sms;  // grid size used by the CALL_IF_2_4 launches
+
+  TORCH_CHECK(prob_m % thread_m == 0, "prob_m = ", prob_m,
+              " is not divisible by thread_m = ", thread_m);
+  TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
+              " is not divisible by thread_k = ", thread_k);
+  if (group_blocks != -1) {
+    TORCH_CHECK((prob_k / 2) % group_blocks == 0, "prob_k/2 = ", prob_k / 2,
+                " is not divisible by group_blocks = ", group_blocks);
+  }
+
+  TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
+              ", ", prob_n, ", ", prob_k, "]");
+
+  const int4* A_ptr = (const int4*)A;
+  const int4* B_ptr = (const int4*)B;
+  const int4* meta_ptr = (const int4*)meta;
+  int4* C_ptr = (int4*)C;
+  const int4* s_ptr = (const int4*)s;
+
+  constexpr int max_m_blocks = 4;
+
+  int* locks = (int*)workspace;  // workspace is handed to the kernel as `locks`
+  for (int i = 0; i < tot_n_blocks; i += max_m_blocks) {
+    int thread_n_blocks = tot_n_blocks - i;
+    prob_n = tot_n - 16 * i;
+    int par = 1;
+    if (thread_n_blocks > max_m_blocks) {
+      // Note that parallel > 1 currently only works for inputs without any
+      // padding
+      par = (16 * thread_n_blocks - pad) / (max_m_blocks * 16);
+      if (par > max_par) par = max_par;
+      prob_n = (max_m_blocks * 16) * par;
+      i += max_m_blocks * (par - 1);
+      thread_n_blocks = max_m_blocks;
+    }
+
+    // For compilation speed, we only define the kernel configurations that have
+    // seemed useful (in terms of performance) in our testing, however many more
+    // are, in principle, possible.
+
+    // `if (false) {}` opens the else-if chain formed by the CALL_IF_2_4 arms
+    if (false) {
+    }  //         BMxBNxBK,   group
+    // 4-bit
+    CALL_IF_2_4(4, 8, 1, 4, -1)  // e.g., 16x128x128
+    CALL_IF_2_4(4, 8, 1, 4, 4)   // e.g., 16x128x128, 64
+
+    CALL_IF_2_4(4, 16, 1, 2, -1)  // e.g., 16x256x64
+    CALL_IF_2_4(4, 16, 1, 2, 4)   // e.g., 16x256x64,  64
+    CALL_IF_2_4(4, 16, 2, 2, -1)  // e.g., 32x256x64
+    CALL_IF_2_4(4, 16, 2, 2, 4)
+    CALL_IF_2_4(4, 16, 3, 2, -1)
+    CALL_IF_2_4(4, 16, 3, 2, 4)
+    CALL_IF_2_4(4, 16, 4, 2, -1)
+    CALL_IF_2_4(4, 16, 4, 2, 4)
+
+    CALL_IF_2_4(4, 32, 1, 1, -1)  // e.g., 16x256x64
+    CALL_IF_2_4(4, 32, 1, 1, 4)   // e.g., 16x256x64,  64
+    CALL_IF_2_4(4, 32, 2, 1, -1)  // e.g., 32x256x64
+    CALL_IF_2_4(4, 32, 2, 1, 4)
+    CALL_IF_2_4(4, 32, 3, 1, -1)
+    CALL_IF_2_4(4, 32, 3, 1, 4)
+    CALL_IF_2_4(4, 32, 4, 1, -1)
+    CALL_IF_2_4(4, 32, 4, 1, 4)
+
+    // 8-bit
+    CALL_IF_2_4(8, 8, 1, 4, -1)  // e.g., 16x128x128
+    CALL_IF_2_4(8, 8, 1, 4, 4)   // e.g., 16x128x128, 64
+
+    CALL_IF_2_4(8, 16, 1, 2, -1)  // e.g., 16x256x64
+    CALL_IF_2_4(8, 16, 1, 2, 4)   // e.g., 16x256x64,  64
+    CALL_IF_2_4(8, 16, 2, 2, -1)  // e.g., 32x256x64
+    CALL_IF_2_4(8, 16, 2, 2, 4)
+    CALL_IF_2_4(8, 16, 3, 2, -1)
+    CALL_IF_2_4(8, 16, 3, 2, 4)
+    CALL_IF_2_4(8, 16, 4, 2, -1)
+    CALL_IF_2_4(8, 16, 4, 2, 4)
+
+    CALL_IF_2_4(8, 32, 1, 1, -1)  // e.g., 16x256x64
+    CALL_IF_2_4(8, 32, 1, 1, 4)   // e.g., 16x256x64,  64
+    CALL_IF_2_4(8, 32, 2, 1, -1)  // e.g., 32x256x64
+    CALL_IF_2_4(8, 32, 2, 1, 4)
+    CALL_IF_2_4(8, 32, 3, 1, -1)
+    CALL_IF_2_4(8, 32, 3, 1, 4)
+    CALL_IF_2_4(8, 32, 4, 1, -1)
+    CALL_IF_2_4(8, 32, 4, 1, 4)
+    else {
+      throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) +
+                               ", " + str(prob_k) + ", " + str(prob_n) + "]" +
+                               ", groupsize = " + str(groupsize) +
+                               ", thread_m_blocks = " + str(thread_m_blocks) +
+                               ", thread_n_blocks = " + str(thread_n_blocks) +
+                               ", thread_k_blocks = " + str(thread_k_blocks));
+    }
+
+    A_ptr += 16 * thread_n_blocks * (prob_k / 8) * par;
+    C_ptr += 16 * thread_n_blocks * (prob_m / 8) * par;
+  }
+}
+
+}  // namespace marlin_24
+// Torch entry point: validates shapes/devices, allocates C, runs the kernel.
+torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
+                                  torch::Tensor& b_meta,
+                                  torch::Tensor& b_scales,
+                                  torch::Tensor& workspace, int64_t num_bits,
+                                  int64_t size_m, int64_t size_n,
+                                  int64_t size_k) {
+  // Verify num_bits
+  TORCH_CHECK(num_bits == 4 || num_bits == 8,
+              "num_bits must be 4 or 8. Got = ", num_bits);
+  int pack_factor = 32 / num_bits;
+
+  // Verify M
+  TORCH_CHECK(size_m == a.size(0),
+              "Shape mismatch: a.size(0) = " + str(a.size(0)) +
+                  ", size_m = " + str(size_m));
+
+  // Verify K
+  TORCH_CHECK(size_k == a.size(1),
+              "Shape mismatch: a.size(1) = " + str(a.size(1)) +
+                  ", size_k = " + str(size_k));
+  TORCH_CHECK(size_k % marlin_24::tile_size == 0,
+              "size_k = " + str(size_k) + " is not divisible by tile_size = " +
+                  str(marlin_24::tile_size));
+  TORCH_CHECK((size_k / marlin_24::tile_size / 2) == b_q_weight.size(0),
+              "Shape mismatch: b_q_weight.size(0) = " +
+                  str(b_q_weight.size(0)) + ", size_k = " + str(size_k) +
+                  ", tile_size = " + str(marlin_24::tile_size));
+
+  // Verify N
+  TORCH_CHECK(b_scales.size(1) == size_n,
+              "b_scales.size(1) = " + str(b_scales.size(1)) +
+                  ", size_n = " + str(size_n));
+  TORCH_CHECK(
+      b_q_weight.size(1) % marlin_24::tile_size == 0,
+      "b_q_weight.size(1) = " + str(b_q_weight.size(1)) +
+          " is not divisible by tile_size = " + str(marlin_24::tile_size));
+
+  int actual_size_n = (b_q_weight.size(1) / marlin_24::tile_size) * pack_factor;
+  TORCH_CHECK(
+      size_n == actual_size_n,
+      "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n));
+
+  // Verify meta
+  TORCH_CHECK(b_meta.size(0) == size_k / 8 / 2 / 2,
+              "b_meta.size(0) = ", b_meta.size(0),
+              " is not size_k / 8 / 2 / 2 = ", size_k / 8 / 2 / 2);
+  TORCH_CHECK(b_meta.size(1) == size_n * 2, "b_meta.size(1) = ", b_meta.size(1),
+              " is not size_n * 2 = ", size_n * 2);
+
+  // Verify A device and strides
+  TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
+  TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
+
+  // Verify B device and strides
+  TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
+  TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
+
+  // Verify b_meta device and strides
+  TORCH_CHECK(b_meta.device().is_cuda(), "b_meta is not on GPU");
+  TORCH_CHECK(b_meta.is_contiguous(), "b_meta is not contiguous");
+
+  // Verify scales device and strides
+  TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
+  TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
+
+  // Allocate the output C on a's device with a's dtype
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
+  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
+  torch::Tensor c = torch::empty({size_m, size_n}, options);
+
+  int thread_k = -1;  // -1 lets marlin_cuda_2_4 choose thread_k / thread_m
+  int thread_m = -1;
+  int sms = -1;  // -1 lets marlin_cuda_2_4 query the device's SM count
+  int max_par = marlin_24::max_par;
+
+  int groupsize = -1;  // stays -1 when b_scales has a single row
+  if (b_scales.size(0) > 1) {
+    TORCH_CHECK(size_k % b_scales.size(0) == 0,
+                "size_k = " + str(size_k) +
+                    ", is not divisible by b_scales.size(0) = " +
+                    str(b_scales.size(0)));
+    groupsize = size_k / b_scales.size(0);
+    groupsize /= 2;  // Because of the 2:4 format (the "24" in marlin_24)
+  }
+
+  // Verify groupsize
+  TORCH_CHECK(groupsize == -1 || groupsize == 64,
+              "Unexpected groupsize = " + str(groupsize));
+
+  // Verify workspace size
+  TORCH_CHECK(size_n % marlin_24::min_thread_n == 0,
+              "size_n = " + str(size_n) +
+                  ", is not divisible by min_thread_n = " +
+                  str(marlin_24::min_thread_n));
+  int min_workspace_size =
+      (size_n / marlin_24::min_thread_n) * marlin_24::max_par;
+  TORCH_CHECK(workspace.numel() >= min_workspace_size,
+              "workspace.numel = " + str(workspace.numel()) +
+                  " is below min_workspace_size = " + str(min_workspace_size));
+
+  // Note the argument order below: size_n is passed as prob_m, size_m as prob_n
+  int dev = a.get_device();
+  marlin_24::marlin_cuda_2_4(
+      a.data_ptr(), b_q_weight.data_ptr(), b_meta.data_ptr(), c.data_ptr(),
+      b_scales.data_ptr(), size_n, size_m, size_k, workspace.data_ptr(),
+      num_bits, groupsize, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
+      thread_m, sms, max_par);
+
+  return c;
+}
diff --git a/server/marlin/setup.py b/server/marlin/setup.py
new file mode 100644
index 00000000..aed84e9e
--- /dev/null
+++ b/server/marlin/setup.py
@@ -0,0 +1,22 @@
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+extra_compile_args = []  # no extra compiler flags; forwarded to CUDAExtension
+
+setup(
+    name="marlin_kernels",
+    ext_modules=[
+        CUDAExtension(
+            name="marlin_kernels",
+            sources=[  # all sources are compiled into the one extension module
+                "marlin_kernels/gptq_marlin.cu",
+                "marlin_kernels/gptq_marlin_repack.cu",
+                "marlin_kernels/marlin_cuda_kernel.cu",
+                "marlin_kernels/sparse/marlin_24_cuda_kernel.cu",
+                "marlin_kernels/ext.cpp",
+            ],
+            extra_compile_args=extra_compile_args,
+        ),
+    ],
+    cmdclass={"build_ext": BuildExtension},  # build via torch's BuildExtension
+)
diff --git a/server/poetry.lock b/server/poetry.lock
index fd4d427d..4984978a 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -1,140 +1,130 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
 [[package]]
 name = "accelerate"
-version = "0.20.3"
+version = "0.29.3"
 description = "Accelerate"
-optional = false
-python-versions = ">=3.7.0"
+optional = true
+python-versions = ">=3.8.0"
 files = [
-    {file = "accelerate-0.20.3-py3-none-any.whl", hash = "sha256:147183e7a2215f7bd45a7af3b986a963daa8a61fa58b0912b9473049e011ad15"},
-    {file = "accelerate-0.20.3.tar.gz", hash = "sha256:79a896978c20dac270083d42bf033f4c9a80dcdd6b946f1ca92d8d6d0f0f5ba9"},
+    {file = "accelerate-0.29.3-py3-none-any.whl", hash = "sha256:99d633d4b6126817c5e554487406748be95c8d1d1e659dd2fd60657e35f532dd"},
+    {file = "accelerate-0.29.3.tar.gz", hash = "sha256:1a5a845b06b24b41736b219b2b20fd021ca5dff4070a252445fd6de736e347ac"},
 ]
 
 [package.dependencies]
+huggingface-hub = "*"
 numpy = ">=1.17"
 packaging = ">=20.0"
 psutil = "*"
 pyyaml = "*"
-torch = ">=1.6.0"
+safetensors = ">=0.3.1"
+torch = ">=1.10.0"
 
 [package.extras]
-dev = ["black (>=23.1,<24.0)", "datasets", "deepspeed", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.0.241)", "scikit-learn", "scipy", "tqdm", "transformers", "urllib3 (<2.0.0)"]
-quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.0.241)", "urllib3 (<2.0.0)"]
+dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "deepspeed", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.2.1,<0.3.0)", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
+quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.2.1,<0.3.0)"]
 rich = ["rich"]
 sagemaker = ["sagemaker"]
-test-dev = ["datasets", "deepspeed", "evaluate", "scikit-learn", "scipy", "tqdm", "transformers"]
-test-prod = ["parameterized", "pytest", "pytest-subtests", "pytest-xdist"]
-test-trackers = ["comet-ml", "tensorboard", "wandb"]
-testing = ["datasets", "deepspeed", "evaluate", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "tqdm", "transformers"]
+test-dev = ["bitsandbytes", "datasets", "deepspeed", "evaluate", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
+test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist"]
+test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"]
+testing = ["bitsandbytes", "datasets", "deepspeed", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
 
 [[package]]
 name = "aiohttp"
-version = "3.8.5"
+version = "3.9.5"
 description = "Async http client/server framework (asyncio)"
 optional = true
-python-versions = ">=3.6"
+python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"},
-    {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"},
-    {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"},
-    {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"},
-    {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"},
-    {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"},
-    {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"},
-    {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"},
-    {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"},
-    {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"},
-    {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"},
-    {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"},
-    {file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"},
-    {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"},
-    {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"},
-    {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"},
-    {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"},
-    {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"},
-    {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"},
-    {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"},
-    {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"},
-    {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"},
-    {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"},
-    {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"},
-    {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"},
-    {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"},
-    {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"},
-    {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"},
-    {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"},
-    {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"},
-    {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"},
+    {file = "aiohttp-3.9.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:fcde4c397f673fdec23e6b05ebf8d4751314fa7c24f93334bf1f1364c1c69ac7"},
+    {file = "aiohttp-3.9.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d6b3f1fabe465e819aed2c421a6743d8debbde79b6a8600739300630a01bf2c"},
+    {file = "aiohttp-3.9.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6ae79c1bc12c34082d92bf9422764f799aee4746fd7a392db46b7fd357d4a17a"},
+    {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d3ebb9e1316ec74277d19c5f482f98cc65a73ccd5430540d6d11682cd857430"},
+    {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84dabd95154f43a2ea80deffec9cb44d2e301e38a0c9d331cc4aa0166fe28ae3"},
+    {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c8a02fbeca6f63cb1f0475c799679057fc9268b77075ab7cf3f1c600e81dd46b"},
+    {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c26959ca7b75ff768e2776d8055bf9582a6267e24556bb7f7bd29e677932be72"},
+    {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:714d4e5231fed4ba2762ed489b4aec07b2b9953cf4ee31e9871caac895a839c0"},
+    {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e7a6a8354f1b62e15d48e04350f13e726fa08b62c3d7b8401c0a1314f02e3558"},
+    {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c413016880e03e69d166efb5a1a95d40f83d5a3a648d16486592c49ffb76d0db"},
+    {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ff84aeb864e0fac81f676be9f4685f0527b660f1efdc40dcede3c251ef1e867f"},
+    {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ad7f2919d7dac062f24d6f5fe95d401597fbb015a25771f85e692d043c9d7832"},
+    {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:702e2c7c187c1a498a4e2b03155d52658fdd6fda882d3d7fbb891a5cf108bb10"},
+    {file = "aiohttp-3.9.5-cp310-cp310-win32.whl", hash = "sha256:67c3119f5ddc7261d47163ed86d760ddf0e625cd6246b4ed852e82159617b5fb"},
+    {file = "aiohttp-3.9.5-cp310-cp310-win_amd64.whl", hash = "sha256:471f0ef53ccedec9995287f02caf0c068732f026455f07db3f01a46e49d76bbb"},
+    {file = "aiohttp-3.9.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e0ae53e33ee7476dd3d1132f932eeb39bf6125083820049d06edcdca4381f342"},
+    {file = "aiohttp-3.9.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c088c4d70d21f8ca5c0b8b5403fe84a7bc8e024161febdd4ef04575ef35d474d"},
+    {file = "aiohttp-3.9.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:639d0042b7670222f33b0028de6b4e2fad6451462ce7df2af8aee37dcac55424"},
+    {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f26383adb94da5e7fb388d441bf09c61e5e35f455a3217bfd790c6b6bc64b2ee"},
+    {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:66331d00fb28dc90aa606d9a54304af76b335ae204d1836f65797d6fe27f1ca2"},
+    {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ff550491f5492ab5ed3533e76b8567f4b37bd2995e780a1f46bca2024223233"},
+    {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f22eb3a6c1080d862befa0a89c380b4dafce29dc6cd56083f630073d102eb595"},
+    {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a81b1143d42b66ffc40a441379387076243ef7b51019204fd3ec36b9f69e77d6"},
+    {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f64fd07515dad67f24b6ea4a66ae2876c01031de91c93075b8093f07c0a2d93d"},
+    {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:93e22add827447d2e26d67c9ac0161756007f152fdc5210277d00a85f6c92323"},
+    {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:55b39c8684a46e56ef8c8d24faf02de4a2b2ac60d26cee93bc595651ff545de9"},
+    {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4715a9b778f4293b9f8ae7a0a7cef9829f02ff8d6277a39d7f40565c737d3771"},
+    {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:afc52b8d969eff14e069a710057d15ab9ac17cd4b6753042c407dcea0e40bf75"},
+    {file = "aiohttp-3.9.5-cp311-cp311-win32.whl", hash = "sha256:b3df71da99c98534be076196791adca8819761f0bf6e08e07fd7da25127150d6"},
+    {file = "aiohttp-3.9.5-cp311-cp311-win_amd64.whl", hash = "sha256:88e311d98cc0bf45b62fc46c66753a83445f5ab20038bcc1b8a1cc05666f428a"},
+    {file = "aiohttp-3.9.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c7a4b7a6cf5b6eb11e109a9755fd4fda7d57395f8c575e166d363b9fc3ec4678"},
+    {file = "aiohttp-3.9.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0a158704edf0abcac8ac371fbb54044f3270bdbc93e254a82b6c82be1ef08f3c"},
+    {file = "aiohttp-3.9.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d153f652a687a8e95ad367a86a61e8d53d528b0530ef382ec5aaf533140ed00f"},
+    {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82a6a97d9771cb48ae16979c3a3a9a18b600a8505b1115cfe354dfb2054468b4"},
+    {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60cdbd56f4cad9f69c35eaac0fbbdf1f77b0ff9456cebd4902f3dd1cf096464c"},
+    {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8676e8fd73141ded15ea586de0b7cda1542960a7b9ad89b2b06428e97125d4fa"},
+    {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da00da442a0e31f1c69d26d224e1efd3a1ca5bcbf210978a2ca7426dfcae9f58"},
+    {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18f634d540dd099c262e9f887c8bbacc959847cfe5da7a0e2e1cf3f14dbf2daf"},
+    {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:320e8618eda64e19d11bdb3bd04ccc0a816c17eaecb7e4945d01deee2a22f95f"},
+    {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:2faa61a904b83142747fc6a6d7ad8fccff898c849123030f8e75d5d967fd4a81"},
+    {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:8c64a6dc3fe5db7b1b4d2b5cb84c4f677768bdc340611eca673afb7cf416ef5a"},
+    {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:393c7aba2b55559ef7ab791c94b44f7482a07bf7640d17b341b79081f5e5cd1a"},
+    {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c671dc117c2c21a1ca10c116cfcd6e3e44da7fcde37bf83b2be485ab377b25da"},
+    {file = "aiohttp-3.9.5-cp312-cp312-win32.whl", hash = "sha256:5a7ee16aab26e76add4afc45e8f8206c95d1d75540f1039b84a03c3b3800dd59"},
+    {file = "aiohttp-3.9.5-cp312-cp312-win_amd64.whl", hash = "sha256:5ca51eadbd67045396bc92a4345d1790b7301c14d1848feaac1d6a6c9289e888"},
+    {file = "aiohttp-3.9.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:694d828b5c41255e54bc2dddb51a9f5150b4eefa9886e38b52605a05d96566e8"},
+    {file = "aiohttp-3.9.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0605cc2c0088fcaae79f01c913a38611ad09ba68ff482402d3410bf59039bfb8"},
+    {file = "aiohttp-3.9.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4558e5012ee03d2638c681e156461d37b7a113fe13970d438d95d10173d25f78"},
+    {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dbc053ac75ccc63dc3a3cc547b98c7258ec35a215a92bd9f983e0aac95d3d5b"},
+    {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4109adee842b90671f1b689901b948f347325045c15f46b39797ae1bf17019de"},
+    {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6ea1a5b409a85477fd8e5ee6ad8f0e40bf2844c270955e09360418cfd09abac"},
+    {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3c2890ca8c59ee683fd09adf32321a40fe1cf164e3387799efb2acebf090c11"},
+    {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3916c8692dbd9d55c523374a3b8213e628424d19116ac4308e434dbf6d95bbdd"},
+    {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d1964eb7617907c792ca00b341b5ec3e01ae8c280825deadbbd678447b127e1"},
+    {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d5ab8e1f6bee051a4bf6195e38a5c13e5e161cb7bad83d8854524798bd9fcd6e"},
+    {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:52c27110f3862a1afbcb2af4281fc9fdc40327fa286c4625dfee247c3ba90156"},
+    {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:7f64cbd44443e80094309875d4f9c71d0401e966d191c3d469cde4642bc2e031"},
+    {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8b4f72fbb66279624bfe83fd5eb6aea0022dad8eec62b71e7bf63ee1caadeafe"},
+    {file = "aiohttp-3.9.5-cp38-cp38-win32.whl", hash = "sha256:6380c039ec52866c06d69b5c7aad5478b24ed11696f0e72f6b807cfb261453da"},
+    {file = "aiohttp-3.9.5-cp38-cp38-win_amd64.whl", hash = "sha256:da22dab31d7180f8c3ac7c7635f3bcd53808f374f6aa333fe0b0b9e14b01f91a"},
+    {file = "aiohttp-3.9.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1732102949ff6087589408d76cd6dea656b93c896b011ecafff418c9661dc4ed"},
+    {file = "aiohttp-3.9.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c6021d296318cb6f9414b48e6a439a7f5d1f665464da507e8ff640848ee2a58a"},
+    {file = "aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:239f975589a944eeb1bad26b8b140a59a3a320067fb3cd10b75c3092405a1372"},
+    {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b7b30258348082826d274504fbc7c849959f1989d86c29bc355107accec6cfb"},
+    {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2adf5c87ff6d8b277814a28a535b59e20bfea40a101db6b3bdca7e9926bc24"},
+    {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9a3d838441bebcf5cf442700e3963f58b5c33f015341f9ea86dcd7d503c07e2"},
+    {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e3a1ae66e3d0c17cf65c08968a5ee3180c5a95920ec2731f53343fac9bad106"},
+    {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c69e77370cce2d6df5d12b4e12bdcca60c47ba13d1cbbc8645dd005a20b738b"},
+    {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0cbf56238f4bbf49dab8c2dc2e6b1b68502b1e88d335bea59b3f5b9f4c001475"},
+    {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d1469f228cd9ffddd396d9948b8c9cd8022b6d1bf1e40c6f25b0fb90b4f893ed"},
+    {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:45731330e754f5811c314901cebdf19dd776a44b31927fa4b4dbecab9e457b0c"},
+    {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:3fcb4046d2904378e3aeea1df51f697b0467f2aac55d232c87ba162709478c46"},
+    {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8cf142aa6c1a751fcb364158fd710b8a9be874b81889c2bd13aa8893197455e2"},
+    {file = "aiohttp-3.9.5-cp39-cp39-win32.whl", hash = "sha256:7b179eea70833c8dee51ec42f3b4097bd6370892fa93f510f76762105568cf09"},
+    {file = "aiohttp-3.9.5-cp39-cp39-win_amd64.whl", hash = "sha256:38d80498e2e169bc61418ff36170e0aad0cd268da8b38a17c4cf29d254a8b3f1"},
+    {file = "aiohttp-3.9.5.tar.gz", hash = "sha256:edea7d15772ceeb29db4aff55e482d4bcfb6ae160ce144f2682de02f6d693551"},
 ]
 
 [package.dependencies]
 aiosignal = ">=1.1.2"
-async-timeout = ">=4.0.0a3,<5.0"
+async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
 attrs = ">=17.3.0"
-charset-normalizer = ">=2.0,<4.0"
 frozenlist = ">=1.1.1"
 multidict = ">=4.5,<7.0"
 yarl = ">=1.0,<2.0"
 
 [package.extras]
-speedups = ["Brotli", "aiodns", "cchardet"]
+speedups = ["Brotli", "aiodns", "brotlicffi"]
 
 [[package]]
 name = "aiosignal"
@@ -150,6 +140,17 @@ files = [
 [package.dependencies]
 frozenlist = ">=1.1.0"
 
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+description = "Reusable constraint types to use with typing.Annotated"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"},
+    {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"},
+]
+
 [[package]]
 name = "async-timeout"
 version = "4.0.3"
@@ -163,137 +164,150 @@ files = [
 
 [[package]]
 name = "attrs"
-version = "23.1.0"
+version = "23.2.0"
 description = "Classes Without Boilerplate"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"},
-    {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"},
+    {file = "attrs-23.2.0-py3-none-any.whl", hash = "sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1"},
+    {file = "attrs-23.2.0.tar.gz", hash = "sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30"},
 ]
 
 [package.extras]
 cov = ["attrs[tests]", "coverage[toml] (>=5.3)"]
-dev = ["attrs[docs,tests]", "pre-commit"]
+dev = ["attrs[tests]", "pre-commit"]
 docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"]
 tests = ["attrs[tests-no-zope]", "zope-interface"]
-tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
-
-[[package]]
-name = "backoff"
-version = "2.2.1"
-description = "Function decoration for backoff and retry"
-optional = false
-python-versions = ">=3.7,<4.0"
-files = [
-    {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"},
-    {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
-]
+tests-mypy = ["mypy (>=1.6)", "pytest-mypy-plugins"]
+tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "pytest (>=4.3.0)", "pytest-xdist[psutil]"]
 
 [[package]]
 name = "bitsandbytes"
-version = "0.41.1"
+version = "0.43.1"
 description = "k-bit optimizers and matrix multiplication routines."
 optional = true
 python-versions = "*"
 files = [
-    {file = "bitsandbytes-0.41.1-py3-none-any.whl", hash = "sha256:b25228c27636367f222232ed4d1e1502eedd2064be215633734fb8ea0c1c65f4"},
-    {file = "bitsandbytes-0.41.1.tar.gz", hash = "sha256:b3f8e7e1e5f88d4813d10ebd4072034ba6a18eca7f0e255376f8320e5499032c"},
+    {file = "bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:a81c826d576d6d691c7b4a7491c8fdc0f37f769795d6ca2e54afa605d2c260a3"},
+    {file = "bitsandbytes-0.43.1-py3-none-win_amd64.whl", hash = "sha256:52c1c7189a6ca006555a9663e544e75f40520a97a26e075411f9f9aca0771fcd"},
 ]
 
+[package.dependencies]
+numpy = "*"
+torch = "*"
+
+[package.extras]
+benchmark = ["matplotlib", "pandas"]
+test = ["scipy"]
+
 [[package]]
 name = "certifi"
-version = "2023.7.22"
+version = "2024.6.2"
 description = "Python package for providing Mozilla's CA Bundle."
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
-    {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
+    {file = "certifi-2024.6.2-py3-none-any.whl", hash = "sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56"},
+    {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"},
 ]
 
 [[package]]
 name = "charset-normalizer"
-version = "3.2.0"
+version = "3.3.2"
 description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
 optional = false
 python-versions = ">=3.7.0"
 files = [
-    {file = "charset-normalizer-3.2.0.tar.gz", hash = "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-win32.whl", hash = "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96"},
-    {file = "charset_normalizer-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-win32.whl", hash = "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1"},
-    {file = "charset_normalizer-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1"},
-    {file = "charset_normalizer-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-win32.whl", hash = "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706"},
-    {file = "charset_normalizer-3.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-win32.whl", hash = "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9"},
-    {file = "charset_normalizer-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80"},
-    {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
+    {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"},
+    {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"},
+    {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"},
+    {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"},
+    {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"},
+    {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"},
+    {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"},
+    {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"},
 ]
 
 [[package]]
@@ -310,6 +324,17 @@ files = [
 [package.dependencies]
 colorama = {version = "*", markers = "platform_system == \"Windows\""}
 
+[[package]]
+name = "cloudpickle"
+version = "3.0.0"
+description = "Pickler class to extend the standard pickle.Pickler functionality"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "cloudpickle-3.0.0-py3-none-any.whl", hash = "sha256:246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7"},
+    {file = "cloudpickle-3.0.0.tar.gz", hash = "sha256:996d9a482c6fb4f33c1a35335cf8afd065d2a56e973270364840712d9131a882"},
+]
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -394,6 +419,17 @@ files = [
 [package.extras]
 graph = ["objgraph (>=1.7.2)"]
 
+[[package]]
+name = "diskcache"
+version = "5.6.3"
+description = "Disk Cache -- Disk and file backed persistent cache."
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"},
+    {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"},
+]
+
 [[package]]
 name = "einops"
 version = "0.6.1"
@@ -407,13 +443,13 @@ files = [
 
 [[package]]
 name = "exceptiongroup"
-version = "1.1.3"
+version = "1.2.1"
 description = "Backport of PEP 654 (exception groups)"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"},
-    {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"},
+    {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"},
+    {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"},
 ]
 
 [package.extras]
@@ -421,113 +457,127 @@ test = ["pytest (>=6)"]
 
 [[package]]
 name = "filelock"
-version = "3.12.3"
+version = "3.14.0"
 description = "A platform independent file lock."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"},
-    {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"},
+    {file = "filelock-3.14.0-py3-none-any.whl", hash = "sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f"},
+    {file = "filelock-3.14.0.tar.gz", hash = "sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a"},
 ]
 
-[package.dependencies]
-typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""}
-
 [package.extras]
-docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"]
-testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"]
+docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"]
+typing = ["typing-extensions (>=4.8)"]
 
 [[package]]
 name = "frozenlist"
-version = "1.4.0"
+version = "1.4.1"
 description = "A list-like structure which implements collections.abc.MutableSequence"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"},
-    {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"},
-    {file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"},
-    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"},
-    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"},
-    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"},
-    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"},
-    {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"},
-    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"},
-    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"},
-    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"},
-    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"},
-    {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"},
-    {file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"},
-    {file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"},
-    {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"},
-    {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"},
-    {file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"},
-    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"},
-    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"},
-    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"},
-    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"},
-    {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"},
-    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"},
-    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"},
-    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"},
-    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"},
-    {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"},
-    {file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"},
-    {file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"},
-    {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"},
-    {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"},
-    {file = "frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"},
-    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"},
-    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"},
-    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"},
-    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"},
-    {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"},
-    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"},
-    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"},
-    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"},
-    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"},
-    {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"},
-    {file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"},
-    {file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"},
-    {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"},
-    {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"},
-    {file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"},
-    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"},
-    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"},
-    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"},
-    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"},
-    {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"},
-    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"},
-    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"},
-    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"},
-    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"},
-    {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"},
-    {file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"},
-    {file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"},
-    {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"},
+    {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"},
+    {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"},
+    {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"},
+    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"},
+    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"},
+    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"},
+    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"},
+    {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"},
+    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"},
+    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"},
+    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"},
+    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"},
+    {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"},
+    {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"},
+    {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"},
+    {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"},
+    {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"},
+    {file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"},
+    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"},
+    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"},
+    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"},
+    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"},
+    {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"},
+    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"},
+    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"},
+    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"},
+    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"},
+    {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"},
+    {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"},
+    {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"},
+    {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"},
+    {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"},
+    {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"},
+    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"},
+    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"},
+    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"},
+    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"},
+    {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"},
+    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"},
+    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"},
+    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"},
+    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"},
+    {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"},
+    {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"},
+    {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"},
+    {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"},
+    {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"},
+    {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"},
+    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"},
+    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"},
+    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"},
+    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"},
+    {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"},
+    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"},
+    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"},
+    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"},
+    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"},
+    {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"},
+    {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"},
+    {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"},
+    {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"},
+    {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"},
+    {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"},
+    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"},
+    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"},
+    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"},
+    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"},
+    {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"},
+    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"},
+    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"},
+    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"},
+    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"},
+    {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"},
+    {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"},
+    {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"},
+    {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"},
+    {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"},
 ]
 
 [[package]]
 name = "fsspec"
-version = "2023.6.0"
+version = "2024.6.0"
 description = "File-system specification"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "fsspec-2023.6.0-py3-none-any.whl", hash = "sha256:1cbad1faef3e391fba6dc005ae9b5bdcbf43005c9167ce78c915549c352c869a"},
-    {file = "fsspec-2023.6.0.tar.gz", hash = "sha256:d0b2f935446169753e7a5c5c55681c54ea91996cc67be93c39a154fb3a2742af"},
+    {file = "fsspec-2024.6.0-py3-none-any.whl", hash = "sha256:58d7122eb8a1a46f7f13453187bfea4972d66bf01618d37366521b1998034cee"},
+    {file = "fsspec-2024.6.0.tar.gz", hash = "sha256:f579960a56e6d8038a9efc8f9c77279ec12e6299aa86b0769a7e9c46b94527c2"},
 ]
 
 [package.dependencies]
 aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""}
-requests = {version = "*", optional = true, markers = "extra == \"http\""}
 
 [package.extras]
 abfs = ["adlfs"]
 adl = ["adlfs"]
 arrow = ["pyarrow (>=1)"]
 dask = ["dask", "distributed"]
-devel = ["pytest", "pytest-cov"]
+dev = ["pre-commit", "ruff"]
+doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"]
 dropbox = ["dropbox", "dropboxdrivefs", "requests"]
 full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"]
 fuse = ["fusepy"]
@@ -537,41 +587,44 @@ github = ["requests"]
 gs = ["gcsfs"]
 gui = ["panel"]
 hdfs = ["pyarrow (>=1)"]
-http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"]
+http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"]
 libarchive = ["libarchive-c"]
 oci = ["ocifs"]
 s3 = ["s3fs"]
 sftp = ["paramiko"]
 smb = ["smbprotocol"]
 ssh = ["paramiko"]
+test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"]
+test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"]
+test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"]
 tqdm = ["tqdm"]
 
 [[package]]
 name = "googleapis-common-protos"
-version = "1.60.0"
+version = "1.63.1"
 description = "Common protobufs used in Google APIs"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "googleapis-common-protos-1.60.0.tar.gz", hash = "sha256:e73ebb404098db405ba95d1e1ae0aa91c3e15a71da031a2eeb6b2e23e7bc3708"},
-    {file = "googleapis_common_protos-1.60.0-py2.py3-none-any.whl", hash = "sha256:69f9bbcc6acde92cab2db95ce30a70bd2b81d20b12eff3f1aabaffcbe8a93918"},
+    {file = "googleapis-common-protos-1.63.1.tar.gz", hash = "sha256:c6442f7a0a6b2a80369457d79e6672bb7dcbaab88e0848302497e3ec80780a6a"},
+    {file = "googleapis_common_protos-1.63.1-py2.py3-none-any.whl", hash = "sha256:0e1c2cdfcbc354b76e4a211a35ea35d6926a835cba1377073c4861db904a1877"},
 ]
 
 [package.dependencies]
-protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0.dev0"
+protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
 
 [package.extras]
 grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
 
 [[package]]
 name = "grpc-interceptor"
-version = "0.15.3"
+version = "0.15.4"
 description = "Simplifies gRPC interceptors"
 optional = false
 python-versions = ">=3.7,<4.0"
 files = [
-    {file = "grpc-interceptor-0.15.3.tar.gz", hash = "sha256:33592cb9d8c00fceed5755c71029f75aef55b273496dbced06f1d48f2571fcc3"},
-    {file = "grpc_interceptor-0.15.3-py3-none-any.whl", hash = "sha256:96be2043b7e49f9deb444f18b61c373ea28d22d81c90cd3b82127a4744eb9247"},
+    {file = "grpc-interceptor-0.15.4.tar.gz", hash = "sha256:1f45c0bcb58b6f332f37c637632247c9b02bc6af0fdceb7ba7ce8d2ebbfb0926"},
+    {file = "grpc_interceptor-0.15.4-py3-none-any.whl", hash = "sha256:0035f33228693ed3767ee49d937bac424318db173fef4d2d0170b3215f254d9d"},
 ]
 
 [package.dependencies]
@@ -582,193 +635,247 @@ testing = ["protobuf (>=4.21.9)"]
 
 [[package]]
 name = "grpcio"
-version = "1.57.0"
+version = "1.64.1"
 description = "HTTP/2-based RPC framework"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "grpcio-1.57.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:092fa155b945015754bdf988be47793c377b52b88d546e45c6a9f9579ac7f7b6"},
-    {file = "grpcio-1.57.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2f7349786da979a94690cc5c2b804cab4e8774a3cf59be40d037c4342c906649"},
-    {file = "grpcio-1.57.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:82640e57fb86ea1d71ea9ab54f7e942502cf98a429a200b2e743d8672171734f"},
-    {file = "grpcio-1.57.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40b72effd4c789de94ce1be2b5f88d7b9b5f7379fe9645f198854112a6567d9a"},
-    {file = "grpcio-1.57.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f708a6a17868ad8bf586598bee69abded4996b18adf26fd2d91191383b79019"},
-    {file = "grpcio-1.57.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:60fe15288a0a65d5c1cb5b4a62b1850d07336e3ba728257a810317be14f0c527"},
-    {file = "grpcio-1.57.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6907b1cf8bb29b058081d2aad677b15757a44ef2d4d8d9130271d2ad5e33efca"},
-    {file = "grpcio-1.57.0-cp310-cp310-win32.whl", hash = "sha256:57b183e8b252825c4dd29114d6c13559be95387aafc10a7be645462a0fc98bbb"},
-    {file = "grpcio-1.57.0-cp310-cp310-win_amd64.whl", hash = "sha256:7b400807fa749a9eb286e2cd893e501b110b4d356a218426cb9c825a0474ca56"},
-    {file = "grpcio-1.57.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:c6ebecfb7a31385393203eb04ed8b6a08f5002f53df3d59e5e795edb80999652"},
-    {file = "grpcio-1.57.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:00258cbe3f5188629828363ae8ff78477ce976a6f63fb2bb5e90088396faa82e"},
-    {file = "grpcio-1.57.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:23e7d8849a0e58b806253fd206ac105b328171e01b8f18c7d5922274958cc87e"},
-    {file = "grpcio-1.57.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5371bcd861e679d63b8274f73ac281751d34bd54eccdbfcd6aa00e692a82cd7b"},
-    {file = "grpcio-1.57.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aed90d93b731929e742967e236f842a4a2174dc5db077c8f9ad2c5996f89f63e"},
-    {file = "grpcio-1.57.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fe752639919aad9ffb0dee0d87f29a6467d1ef764f13c4644d212a9a853a078d"},
-    {file = "grpcio-1.57.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fada6b07ec4f0befe05218181f4b85176f11d531911b64c715d1875c4736d73a"},
-    {file = "grpcio-1.57.0-cp311-cp311-win32.whl", hash = "sha256:bb396952cfa7ad2f01061fbc7dc1ad91dd9d69243bcb8110cf4e36924785a0fe"},
-    {file = "grpcio-1.57.0-cp311-cp311-win_amd64.whl", hash = "sha256:e503cb45ed12b924b5b988ba9576dc9949b2f5283b8e33b21dcb6be74a7c58d0"},
-    {file = "grpcio-1.57.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:fd173b4cf02b20f60860dc2ffe30115c18972d7d6d2d69df97ac38dee03be5bf"},
-    {file = "grpcio-1.57.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:d7f8df114d6b4cf5a916b98389aeaf1e3132035420a88beea4e3d977e5f267a5"},
-    {file = "grpcio-1.57.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:76c44efa4ede1f42a9d5b2fed1fe9377e73a109bef8675fb0728eb80b0b8e8f2"},
-    {file = "grpcio-1.57.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4faea2cfdf762a664ab90589b66f416274887641ae17817de510b8178356bf73"},
-    {file = "grpcio-1.57.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c60b83c43faeb6d0a9831f0351d7787a0753f5087cc6fa218d78fdf38e5acef0"},
-    {file = "grpcio-1.57.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b363bbb5253e5f9c23d8a0a034dfdf1b7c9e7f12e602fc788c435171e96daccc"},
-    {file = "grpcio-1.57.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:f1fb0fd4a1e9b11ac21c30c169d169ef434c6e9344ee0ab27cfa6f605f6387b2"},
-    {file = "grpcio-1.57.0-cp37-cp37m-win_amd64.whl", hash = "sha256:34950353539e7d93f61c6796a007c705d663f3be41166358e3d88c45760c7d98"},
-    {file = "grpcio-1.57.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:871f9999e0211f9551f368612460442a5436d9444606184652117d6a688c9f51"},
-    {file = "grpcio-1.57.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:a8a8e560e8dbbdf29288872e91efd22af71e88b0e5736b0daf7773c1fecd99f0"},
-    {file = "grpcio-1.57.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:2313b124e475aa9017a9844bdc5eafb2d5abdda9d456af16fc4535408c7d6da6"},
-    {file = "grpcio-1.57.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b4098b6b638d9e0ca839a81656a2fd4bc26c9486ea707e8b1437d6f9d61c3941"},
-    {file = "grpcio-1.57.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e5b58e32ae14658085c16986d11e99abd002ddbf51c8daae8a0671fffb3467f"},
-    {file = "grpcio-1.57.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0f80bf37f09e1caba6a8063e56e2b87fa335add314cf2b78ebf7cb45aa7e3d06"},
-    {file = "grpcio-1.57.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5b7a4ce8f862fe32b2a10b57752cf3169f5fe2915acfe7e6a1e155db3da99e79"},
-    {file = "grpcio-1.57.0-cp38-cp38-win32.whl", hash = "sha256:9338bacf172e942e62e5889b6364e56657fbf8ac68062e8b25c48843e7b202bb"},
-    {file = "grpcio-1.57.0-cp38-cp38-win_amd64.whl", hash = "sha256:e1cb52fa2d67d7f7fab310b600f22ce1ff04d562d46e9e0ac3e3403c2bb4cc16"},
-    {file = "grpcio-1.57.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:fee387d2fab144e8a34e0e9c5ca0f45c9376b99de45628265cfa9886b1dbe62b"},
-    {file = "grpcio-1.57.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:b53333627283e7241fcc217323f225c37783b5f0472316edcaa4479a213abfa6"},
-    {file = "grpcio-1.57.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:f19ac6ac0a256cf77d3cc926ef0b4e64a9725cc612f97228cd5dc4bd9dbab03b"},
-    {file = "grpcio-1.57.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3fdf04e402f12e1de8074458549337febb3b45f21076cc02ef4ff786aff687e"},
-    {file = "grpcio-1.57.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5613a2fecc82f95d6c51d15b9a72705553aa0d7c932fad7aed7afb51dc982ee5"},
-    {file = "grpcio-1.57.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b670c2faa92124b7397b42303e4d8eb64a4cd0b7a77e35a9e865a55d61c57ef9"},
-    {file = "grpcio-1.57.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a635589201b18510ff988161b7b573f50c6a48fae9cb567657920ca82022b37"},
-    {file = "grpcio-1.57.0-cp39-cp39-win32.whl", hash = "sha256:d78d8b86fcdfa1e4c21f8896614b6cc7ee01a2a758ec0c4382d662f2a62cf766"},
-    {file = "grpcio-1.57.0-cp39-cp39-win_amd64.whl", hash = "sha256:20ec6fc4ad47d1b6e12deec5045ec3cd5402d9a1597f738263e98f490fe07056"},
-    {file = "grpcio-1.57.0.tar.gz", hash = "sha256:4b089f7ad1eb00a104078bab8015b0ed0ebcb3b589e527ab009c53893fd4e613"},
+    {file = "grpcio-1.64.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:55697ecec192bc3f2f3cc13a295ab670f51de29884ca9ae6cd6247df55df2502"},
+    {file = "grpcio-1.64.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:3b64ae304c175671efdaa7ec9ae2cc36996b681eb63ca39c464958396697daff"},
+    {file = "grpcio-1.64.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:bac71b4b28bc9af61efcdc7630b166440bbfbaa80940c9a697271b5e1dabbc61"},
+    {file = "grpcio-1.64.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c024ffc22d6dc59000faf8ad781696d81e8e38f4078cb0f2630b4a3cf231a90"},
+    {file = "grpcio-1.64.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7cd5c1325f6808b8ae31657d281aadb2a51ac11ab081ae335f4f7fc44c1721d"},
+    {file = "grpcio-1.64.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:0a2813093ddb27418a4c99f9b1c223fab0b053157176a64cc9db0f4557b69bd9"},
+    {file = "grpcio-1.64.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2981c7365a9353f9b5c864595c510c983251b1ab403e05b1ccc70a3d9541a73b"},
+    {file = "grpcio-1.64.1-cp310-cp310-win32.whl", hash = "sha256:1262402af5a511c245c3ae918167eca57342c72320dffae5d9b51840c4b2f86d"},
+    {file = "grpcio-1.64.1-cp310-cp310-win_amd64.whl", hash = "sha256:19264fc964576ddb065368cae953f8d0514ecc6cb3da8903766d9fb9d4554c33"},
+    {file = "grpcio-1.64.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:58b1041e7c870bb30ee41d3090cbd6f0851f30ae4eb68228955d973d3efa2e61"},
+    {file = "grpcio-1.64.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bbc5b1d78a7822b0a84c6f8917faa986c1a744e65d762ef6d8be9d75677af2ca"},
+    {file = "grpcio-1.64.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:5841dd1f284bd1b3d8a6eca3a7f062b06f1eec09b184397e1d1d43447e89a7ae"},
+    {file = "grpcio-1.64.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8caee47e970b92b3dd948371230fcceb80d3f2277b3bf7fbd7c0564e7d39068e"},
+    {file = "grpcio-1.64.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73819689c169417a4f978e562d24f2def2be75739c4bed1992435d007819da1b"},
+    {file = "grpcio-1.64.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6503b64c8b2dfad299749cad1b595c650c91e5b2c8a1b775380fcf8d2cbba1e9"},
+    {file = "grpcio-1.64.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1de403fc1305fd96cfa75e83be3dee8538f2413a6b1685b8452301c7ba33c294"},
+    {file = "grpcio-1.64.1-cp311-cp311-win32.whl", hash = "sha256:d4d29cc612e1332237877dfa7fe687157973aab1d63bd0f84cf06692f04c0367"},
+    {file = "grpcio-1.64.1-cp311-cp311-win_amd64.whl", hash = "sha256:5e56462b05a6f860b72f0fa50dca06d5b26543a4e88d0396259a07dc30f4e5aa"},
+    {file = "grpcio-1.64.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:4657d24c8063e6095f850b68f2d1ba3b39f2b287a38242dcabc166453e950c59"},
+    {file = "grpcio-1.64.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:62b4e6eb7bf901719fce0ca83e3ed474ae5022bb3827b0a501e056458c51c0a1"},
+    {file = "grpcio-1.64.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:ee73a2f5ca4ba44fa33b4d7d2c71e2c8a9e9f78d53f6507ad68e7d2ad5f64a22"},
+    {file = "grpcio-1.64.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:198908f9b22e2672a998870355e226a725aeab327ac4e6ff3a1399792ece4762"},
+    {file = "grpcio-1.64.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b9d0acaa8d835a6566c640f48b50054f422d03e77e49716d4c4e8e279665a1"},
+    {file = "grpcio-1.64.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5e42634a989c3aa6049f132266faf6b949ec2a6f7d302dbb5c15395b77d757eb"},
+    {file = "grpcio-1.64.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:b1a82e0b9b3022799c336e1fc0f6210adc019ae84efb7321d668129d28ee1efb"},
+    {file = "grpcio-1.64.1-cp312-cp312-win32.whl", hash = "sha256:55260032b95c49bee69a423c2f5365baa9369d2f7d233e933564d8a47b893027"},
+    {file = "grpcio-1.64.1-cp312-cp312-win_amd64.whl", hash = "sha256:c1a786ac592b47573a5bb7e35665c08064a5d77ab88a076eec11f8ae86b3e3f6"},
+    {file = "grpcio-1.64.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:a011ac6c03cfe162ff2b727bcb530567826cec85eb8d4ad2bfb4bd023287a52d"},
+    {file = "grpcio-1.64.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4d6dab6124225496010bd22690f2d9bd35c7cbb267b3f14e7a3eb05c911325d4"},
+    {file = "grpcio-1.64.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:a5e771d0252e871ce194d0fdcafd13971f1aae0ddacc5f25615030d5df55c3a2"},
+    {file = "grpcio-1.64.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c3c1b90ab93fed424e454e93c0ed0b9d552bdf1b0929712b094f5ecfe7a23ad"},
+    {file = "grpcio-1.64.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20405cb8b13fd779135df23fabadc53b86522d0f1cba8cca0e87968587f50650"},
+    {file = "grpcio-1.64.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0cc79c982ccb2feec8aad0e8fb0d168bcbca85bc77b080d0d3c5f2f15c24ea8f"},
+    {file = "grpcio-1.64.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a3a035c37ce7565b8f4f35ff683a4db34d24e53dc487e47438e434eb3f701b2a"},
+    {file = "grpcio-1.64.1-cp38-cp38-win32.whl", hash = "sha256:1257b76748612aca0f89beec7fa0615727fd6f2a1ad580a9638816a4b2eb18fd"},
+    {file = "grpcio-1.64.1-cp38-cp38-win_amd64.whl", hash = "sha256:0a12ddb1678ebc6a84ec6b0487feac020ee2b1659cbe69b80f06dbffdb249122"},
+    {file = "grpcio-1.64.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:75dbbf415026d2862192fe1b28d71f209e2fd87079d98470db90bebe57b33179"},
+    {file = "grpcio-1.64.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e3d9f8d1221baa0ced7ec7322a981e28deb23749c76eeeb3d33e18b72935ab62"},
+    {file = "grpcio-1.64.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:5f8b75f64d5d324c565b263c67dbe4f0af595635bbdd93bb1a88189fc62ed2e5"},
+    {file = "grpcio-1.64.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c84ad903d0d94311a2b7eea608da163dace97c5fe9412ea311e72c3684925602"},
+    {file = "grpcio-1.64.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:940e3ec884520155f68a3b712d045e077d61c520a195d1a5932c531f11883489"},
+    {file = "grpcio-1.64.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f10193c69fc9d3d726e83bbf0f3d316f1847c3071c8c93d8090cf5f326b14309"},
+    {file = "grpcio-1.64.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac15b6c2c80a4d1338b04d42a02d376a53395ddf0ec9ab157cbaf44191f3ffdd"},
+    {file = "grpcio-1.64.1-cp39-cp39-win32.whl", hash = "sha256:03b43d0ccf99c557ec671c7dede64f023c7da9bb632ac65dbc57f166e4970040"},
+    {file = "grpcio-1.64.1-cp39-cp39-win_amd64.whl", hash = "sha256:ed6091fa0adcc7e4ff944090cf203a52da35c37a130efa564ded02b7aff63bcd"},
+    {file = "grpcio-1.64.1.tar.gz", hash = "sha256:8d51dd1c59d5fa0f34266b80a3805ec29a1f26425c2a54736133f6d87fc4968a"},
 ]
 
 [package.extras]
-protobuf = ["grpcio-tools (>=1.57.0)"]
+protobuf = ["grpcio-tools (>=1.64.1)"]
 
 [[package]]
 name = "grpcio-reflection"
-version = "1.57.0"
+version = "1.62.2"
 description = "Standard Protobuf Reflection Service for gRPC"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "grpcio-reflection-1.57.0.tar.gz", hash = "sha256:8f63a18729cba995a172f8325235f5094cb066febec75f9a3b1b2e28328aa166"},
-    {file = "grpcio_reflection-1.57.0-py3-none-any.whl", hash = "sha256:d7deb8587f9d0095fb5d367c2aa5ce1380e3f23b0f8bca6c00bc404c5429cb6a"},
+    {file = "grpcio-reflection-1.62.2.tar.gz", hash = "sha256:2dd44806d68d0006636529bda573012b19a42281478c2d051cdaaebb91e2516c"},
+    {file = "grpcio_reflection-1.62.2-py3-none-any.whl", hash = "sha256:68e8dff3617a9afaf7c462c688f7ca62b55323f497c662abf9965f2953508885"},
 ]
 
 [package.dependencies]
-grpcio = ">=1.57.0"
+grpcio = ">=1.62.2"
 protobuf = ">=4.21.6"
 
 [[package]]
 name = "grpcio-status"
-version = "1.57.0"
+version = "1.62.2"
 description = "Status proto mapping for gRPC"
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "grpcio-status-1.57.0.tar.gz", hash = "sha256:b098da99df1eebe58337f8f78e50df990273ccacc1226fddeb47c590e3df9e02"},
-    {file = "grpcio_status-1.57.0-py3-none-any.whl", hash = "sha256:15d6af055914ebbc4ed17e55ebfb8e6bb17a45a57fea32e6af19978fb7844690"},
+    {file = "grpcio-status-1.62.2.tar.gz", hash = "sha256:62e1bfcb02025a1cd73732a2d33672d3e9d0df4d21c12c51e0bbcaf09bab742a"},
+    {file = "grpcio_status-1.62.2-py3-none-any.whl", hash = "sha256:206ddf0eb36bc99b033f03b2c8e95d319f0044defae9b41ae21408e7e0cda48f"},
 ]
 
 [package.dependencies]
 googleapis-common-protos = ">=1.5.5"
-grpcio = ">=1.57.0"
+grpcio = ">=1.62.2"
 protobuf = ">=4.21.6"
 
 [[package]]
 name = "grpcio-tools"
-version = "1.57.0"
+version = "1.62.2"
 description = "Protobuf code generator for gRPC"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "grpcio-tools-1.57.0.tar.gz", hash = "sha256:2f16130d869ce27ecd623194547b649dd657333ec7e8644cc571c645781a9b85"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:4fb8a8468031f858381a576078924af364a08833d8f8f3237018252c4573a802"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:35bf0dad8a3562043345236c26d0053a856fb06c04d7da652f2ded914e508ae7"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:ec9aab2fb6783c7fc54bc28f58eb75f1ca77594e6b0fd5e5e7a8114a95169fe0"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0cf5fc0a1c23f8ea34b408b72fb0e90eec0f404ad4dba98e8f6da3c9ce34e2ed"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26e69d08a515554e0cfe1ec4d31568836f4b17f0ff82294f957f629388629eb9"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c39a3656576b6fdaaf28abe0467f7a7231df4230c1bee132322dbc3209419e7f"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f64f8ab22d27d4a5693310748d35a696061c3b5c7b8c4fb4ab3b4bc1068b6b56"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-win32.whl", hash = "sha256:d2a134756f4db34759a5cc7f7e43f7eb87540b68d1cca62925593c6fb93924f7"},
-    {file = "grpcio_tools-1.57.0-cp310-cp310-win_amd64.whl", hash = "sha256:9a3d60fb8d46ede26c1907c146561b3a9caa20a7aff961bc661ef8226f85a2e9"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:aac98ecad8f7bd4301855669d42a5d97ef7bb34bec2b1e74c7a0641d47e313cf"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:cdd020cb68b51462983b7c2dfbc3eb6ede032b8bf438d4554df0c3f08ce35c76"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:f54081b08419a39221cd646363b5708857c696b3ad4784f1dcf310891e33a5f7"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ed85a0291fff45b67f2557fe7f117d3bc7af8b54b8619d27bf374b5c8b7e3ca2"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e868cd6feb3ef07d4b35be104fe1fd0657db05259ff8f8ec5e08f4f89ca1191d"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:dfb6f6120587b8e228a3cae5ee4985b5bdc18501bad05c49df61965dfc9d70a9"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4a7ad7f328e28fc97c356d0f10fb10d8b5151bb65aa7cf14bf8084513f0b7306"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-win32.whl", hash = "sha256:9867f2817b1a0c93c523f89ac6c9d8625548af4620a7ce438bf5a76e23327284"},
-    {file = "grpcio_tools-1.57.0-cp311-cp311-win_amd64.whl", hash = "sha256:1f9e917a9f18087f6c14b4d4508fb94fca5c2f96852363a89232fb9b2124ac1f"},
-    {file = "grpcio_tools-1.57.0-cp37-cp37m-linux_armv7l.whl", hash = "sha256:9f2aefa8a37bd2c4db1a3f1aca11377e2766214520fb70e67071f4ff8d8b0fa5"},
-    {file = "grpcio_tools-1.57.0-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:850cbda0ec5d24c39e7215ede410276040692ca45d105fbbeada407fa03f0ac0"},
-    {file = "grpcio_tools-1.57.0-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:6fa52972c9647876ea35f6dc2b51002a74ed900ec7894586cbb2fe76f64f99de"},
-    {file = "grpcio_tools-1.57.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76c0eea89d7542719594e50e2283f51a072978b953e8b3e9fd7c59a2c762d4c1"},
-    {file = "grpcio_tools-1.57.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3da5240211252fc70a6451fe00c143e2ab2f7bfc2445695ad2ed056b8e48d96"},
-    {file = "grpcio_tools-1.57.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a0256f8786ac9e4db618a1aa492bb3472569a0946fd3ee862ffe23196323da55"},
-    {file = "grpcio_tools-1.57.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c026bdf5c1366ce88b7bbe2d8207374d675afd3fd911f60752103de3da4a41d2"},
-    {file = "grpcio_tools-1.57.0-cp37-cp37m-win_amd64.whl", hash = "sha256:9053c2f655589545be08b9d6a673e92970173a4bf11a4b9f18cd6e9af626b587"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:81ec4dbb696e095057b2528d11a8da04be6bbe2b967fa07d4ea9ba6354338cbf"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:495e2946406963e0b9f063f76d5af0f2a19517dac2b367b5b044432ac9194296"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:7b46fc6aa8eb7edd18cafcd21fd98703cb6c09e46b507de335fca7f0161dfccb"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb81ff861692111fa81bd85f64584e624cb4013bd66fbce8a209b8893f5ce398"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a42dc220eb5305f470855c9284f4c8e85ae59d6d742cd07946b0cbe5e9ca186"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:90d10d9038ba46a595a223a34f136c9230e3d6d7abc2433dbf0e1c31939d3a8b"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5bc3e6d338aefb052e19cedabe00452be46d0c10a4ed29ee77abb00402e438fe"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-win32.whl", hash = "sha256:34b36217b17b5bea674a414229913e1fd80ede328be51e1b531fcc62abd393b0"},
-    {file = "grpcio_tools-1.57.0-cp38-cp38-win_amd64.whl", hash = "sha256:dbde4004a0688400036342ff73e3706e8940483e2871547b1354d59e93a38277"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:784574709b9690dc28696617ea69352e2132352fdfc9bc89afa8e39f99ae538e"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:85ac4e62eb44428cde025fd9ab7554002315fc7880f791c553fc5a0015cc9931"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:dc771d4db5701f280957bbcee91745e0686d00ed1c6aa7e05ba30a58b02d70a1"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3ac06703c412f8167a9062eaf6099409967e33bf98fa5b02be4b4689b6bdf39"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02d78c034109f46032c7217260066d49d41e6bcaf588fa28fa40fe2f83445347"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2db25f15ed44327f2e02d0c4fe741ac966f9500e407047d8a7c7fccf2df65616"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2b417c97936d94874a3ce7ed8deab910f2233e3612134507cfee4af8735c38a6"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-win32.whl", hash = "sha256:f717cce5093e6b6049d9ea6d12fdf3658efdb1a80772f7737db1f8510b876df6"},
-    {file = "grpcio_tools-1.57.0-cp39-cp39-win_amd64.whl", hash = "sha256:1c0e8a1a32973a5d59fbcc19232f925e5c48116e9411f788033a31c5ca5130b4"},
+    {file = "grpcio-tools-1.62.2.tar.gz", hash = "sha256:5fd5e1582b678e6b941ee5f5809340be5e0724691df5299aae8226640f94e18f"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-linux_armv7l.whl", hash = "sha256:1679b4903aed2dc5bd8cb22a452225b05dc8470a076f14fd703581efc0740cdb"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:9d41e0e47dd075c075bb8f103422968a65dd0d8dc8613288f573ae91eb1053ba"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:987e774f74296842bbffd55ea8826370f70c499e5b5f71a8cf3103838b6ee9c3"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40cd4eeea4b25bcb6903b82930d579027d034ba944393c4751cdefd9c49e6989"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6746bc823958499a3cf8963cc1de00072962fb5e629f26d658882d3f4c35095"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2ed775e844566ce9ce089be9a81a8b928623b8ee5820f5e4d58c1a9d33dfc5ae"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bdc5dd3f57b5368d5d661d5d3703bcaa38bceca59d25955dff66244dbc987271"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-win32.whl", hash = "sha256:3a8d6f07e64c0c7756f4e0c4781d9d5a2b9cc9cbd28f7032a6fb8d4f847d0445"},
+    {file = "grpcio_tools-1.62.2-cp310-cp310-win_amd64.whl", hash = "sha256:e33b59fb3efdddeb97ded988a871710033e8638534c826567738d3edce528752"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-linux_armv7l.whl", hash = "sha256:472505d030135d73afe4143b0873efe0dcb385bd6d847553b4f3afe07679af00"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:ec674b4440ef4311ac1245a709e87b36aca493ddc6850eebe0b278d1f2b6e7d1"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:184b4174d4bd82089d706e8223e46c42390a6ebac191073b9772abc77308f9fa"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c195d74fe98541178ece7a50dad2197d43991e0f77372b9a88da438be2486f12"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a34d97c62e61bfe9e6cff0410fe144ac8cca2fc979ad0be46b7edf026339d161"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cbb8453ae83a1db2452b7fe0f4b78e4a8dd32be0f2b2b73591ae620d4d784d3d"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4f989e5cebead3ae92c6abf6bf7b19949e1563a776aea896ac5933f143f0c45d"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-win32.whl", hash = "sha256:c48fabe40b9170f4e3d7dd2c252e4f1ff395dc24e49ac15fc724b1b6f11724da"},
+    {file = "grpcio_tools-1.62.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c616d0ad872e3780693fce6a3ac8ef00fc0963e6d7815ce9dcfae68ba0fc287"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-linux_armv7l.whl", hash = "sha256:10cc3321704ecd17c93cf68c99c35467a8a97ffaaed53207e9b2da6ae0308ee1"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:9be84ff6d47fd61462be7523b49d7ba01adf67ce4e1447eae37721ab32464dd8"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:d82f681c9a9d933a9d8068e8e382977768e7779ddb8870fa0cf918d8250d1532"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:04c607029ae3660fb1624ed273811ffe09d57d84287d37e63b5b802a35897329"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72b61332f1b439c14cbd3815174a8f1d35067a02047c32decd406b3a09bb9890"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8214820990d01b52845f9fbcb92d2b7384a0c321b303e3ac614c219dc7d1d3af"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:462e0ab8dd7c7b70bfd6e3195eebc177549ede5cf3189814850c76f9a340d7ce"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-win32.whl", hash = "sha256:fa107460c842e4c1a6266150881694fefd4f33baa544ea9489601810c2210ef8"},
+    {file = "grpcio_tools-1.62.2-cp312-cp312-win_amd64.whl", hash = "sha256:759c60f24c33a181bbbc1232a6752f9b49fbb1583312a4917e2b389fea0fb0f2"},
+    {file = "grpcio_tools-1.62.2-cp37-cp37m-linux_armv7l.whl", hash = "sha256:45db5da2bcfa88f2b86b57ef35daaae85c60bd6754a051d35d9449c959925b57"},
+    {file = "grpcio_tools-1.62.2-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:ab84bae88597133f6ea7a2bdc57b2fda98a266fe8d8d4763652cbefd20e73ad7"},
+    {file = "grpcio_tools-1.62.2-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:7a49bccae1c7d154b78e991885c3111c9ad8c8fa98e91233de425718f47c6139"},
+    {file = "grpcio_tools-1.62.2-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7e439476b29d6dac363b321781a113794397afceeb97dad85349db5f1cb5e9a"},
+    {file = "grpcio_tools-1.62.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ea369c4d1567d1acdf69c8ea74144f4ccad9e545df7f9a4fc64c94fa7684ba3"},
+    {file = "grpcio_tools-1.62.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4f955702dc4b530696375251319d05223b729ed24e8673c2129f7a75d2caefbb"},
+    {file = "grpcio_tools-1.62.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3708a747aa4b6b505727282ca887041174e146ae030ebcadaf4c1d346858df62"},
+    {file = "grpcio_tools-1.62.2-cp37-cp37m-win_amd64.whl", hash = "sha256:2ce149ea55eadb486a7fb75a20f63ef3ac065ee6a0240ed25f3549ce7954c653"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-linux_armv7l.whl", hash = "sha256:58cbb24b3fa6ae35aa9c210fcea3a51aa5fef0cd25618eb4fd94f746d5a9b703"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:6413581e14a80e0b4532577766cf0586de4dd33766a31b3eb5374a746771c07d"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:47117c8a7e861382470d0e22d336e5a91fdc5f851d1db44fa784b9acea190d87"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f1ba79a253df9e553d20319c615fa2b429684580fa042dba618d7f6649ac7e4"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:04a394cf5e51ba9be412eb9f6c482b6270bd81016e033e8eb7d21b8cc28fe8b5"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3c53b221378b035ae2f1881cbc3aca42a6075a8e90e1a342c2f205eb1d1aa6a1"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c384c838b34d1b67068e51b5bbe49caa6aa3633acd158f1ab16b5da8d226bc53"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-win32.whl", hash = "sha256:19ea69e41c3565932aa28a202d1875ec56786aea46a2eab54a3b28e8a27f9517"},
+    {file = "grpcio_tools-1.62.2-cp38-cp38-win_amd64.whl", hash = "sha256:1d768a5c07279a4c461ebf52d0cec1c6ca85c6291c71ec2703fe3c3e7e28e8c4"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-linux_armv7l.whl", hash = "sha256:5b07b5874187e170edfbd7aa2ca3a54ebf3b2952487653e8c0b0d83601c33035"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:d58389fe8be206ddfb4fa703db1e24c956856fcb9a81da62b13577b3a8f7fda7"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:7d8b4e00c3d7237b92260fc18a561cd81f1da82e8be100db1b7d816250defc66"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fe08d2038f2b7c53259b5c49e0ad08c8e0ce2b548d8185993e7ef67e8592cca"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19216e1fb26dbe23d12a810517e1b3fbb8d4f98b1a3fbebeec9d93a79f092de4"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b8574469ecc4ff41d6bb95f44e0297cdb0d95bade388552a9a444db9cd7485cd"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4f6f32d39283ea834a493fccf0ebe9cfddee7577bdcc27736ad4be1732a36399"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-win32.whl", hash = "sha256:76eb459bdf3fb666e01883270beee18f3f11ed44488486b61cd210b4e0e17cc1"},
+    {file = "grpcio_tools-1.62.2-cp39-cp39-win_amd64.whl", hash = "sha256:217c2ee6a7ce519a55958b8622e21804f6fdb774db08c322f4c9536c35fdce7c"},
 ]
 
 [package.dependencies]
-grpcio = ">=1.57.0"
+grpcio = ">=1.62.2"
 protobuf = ">=4.21.6,<5.0dev"
 setuptools = "*"
 
 [[package]]
 name = "hf-transfer"
-version = "0.1.3"
+version = "0.1.6"
 description = ""
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "hf_transfer-0.1.3-cp310-cp310-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:862b6ddba8e236bdc73408c20d020cfe5069cac3fd0b6de901c46f031df2b7d9"},
-    {file = "hf_transfer-0.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:569ef1ec6fec182e706ade4ea0c63f8510fd618ed7ced7c772efaafac7245b07"},
-    {file = "hf_transfer-0.1.3-cp310-none-win_amd64.whl", hash = "sha256:c9faa88b3491c50d4aa75faf18ae24040cd91aa0565c7f7ba2357dbcbf8372f6"},
-    {file = "hf_transfer-0.1.3-cp311-cp311-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:d53954ecfaadc84c15481bf5d4c7282323196b4b6df1d1be54208d4fdedfb407"},
-    {file = "hf_transfer-0.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:334862f4a82f8a09d6d3f550e67d7e498bb8882e678b7725638254fed3276801"},
-    {file = "hf_transfer-0.1.3-cp311-none-win_amd64.whl", hash = "sha256:da92a1483a66cf2baa96de133c75e7d5d9a60e4a0e60d228f26c573c73a1feb6"},
-    {file = "hf_transfer-0.1.3-cp37-cp37m-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:617692a70cf299576d82cfc860923f29ec5c834a3f5242bc0853d4f106670398"},
-    {file = "hf_transfer-0.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca218fb6efc5c22379d9e64989e84bff426fcf5664fdbbf7cd70aa8b79497652"},
-    {file = "hf_transfer-0.1.3-cp37-none-win_amd64.whl", hash = "sha256:6e5201b648df6106c232fcdb507db734081fd6220dfb1c432bd27c6fa9453331"},
-    {file = "hf_transfer-0.1.3-cp38-cp38-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:445edfcd9d59c9d2541957177a9c9225b1f0e8855f6311fb16e20f67c3426421"},
-    {file = "hf_transfer-0.1.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c1bdfa554f0b0936c1623b99058c6998a00fdcd86f75d9203f3f66572d2e30c"},
-    {file = "hf_transfer-0.1.3-cp38-none-win_amd64.whl", hash = "sha256:606f2fe436e5be73f07987a56cd97c695805413d29203ae39ebd9fc596405435"},
-    {file = "hf_transfer-0.1.3-cp39-cp39-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:9913f5ad2644a1f57c1b7755a7d959ca5e0189863bb0473817d0707af230bf6a"},
-    {file = "hf_transfer-0.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d935946791e338f748e05a23df877d74fbcd39dc7b537f0aa2e5a5841cf7dde8"},
-    {file = "hf_transfer-0.1.3-cp39-none-win_amd64.whl", hash = "sha256:79099ac043423b263a2843a24213418f309d5c8bc458776622bffe012ebced73"},
-    {file = "hf_transfer-0.1.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ce6c5965a57d94db5e043aa488a4df929a32000db125d9c9a1d325e8c7006dc"},
-    {file = "hf_transfer-0.1.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a7934c8e491bb395731f677f66dd5f6641432f338a3a9efc9f0b6c186d37cf8"},
-    {file = "hf_transfer-0.1.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efb8b41360c7e3d7700c147b70688aed0a03e86fbe5bcfdee079b0e634f026f9"},
-    {file = "hf_transfer-0.1.3.tar.gz", hash = "sha256:7afd7eb03efad7812a48591b639b2e3f3d1f93c1e9060c18cc63ebf08d7e193c"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:6fd3d61f9229d27def007e53540412507b74ac2fdb1a29985ae0b6a5137749a2"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b043bb78df1225de043eb041de9d97783fcca14a0bdc1b1d560fc172fc21b648"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7db60dd18eae4fa6ea157235fb82196cde5313995b396d1b591aad3b790a7f8f"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:30d31dbab9b5a558cce407b8728e39d87d7af1ef8745ddb90187e9ae0b9e1e90"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f6b368bddd757efc7af3126ba81f9ac8f9435e2cc00902cb3d64f2be28d8f719"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa2086d8aefaaa3e144e167324574882004c0cec49bf2d0638ec4b74732d8da0"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:45d8985a0940bfe1535cb4ca781f5c11e47c83798ef3373ee1f5d57bbe527a9c"},
+    {file = "hf_transfer-0.1.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f42b89735f1cde22f2a795d1f0915741023235666be7de45879e533c7d6010c"},
+    {file = "hf_transfer-0.1.6-cp310-none-win32.whl", hash = "sha256:2d2c4c4613f3ad45b6ce6291e347b2d3ba1b86816635681436567e461cb3c961"},
+    {file = "hf_transfer-0.1.6-cp310-none-win_amd64.whl", hash = "sha256:78b0eed8d8dce60168a46e584b9742b816af127d7e410a713e12c31249195342"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f1d8c172153f9a6cdaecf137612c42796076f61f6bea1072c90ac2e17c1ab6fa"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2c601996351f90c514a75a0eeb02bf700b1ad1db2d946cbfe4b60b79e29f0b2f"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e585c808405557d3f5488f385706abb696997bbae262ea04520757e30836d9d"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ec51af1e8cf4268c268bd88932ade3d7ca895a3c661b42493503f02610ae906b"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d106fdf996332f6df3ed3fab6d6332df82e8c1fb4b20fd81a491ca4d2ab5616a"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e9c2ee9e9fde5a0319cc0e8ddfea10897482bc06d5709b10a238f1bc2ebcbc0b"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f394ea32bc7802b061e549d3133efc523b4ae4fd19bf4b74b183ca6066eef94e"},
+    {file = "hf_transfer-0.1.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4282f09902114cd67fca98a1a1bad569a44521a8395fedf327e966714f68b977"},
+    {file = "hf_transfer-0.1.6-cp311-none-win32.whl", hash = "sha256:276dbf307d5ab6f1bcbf57b5918bfcf9c59d6848ccb28242349e1bb5985f983b"},
+    {file = "hf_transfer-0.1.6-cp311-none-win_amd64.whl", hash = "sha256:fa475175c51451186bea804471995fa8e7b2a48a61dcca55534911dc25955527"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:23d157a67acfa00007799323a1c441b2bbacc7dee625b016b7946fe0e25e6c89"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6067342a2864b988f861cd2d31bd78eb1e84d153a3f6df38485b6696d9ad3013"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91cfcb3070e205b58fa8dc8bcb6a62ccc40913fcdb9cd1ff7c364c8e3aa85345"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb76064ac5165d5eeaaf8d0903e8bf55477221ecc2a4a4d69f0baca065ab905b"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dabd3a177d83028f164984cf4dd859f77ec1e20c97a6f307ff8fcada0785ef1"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d0bf4254e44f64a26e0a5b73b5d7e8d91bb36870718fb4f8e126ec943ff4c805"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d32c1b106f38f336ceb21531f4db9b57d777b9a33017dafdb6a5316388ebe50"},
+    {file = "hf_transfer-0.1.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff05aba3c83921e5c7635ba9f07c693cc893350c447644824043aeac27b285f5"},
+    {file = "hf_transfer-0.1.6-cp312-none-win32.whl", hash = "sha256:051ef0c55607652cb5974f59638da035773254b9a07d7ee5b574fe062de4c9d1"},
+    {file = "hf_transfer-0.1.6-cp312-none-win_amd64.whl", hash = "sha256:716fb5c574fcbdd8092ce73f9b6c66f42e3544337490f77c60ec07df02bd081b"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c0c981134a55965e279cb7be778c1ccaf93f902fc9ebe31da4f30caf824cc4d"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ef1f145f04c5b573915bcb1eb5db4039c74f6b46fce73fc473c4287e613b623"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0a7609b004db3347dbb7796df45403eceb171238210d054d93897d6d84c63a4"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60f0864bf5996773dbd5f8ae4d1649041f773fe9d5769f4c0eeb5553100acef3"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d01e55d630ffe70a4f5d0ed576a04c6a48d7c65ca9a7d18f2fca385f20685a9"},
+    {file = "hf_transfer-0.1.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d855946c5062b665190de15b2bdbd4c8eddfee35350bfb7564592e23d36fbbd3"},
+    {file = "hf_transfer-0.1.6-cp37-none-win32.whl", hash = "sha256:fd40b2409cfaf3e8aba20169ee09552f69140e029adeec261b988903ff0c8f6f"},
+    {file = "hf_transfer-0.1.6-cp37-none-win_amd64.whl", hash = "sha256:0e0eba49d46d3b5481919aea0794aec625fbc6ecdf13fe7e0e9f3fc5d5ad5971"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e669fecb29fc454449739f9f53ed9253197e7c19e6a6eaa0f08334207af4287"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:89f701802892e5eb84f89f402686861f87dc227d6082b05f4e9d9b4e8015a3c3"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6f2b0c8b95b01409275d789a9b74d5f2e146346f985d384bf50ec727caf1ccc"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa855a2fa262792a230f9efcdb5da6d431b747d1861d2a69fe7834b19aea077e"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa8ca349afb2f0713475426946261eb2035e4efb50ebd2c1d5ad04f395f4217"},
+    {file = "hf_transfer-0.1.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01255f043996bc7d1bae62d8afc5033a90c7e36ce308b988eeb84afe0a69562f"},
+    {file = "hf_transfer-0.1.6-cp38-none-win32.whl", hash = "sha256:60b1db183e8a7540cd4f8b2160ff4de55f77cb0c3fc6a10be1e7c30eb1b2bdeb"},
+    {file = "hf_transfer-0.1.6-cp38-none-win_amd64.whl", hash = "sha256:fb8be3cba6aaa50ab2e9dffbd25c8eb2046785eeff642cf0cdd0dd9ae6be3539"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d09af35e3e3f09b664e6429e9a0dc200f29c5bdfd88bdd9666de51183b1fe202"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a4505bd707cc14d85c800f961fad8ca76f804a8ad22fbb7b1a217d8d0c15e6a5"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c453fd8b0be9740faa23cecd1f28ee9ead7d900cefa64ff836960c503a744c9"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:13cb8884e718a78c3b81a8cdec9c7ac196dd42961fce55c3ccff3dd783e5ad7a"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39cd39df171a2b5404de69c4e6cd14eee47f6fe91c1692f939bfb9e59a0110d8"},
+    {file = "hf_transfer-0.1.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ff0629ee9f98df57a783599602eb498f9ec3619dc69348b12e4d9d754abf0e9"},
+    {file = "hf_transfer-0.1.6-cp39-none-win32.whl", hash = "sha256:164a6ce445eb0cc7c645f5b6e1042c003d33292520c90052b6325f30c98e4c5f"},
+    {file = "hf_transfer-0.1.6-cp39-none-win_amd64.whl", hash = "sha256:11b8b4b73bf455f13218c5f827698a30ae10998ca31b8264b51052868c7a9f11"},
+    {file = "hf_transfer-0.1.6-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16957ba057376a99ea361074ce1094f61b58e769defa6be2422ae59c0b6a6530"},
+    {file = "hf_transfer-0.1.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7db952112e3b8ee1a5cbf500d2443e9ce4fb893281c5310a3e31469898628005"},
+    {file = "hf_transfer-0.1.6-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d39d826a7344f5e39f438d62632acd00467aa54a083b66496f61ef67a9885a56"},
+    {file = "hf_transfer-0.1.6-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4e2653fbfa92e7651db73d99b697c8684e7345c479bd6857da80bed6138abb2"},
+    {file = "hf_transfer-0.1.6-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:144277e6a86add10b90ec3b583253aec777130312256bfc8d5ade5377e253807"},
+    {file = "hf_transfer-0.1.6-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bb53bcd16365313b2aa0dbdc28206f577d70770f31249cdabc387ac5841edcc"},
+    {file = "hf_transfer-0.1.6-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:990d73a5a68d8261980f146c51f4c5f9995314011cb225222021ad7c39f3af2d"},
+    {file = "hf_transfer-0.1.6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:652406037029ab9b4097b4c5f29321bad5f64c2b46fbff142509d918aec87c29"},
+    {file = "hf_transfer-0.1.6.tar.gz", hash = "sha256:deb505a7d417d7055fd7b3549eadb91dfe782941261f3344025c486c16d1d2f9"},
 ]
 
 [[package]]
 name = "huggingface-hub"
-version = "0.16.4"
+version = "0.23.2"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
-python-versions = ">=3.7.0"
+python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.16.4-py3-none-any.whl", hash = "sha256:0d3df29932f334fead024afc7cb4cc5149d955238b8b5e42dcf9740d6995a349"},
-    {file = "huggingface_hub-0.16.4.tar.gz", hash = "sha256:608c7d4f3d368b326d1747f91523dbd1f692871e8e2e7a4750314a2dd8b63e14"},
+    {file = "huggingface_hub-0.23.2-py3-none-any.whl", hash = "sha256:48727a16e704d409c4bb5913613308499664f22a99743435dc3a13b23c485827"},
+    {file = "huggingface_hub-0.23.2.tar.gz", hash = "sha256:f6829b62d5fdecb452a76fdbec620cba4c1573655a8d710c1df71735fd9edbd2"},
 ]
 
 [package.dependencies]
 filelock = "*"
-fsspec = "*"
+fsspec = ">=2023.5.0"
 packaging = ">=20.9"
 pyyaml = ">=5.1"
 requests = "*"
@@ -776,28 +883,49 @@ tqdm = ">=4.42.1"
 typing-extensions = ">=3.7.4.3"
 
 [package.extras]
-all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"]
+all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
 cli = ["InquirerPy (==0.3.4)"]
-dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"]
+dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
 fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
-inference = ["aiohttp", "pydantic"]
-quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"]
+hf-transfer = ["hf-transfer (>=0.1.4)"]
+inference = ["aiohttp", "minijinja (>=1.0)"]
+quality = ["mypy (==1.5.1)", "ruff (>=0.3.0)"]
 tensorflow = ["graphviz", "pydot", "tensorflow"]
-testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
-torch = ["torch"]
-typing = ["pydantic", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"]
+tensorflow-testing = ["keras (<3.0)", "tensorflow"]
+testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
+torch = ["safetensors", "torch"]
+typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
 
 [[package]]
 name = "idna"
-version = "3.4"
+version = "3.7"
 description = "Internationalized Domain Names in Applications (IDNA)"
 optional = false
 python-versions = ">=3.5"
 files = [
-    {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"},
-    {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"},
+    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
+    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
 ]
 
+[[package]]
+name = "importlib-metadata"
+version = "7.1.0"
+description = "Read metadata from Python packages"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"},
+    {file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"},
+]
+
+[package.dependencies]
+zipp = ">=0.5"
+
+[package.extras]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+perf = ["ipython"]
+testing = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"]
+
 [[package]]
 name = "iniconfig"
 version = "2.0.0"
@@ -810,14 +938,39 @@ files = [
 ]
 
 [[package]]
-name = "jinja2"
-version = "3.1.2"
-description = "A very fast and expressive template engine."
-optional = false
+name = "intel-openmp"
+version = "2021.4.0"
+description = "Intel OpenMP* Runtime Library"
+optional = true
+python-versions = "*"
+files = [
+    {file = "intel_openmp-2021.4.0-py2.py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:41c01e266a7fdb631a7609191709322da2bbf24b252ba763f125dd651bcc7675"},
+    {file = "intel_openmp-2021.4.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:3b921236a38384e2016f0f3d65af6732cf2c12918087128a9163225451e776f2"},
+    {file = "intel_openmp-2021.4.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:e2240ab8d01472fed04f3544a878cda5da16c26232b7ea1b59132dbfb48b186e"},
+    {file = "intel_openmp-2021.4.0-py2.py3-none-win32.whl", hash = "sha256:6e863d8fd3d7e8ef389d52cf97a50fe2afe1a19247e8c0d168ce021546f96fc9"},
+    {file = "intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:eef4c8bcc8acefd7f5cd3b9384dbf73d59e2c99fc56545712ded913f43c4a94f"},
+]
+
+[[package]]
+name = "interegular"
+version = "0.3.3"
+description = "a regex intersection checker"
+optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"},
-    {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"},
+    {file = "interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c"},
+    {file = "interegular-0.3.3.tar.gz", hash = "sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600"},
+]
+
+[[package]]
+name = "jinja2"
+version = "3.1.4"
+description = "A very fast and expressive template engine."
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"},
+    {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
 ]
 
 [package.dependencies]
@@ -826,6 +979,99 @@ MarkupSafe = ">=2.0"
 [package.extras]
 i18n = ["Babel (>=2.7)"]
 
+[[package]]
+name = "joblib"
+version = "1.4.2"
+description = "Lightweight pipelining with Python functions"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"},
+    {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"},
+]
+
+[[package]]
+name = "jsonschema"
+version = "4.22.0"
+description = "An implementation of JSON Schema validation for Python"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "jsonschema-4.22.0-py3-none-any.whl", hash = "sha256:ff4cfd6b1367a40e7bc6411caec72effadd3db0bbe5017de188f2d6108335802"},
+    {file = "jsonschema-4.22.0.tar.gz", hash = "sha256:5b22d434a45935119af990552c862e5d6d564e8f6601206b305a61fdf661a2b7"},
+]
+
+[package.dependencies]
+attrs = ">=22.2.0"
+jsonschema-specifications = ">=2023.03.6"
+referencing = ">=0.28.4"
+rpds-py = ">=0.7.1"
+
+[package.extras]
+format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"]
+format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"]
+
+[[package]]
+name = "jsonschema-specifications"
+version = "2023.12.1"
+description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "jsonschema_specifications-2023.12.1-py3-none-any.whl", hash = "sha256:87e4fdf3a94858b8a2ba2778d9ba57d8a9cafca7c7489c46ba0d30a8bc6a9c3c"},
+    {file = "jsonschema_specifications-2023.12.1.tar.gz", hash = "sha256:48a76787b3e70f5ed53f1160d2b81f586e4ca6d1548c5de7085d1682674764cc"},
+]
+
+[package.dependencies]
+referencing = ">=0.31.0"
+
+[[package]]
+name = "lark"
+version = "1.1.9"
+description = "a modern parsing library"
+optional = true
+python-versions = ">=3.6"
+files = [
+    {file = "lark-1.1.9-py3-none-any.whl", hash = "sha256:a0dd3a87289f8ccbb325901e4222e723e7d745dbfc1803eaf5f3d2ace19cf2db"},
+    {file = "lark-1.1.9.tar.gz", hash = "sha256:15fa5236490824c2c4aba0e22d2d6d823575dcaf4cdd1848e34b6ad836240fba"},
+]
+
+[package.extras]
+atomic-cache = ["atomicwrites"]
+interegular = ["interegular (>=0.3.1,<0.4.0)"]
+nearley = ["js2py"]
+regex = ["regex"]
+
+[[package]]
+name = "llvmlite"
+version = "0.42.0"
+description = "lightweight wrapper around basic LLVM functionality"
+optional = true
+python-versions = ">=3.9"
+files = [
+    {file = "llvmlite-0.42.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3366938e1bf63d26c34fbfb4c8e8d2ded57d11e0567d5bb243d89aab1eb56098"},
+    {file = "llvmlite-0.42.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c35da49666a21185d21b551fc3caf46a935d54d66969d32d72af109b5e7d2b6f"},
+    {file = "llvmlite-0.42.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70f44ccc3c6220bd23e0ba698a63ec2a7d3205da0d848804807f37fc243e3f77"},
+    {file = "llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:763f8d8717a9073b9e0246998de89929071d15b47f254c10eef2310b9aac033d"},
+    {file = "llvmlite-0.42.0-cp310-cp310-win_amd64.whl", hash = "sha256:8d90edf400b4ceb3a0e776b6c6e4656d05c7187c439587e06f86afceb66d2be5"},
+    {file = "llvmlite-0.42.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ae511caed28beaf1252dbaf5f40e663f533b79ceb408c874c01754cafabb9cbf"},
+    {file = "llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81e674c2fe85576e6c4474e8c7e7aba7901ac0196e864fe7985492b737dbab65"},
+    {file = "llvmlite-0.42.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb3975787f13eb97629052edb5017f6c170eebc1c14a0433e8089e5db43bcce6"},
+    {file = "llvmlite-0.42.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5bece0cdf77f22379f19b1959ccd7aee518afa4afbd3656c6365865f84903f9"},
+    {file = "llvmlite-0.42.0-cp311-cp311-win_amd64.whl", hash = "sha256:7e0c4c11c8c2aa9b0701f91b799cb9134a6a6de51444eff5a9087fc7c1384275"},
+    {file = "llvmlite-0.42.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:08fa9ab02b0d0179c688a4216b8939138266519aaa0aa94f1195a8542faedb56"},
+    {file = "llvmlite-0.42.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b2fce7d355068494d1e42202c7aff25d50c462584233013eb4470c33b995e3ee"},
+    {file = "llvmlite-0.42.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebe66a86dc44634b59a3bc860c7b20d26d9aaffcd30364ebe8ba79161a9121f4"},
+    {file = "llvmlite-0.42.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47494552559e00d81bfb836cf1c4d5a5062e54102cc5767d5aa1e77ccd2505c"},
+    {file = "llvmlite-0.42.0-cp312-cp312-win_amd64.whl", hash = "sha256:05cb7e9b6ce69165ce4d1b994fbdedca0c62492e537b0cc86141b6e2c78d5888"},
+    {file = "llvmlite-0.42.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bdd3888544538a94d7ec99e7c62a0cdd8833609c85f0c23fcb6c5c591aec60ad"},
+    {file = "llvmlite-0.42.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0936c2067a67fb8816c908d5457d63eba3e2b17e515c5fe00e5ee2bace06040"},
+    {file = "llvmlite-0.42.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a78ab89f1924fc11482209f6799a7a3fc74ddc80425a7a3e0e8174af0e9e2301"},
+    {file = "llvmlite-0.42.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7599b65c7af7abbc978dbf345712c60fd596aa5670496561cc10e8a71cebfb2"},
+    {file = "llvmlite-0.42.0-cp39-cp39-win_amd64.whl", hash = "sha256:43d65cc4e206c2e902c1004dd5418417c4efa6c1d04df05c6c5675a27e8ca90e"},
+    {file = "llvmlite-0.42.0.tar.gz", hash = "sha256:f92b09243c0cc3f457da8b983f67bd8e1295d0f5b3746c7a1861d7a99403854a"},
+]
+
 [[package]]
 name = "loguru"
 version = "0.6.0"
@@ -846,68 +1092,96 @@ dev = ["Sphinx (>=4.1.1)", "black (>=19.10b0)", "colorama (>=0.3.4)", "docutils
 
 [[package]]
 name = "markupsafe"
-version = "2.1.3"
+version = "2.1.5"
 description = "Safely add untrusted strings to HTML/XML markup."
-optional = false
+optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "MarkupSafe-2.1.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-win32.whl", hash = "sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431"},
-    {file = "MarkupSafe-2.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
-    {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-win32.whl", hash = "sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0"},
-    {file = "MarkupSafe-2.1.3-cp37-cp37m-win_amd64.whl", hash = "sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-win32.whl", hash = "sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5"},
-    {file = "MarkupSafe-2.1.3-cp38-cp38-win_amd64.whl", hash = "sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-win32.whl", hash = "sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2"},
-    {file = "MarkupSafe-2.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba"},
-    {file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"},
+    {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"},
+    {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"},
+    {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"},
+    {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"},
+    {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"},
+    {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"},
+    {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"},
 ]
 
+[[package]]
+name = "mkl"
+version = "2021.4.0"
+description = "Intel® oneAPI Math Kernel Library"
+optional = true
+python-versions = "*"
+files = [
+    {file = "mkl-2021.4.0-py2.py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:67460f5cd7e30e405b54d70d1ed3ca78118370b65f7327d495e9c8847705e2fb"},
+    {file = "mkl-2021.4.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:636d07d90e68ccc9630c654d47ce9fdeb036bb46e2b193b3a9ac8cfea683cce5"},
+    {file = "mkl-2021.4.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:398dbf2b0d12acaf54117a5210e8f191827f373d362d796091d161f610c1ebfb"},
+    {file = "mkl-2021.4.0-py2.py3-none-win32.whl", hash = "sha256:439c640b269a5668134e3dcbcea4350459c4a8bc46469669b2d67e07e3d330e8"},
+    {file = "mkl-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:ceef3cafce4c009dd25f65d7ad0d833a0fbadc3d8903991ec92351fe5de1e718"},
+]
+
+[package.dependencies]
+intel-openmp = "==2021.*"
+tbb = "==2021.*"
+
 [[package]]
 name = "mpmath"
 version = "1.3.0"
 description = "Python library for arbitrary-precision floating-point arithmetic"
-optional = false
+optional = true
 python-versions = "*"
 files = [
     {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"},
@@ -922,85 +1196,101 @@ tests = ["pytest (>=4.6)"]
 
 [[package]]
 name = "multidict"
-version = "6.0.4"
+version = "6.0.5"
 description = "multidict implementation"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"},
-    {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"},
-    {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"},
-    {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"},
-    {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"},
-    {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"},
-    {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"},
-    {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"},
-    {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"},
-    {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"},
-    {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"},
-    {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"},
-    {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"},
-    {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"},
-    {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"},
-    {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"},
-    {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"},
-    {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"},
-    {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"},
-    {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"},
-    {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"},
-    {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"},
-    {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"},
-    {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"},
-    {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"},
-    {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"},
-    {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"},
-    {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"},
-    {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"},
-    {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"},
-    {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"},
-    {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"},
-    {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"},
-    {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"},
+    {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"},
+    {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"},
+    {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"},
+    {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"},
+    {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"},
+    {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"},
+    {file = "multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"},
+    {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"},
+    {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"},
+    {file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"},
+    {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"},
+    {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"},
+    {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"},
+    {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"},
+    {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e"},
+    {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b"},
+    {file = "multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226"},
+    {file = "multidict-6.0.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6"},
+    {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda"},
+    {file = "multidict-6.0.5-cp312-cp312-win32.whl", hash = "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5"},
+    {file = "multidict-6.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556"},
+    {file = "multidict-6.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626"},
+    {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3"},
+    {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc"},
+    {file = "multidict-6.0.5-cp37-cp37m-win32.whl", hash = "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee"},
+    {file = "multidict-6.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423"},
+    {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54"},
+    {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d"},
+    {file = "multidict-6.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50"},
+    {file = "multidict-6.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461"},
+    {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44"},
+    {file = "multidict-6.0.5-cp38-cp38-win32.whl", hash = "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241"},
+    {file = "multidict-6.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c"},
+    {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"},
+    {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"},
+    {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"},
+    {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"},
+    {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"},
+    {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"},
+    {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"},
+    {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"},
+    {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"},
 ]
 
 [[package]]
@@ -1032,140 +1322,348 @@ files = [
 dill = ">=0.3.7"
 
 [[package]]
-name = "networkx"
-version = "3.1"
-description = "Python package for creating and manipulating graphs and networks"
-optional = false
-python-versions = ">=3.8"
+name = "nest-asyncio"
+version = "1.6.0"
+description = "Patch asyncio to allow nested event loops"
+optional = true
+python-versions = ">=3.5"
 files = [
-    {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"},
-    {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"},
+    {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"},
+    {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
+]
+
+[[package]]
+name = "networkx"
+version = "3.2.1"
+description = "Python package for creating and manipulating graphs and networks"
+optional = true
+python-versions = ">=3.9"
+files = [
+    {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"},
+    {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"},
 ]
 
 [package.extras]
-default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"]
-developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"]
-doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"]
-extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
-test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
+default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"]
+developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"]
+doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"]
+extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"]
+test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]
+
+[[package]]
+name = "numba"
+version = "0.59.1"
+description = "compiling Python code using LLVM"
+optional = true
+python-versions = ">=3.9"
+files = [
+    {file = "numba-0.59.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:97385a7f12212c4f4bc28f648720a92514bee79d7063e40ef66c2d30600fd18e"},
+    {file = "numba-0.59.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0b77aecf52040de2a1eb1d7e314497b9e56fba17466c80b457b971a25bb1576d"},
+    {file = "numba-0.59.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3476a4f641bfd58f35ead42f4dcaf5f132569c4647c6f1360ccf18ee4cda3990"},
+    {file = "numba-0.59.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:525ef3f820931bdae95ee5379c670d5c97289c6520726bc6937a4a7d4230ba24"},
+    {file = "numba-0.59.1-cp310-cp310-win_amd64.whl", hash = "sha256:990e395e44d192a12105eca3083b61307db7da10e093972ca285c85bef0963d6"},
+    {file = "numba-0.59.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:43727e7ad20b3ec23ee4fc642f5b61845c71f75dd2825b3c234390c6d8d64051"},
+    {file = "numba-0.59.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:411df625372c77959570050e861981e9d196cc1da9aa62c3d6a836b5cc338966"},
+    {file = "numba-0.59.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2801003caa263d1e8497fb84829a7ecfb61738a95f62bc05693fcf1733e978e4"},
+    {file = "numba-0.59.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dd2842fac03be4e5324ebbbd4d2d0c8c0fc6e0df75c09477dd45b288a0777389"},
+    {file = "numba-0.59.1-cp311-cp311-win_amd64.whl", hash = "sha256:0594b3dfb369fada1f8bb2e3045cd6c61a564c62e50cf1f86b4666bc721b3450"},
+    {file = "numba-0.59.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1cce206a3b92836cdf26ef39d3a3242fec25e07f020cc4feec4c4a865e340569"},
+    {file = "numba-0.59.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8c8b4477763cb1fbd86a3be7050500229417bf60867c93e131fd2626edb02238"},
+    {file = "numba-0.59.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d80bce4ef7e65bf895c29e3889ca75a29ee01da80266a01d34815918e365835"},
+    {file = "numba-0.59.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f7ad1d217773e89a9845886401eaaab0a156a90aa2f179fdc125261fd1105096"},
+    {file = "numba-0.59.1-cp312-cp312-win_amd64.whl", hash = "sha256:5bf68f4d69dd3a9f26a9b23548fa23e3bcb9042e2935257b471d2a8d3c424b7f"},
+    {file = "numba-0.59.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4e0318ae729de6e5dbe64c75ead1a95eb01fabfe0e2ebed81ebf0344d32db0ae"},
+    {file = "numba-0.59.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0f68589740a8c38bb7dc1b938b55d1145244c8353078eea23895d4f82c8b9ec1"},
+    {file = "numba-0.59.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:649913a3758891c77c32e2d2a3bcbedf4a69f5fea276d11f9119677c45a422e8"},
+    {file = "numba-0.59.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9712808e4545270291d76b9a264839ac878c5eb7d8b6e02c970dc0ac29bc8187"},
+    {file = "numba-0.59.1-cp39-cp39-win_amd64.whl", hash = "sha256:8d51ccd7008a83105ad6a0082b6a2b70f1142dc7cfd76deb8c5a862367eb8c86"},
+    {file = "numba-0.59.1.tar.gz", hash = "sha256:76f69132b96028d2774ed20415e8c528a34e3299a40581bae178f0994a2f370b"},
+]
+
+[package.dependencies]
+llvmlite = "==0.42.*"
+numpy = ">=1.22,<1.27"
 
 [[package]]
 name = "numpy"
-version = "1.25.2"
+version = "1.26.4"
 description = "Fundamental package for array computing in Python"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "numpy-1.25.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:db3ccc4e37a6873045580d413fe79b68e47a681af8db2e046f1dacfa11f86eb3"},
-    {file = "numpy-1.25.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:90319e4f002795ccfc9050110bbbaa16c944b1c37c0baeea43c5fb881693ae1f"},
-    {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfe4a913e29b418d096e696ddd422d8a5d13ffba4ea91f9f60440a3b759b0187"},
-    {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f08f2e037bba04e707eebf4bc934f1972a315c883a9e0ebfa8a7756eabf9e357"},
-    {file = "numpy-1.25.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bec1e7213c7cb00d67093247f8c4db156fd03075f49876957dca4711306d39c9"},
-    {file = "numpy-1.25.2-cp310-cp310-win32.whl", hash = "sha256:7dc869c0c75988e1c693d0e2d5b26034644399dd929bc049db55395b1379e044"},
-    {file = "numpy-1.25.2-cp310-cp310-win_amd64.whl", hash = "sha256:834b386f2b8210dca38c71a6e0f4fd6922f7d3fcff935dbe3a570945acb1b545"},
-    {file = "numpy-1.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c5462d19336db4560041517dbb7759c21d181a67cb01b36ca109b2ae37d32418"},
-    {file = "numpy-1.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5652ea24d33585ea39eb6a6a15dac87a1206a692719ff45d53c5282e66d4a8f"},
-    {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d60fbae8e0019865fc4784745814cff1c421df5afee233db6d88ab4f14655a2"},
-    {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e7f0f7f6d0eee8364b9a6304c2845b9c491ac706048c7e8cf47b83123b8dbf"},
-    {file = "numpy-1.25.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb33d5a1cf360304754913a350edda36d5b8c5331a8237268c48f91253c3a364"},
-    {file = "numpy-1.25.2-cp311-cp311-win32.whl", hash = "sha256:5883c06bb92f2e6c8181df7b39971a5fb436288db58b5a1c3967702d4278691d"},
-    {file = "numpy-1.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:5c97325a0ba6f9d041feb9390924614b60b99209a71a69c876f71052521d42a4"},
-    {file = "numpy-1.25.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b79e513d7aac42ae918db3ad1341a015488530d0bb2a6abcbdd10a3a829ccfd3"},
-    {file = "numpy-1.25.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eb942bfb6f84df5ce05dbf4b46673ffed0d3da59f13635ea9b926af3deb76926"},
-    {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0746410e73384e70d286f93abf2520035250aad8c5714240b0492a7302fdca"},
-    {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7806500e4f5bdd04095e849265e55de20d8cc4b661b038957354327f6d9b295"},
-    {file = "numpy-1.25.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8b77775f4b7df768967a7c8b3567e309f617dd5e99aeb886fa14dc1a0791141f"},
-    {file = "numpy-1.25.2-cp39-cp39-win32.whl", hash = "sha256:2792d23d62ec51e50ce4d4b7d73de8f67a2fd3ea710dcbc8563a51a03fb07b01"},
-    {file = "numpy-1.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:76b4115d42a7dfc5d485d358728cdd8719be33cc5ec6ec08632a5d6fca2ed380"},
-    {file = "numpy-1.25.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a1329e26f46230bf77b02cc19e900db9b52f398d6722ca853349a782d4cff55"},
-    {file = "numpy-1.25.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c3abc71e8b6edba80a01a52e66d83c5d14433cbcd26a40c329ec7ed09f37901"},
-    {file = "numpy-1.25.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1b9735c27cea5d995496f46a8b1cd7b408b3f34b6d50459d9ac8fe3a20cc17bf"},
-    {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"},
+    {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"},
+    {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"},
+    {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"},
+    {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"},
+    {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"},
+    {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"},
+    {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"},
+    {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"},
+    {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"},
+    {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"},
+    {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"},
+    {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"},
+    {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"},
+    {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"},
+    {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"},
+    {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"},
+    {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"},
+    {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"},
+    {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"},
+    {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"},
+    {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"},
+    {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"},
+]
+
+[[package]]
+name = "nvidia-cublas-cu12"
+version = "12.1.3.1"
+description = "CUBLAS native runtime libraries"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"},
+    {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"},
+]
+
+[[package]]
+name = "nvidia-cuda-cupti-cu12"
+version = "12.1.105"
+description = "CUDA profiling tools runtime libs."
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"},
+    {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"},
+]
+
+[[package]]
+name = "nvidia-cuda-nvrtc-cu12"
+version = "12.1.105"
+description = "NVRTC native runtime libraries"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"},
+    {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"},
+]
+
+[[package]]
+name = "nvidia-cuda-runtime-cu12"
+version = "12.1.105"
+description = "CUDA Runtime native Libraries"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"},
+    {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"},
+]
+
+[[package]]
+name = "nvidia-cudnn-cu12"
+version = "8.9.2.26"
+description = "cuDNN runtime libraries"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl", hash = "sha256:5ccb288774fdfb07a7e7025ffec286971c06d8d7b4fb162525334616d7629ff9"},
+]
+
+[package.dependencies]
+nvidia-cublas-cu12 = "*"
+
+[[package]]
+name = "nvidia-cufft-cu12"
+version = "11.0.2.54"
+description = "CUFFT native runtime libraries"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"},
+    {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"},
+]
+
+[[package]]
+name = "nvidia-curand-cu12"
+version = "10.3.2.106"
+description = "CURAND native runtime libraries"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"},
+    {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"},
+]
+
+[[package]]
+name = "nvidia-cusolver-cu12"
+version = "11.4.5.107"
+description = "CUDA solver native runtime libraries"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"},
+    {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"},
+]
+
+[package.dependencies]
+nvidia-cublas-cu12 = "*"
+nvidia-cusparse-cu12 = "*"
+nvidia-nvjitlink-cu12 = "*"
+
+[[package]]
+name = "nvidia-cusparse-cu12"
+version = "12.1.0.106"
+description = "CUSPARSE native runtime libraries"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"},
+    {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"},
+]
+
+[package.dependencies]
+nvidia-nvjitlink-cu12 = "*"
+
+[[package]]
+name = "nvidia-nccl-cu12"
+version = "2.20.5"
+description = "NVIDIA Collective Communication Library (NCCL) Runtime"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"},
+    {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"},
+]
+
+[[package]]
+name = "nvidia-nvjitlink-cu12"
+version = "12.5.40"
+description = "Nvidia JIT LTO Library"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d9714f27c1d0f0895cd8915c07a87a1d0029a0aa36acaf9156952ec2a8a12189"},
+    {file = "nvidia_nvjitlink_cu12-12.5.40-py3-none-win_amd64.whl", hash = "sha256:c3401dc8543b52d3a8158007a0c1ab4e9c768fcbd24153a48c86972102197ddd"},
+]
+
+[[package]]
+name = "nvidia-nvtx-cu12"
+version = "12.1.105"
+description = "NVIDIA Tools Extension"
+optional = true
+python-versions = ">=3"
+files = [
+    {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"},
+    {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"},
 ]
 
 [[package]]
 name = "opentelemetry-api"
-version = "1.15.0"
+version = "1.25.0"
 description = "OpenTelemetry Python API"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_api-1.15.0-py3-none-any.whl", hash = "sha256:e6c2d2e42140fd396e96edf75a7ceb11073f4efb4db87565a431cc9d0f93f2e0"},
-    {file = "opentelemetry_api-1.15.0.tar.gz", hash = "sha256:79ab791b4aaad27acc3dc3ba01596db5b5aac2ef75c70622c6038051d6c2cded"},
+    {file = "opentelemetry_api-1.25.0-py3-none-any.whl", hash = "sha256:757fa1aa020a0f8fa139f8959e53dec2051cc26b832e76fa839a6d76ecefd737"},
+    {file = "opentelemetry_api-1.25.0.tar.gz", hash = "sha256:77c4985f62f2614e42ce77ee4c9da5fa5f0bc1e1821085e9a47533a9323ae869"},
 ]
 
 [package.dependencies]
 deprecated = ">=1.2.6"
-setuptools = ">=16.0"
+importlib-metadata = ">=6.0,<=7.1"
 
 [[package]]
 name = "opentelemetry-exporter-otlp"
-version = "1.15.0"
+version = "1.25.0"
 description = "OpenTelemetry Collector Exporters"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_exporter_otlp-1.15.0-py3-none-any.whl", hash = "sha256:79f22748b6a54808a0448093dfa189c8490e729f67c134d4c992533d9393b33e"},
-    {file = "opentelemetry_exporter_otlp-1.15.0.tar.gz", hash = "sha256:4f7c49751d9720e2e726e13b0bb958ccade4e29122c305d92c033da432c8d2c5"},
+    {file = "opentelemetry_exporter_otlp-1.25.0-py3-none-any.whl", hash = "sha256:d67a831757014a3bc3174e4cd629ae1493b7ba8d189e8a007003cacb9f1a6b60"},
+    {file = "opentelemetry_exporter_otlp-1.25.0.tar.gz", hash = "sha256:ce03199c1680a845f82e12c0a6a8f61036048c07ec7a0bd943142aca8fa6ced0"},
 ]
 
 [package.dependencies]
-opentelemetry-exporter-otlp-proto-grpc = "1.15.0"
-opentelemetry-exporter-otlp-proto-http = "1.15.0"
+opentelemetry-exporter-otlp-proto-grpc = "1.25.0"
+opentelemetry-exporter-otlp-proto-http = "1.25.0"
+
+[[package]]
+name = "opentelemetry-exporter-otlp-proto-common"
+version = "1.25.0"
+description = "OpenTelemetry Protobuf encoding"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "opentelemetry_exporter_otlp_proto_common-1.25.0-py3-none-any.whl", hash = "sha256:15637b7d580c2675f70246563363775b4e6de947871e01d0f4e3881d1848d693"},
+    {file = "opentelemetry_exporter_otlp_proto_common-1.25.0.tar.gz", hash = "sha256:c93f4e30da4eee02bacd1e004eb82ce4da143a2f8e15b987a9f603e0a85407d3"},
+]
+
+[package.dependencies]
+opentelemetry-proto = "1.25.0"
 
 [[package]]
 name = "opentelemetry-exporter-otlp-proto-grpc"
-version = "1.15.0"
+version = "1.25.0"
 description = "OpenTelemetry Collector Protobuf over gRPC Exporter"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_exporter_otlp_proto_grpc-1.15.0-py3-none-any.whl", hash = "sha256:c2a5492ba7d140109968135d641d06ce3c5bd73c50665f787526065d57d7fd1d"},
-    {file = "opentelemetry_exporter_otlp_proto_grpc-1.15.0.tar.gz", hash = "sha256:844f2a4bb9bcda34e4eb6fe36765e5031aacb36dc60ed88c90fc246942ea26e7"},
+    {file = "opentelemetry_exporter_otlp_proto_grpc-1.25.0-py3-none-any.whl", hash = "sha256:3131028f0c0a155a64c430ca600fd658e8e37043cb13209f0109db5c1a3e4eb4"},
+    {file = "opentelemetry_exporter_otlp_proto_grpc-1.25.0.tar.gz", hash = "sha256:c0b1661415acec5af87625587efa1ccab68b873745ca0ee96b69bb1042087eac"},
 ]
 
 [package.dependencies]
-backoff = {version = ">=1.10.0,<3.0.0", markers = "python_version >= \"3.7\""}
+deprecated = ">=1.2.6"
 googleapis-common-protos = ">=1.52,<2.0"
 grpcio = ">=1.0.0,<2.0.0"
-opentelemetry-api = ">=1.12,<2.0"
-opentelemetry-proto = "1.15.0"
-opentelemetry-sdk = ">=1.12,<2.0"
-
-[package.extras]
-test = ["pytest-grpc"]
+opentelemetry-api = ">=1.15,<2.0"
+opentelemetry-exporter-otlp-proto-common = "1.25.0"
+opentelemetry-proto = "1.25.0"
+opentelemetry-sdk = ">=1.25.0,<1.26.0"
 
 [[package]]
 name = "opentelemetry-exporter-otlp-proto-http"
-version = "1.15.0"
+version = "1.25.0"
 description = "OpenTelemetry Collector Protobuf over HTTP Exporter"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_exporter_otlp_proto_http-1.15.0-py3-none-any.whl", hash = "sha256:3ec2a02196c8a54bf5cbf7fe623a5238625638e83b6047a983bdf96e2bbb74c0"},
-    {file = "opentelemetry_exporter_otlp_proto_http-1.15.0.tar.gz", hash = "sha256:11b2c814249a49b22f6cca7a06b05701f561d577b747f3660dfd67b6eb9daf9c"},
+    {file = "opentelemetry_exporter_otlp_proto_http-1.25.0-py3-none-any.whl", hash = "sha256:2eca686ee11b27acd28198b3ea5e5863a53d1266b91cda47c839d95d5e0541a6"},
+    {file = "opentelemetry_exporter_otlp_proto_http-1.25.0.tar.gz", hash = "sha256:9f8723859e37c75183ea7afa73a3542f01d0fd274a5b97487ea24cb683d7d684"},
 ]
 
 [package.dependencies]
-backoff = {version = ">=1.10.0,<3.0.0", markers = "python_version >= \"3.7\""}
+deprecated = ">=1.2.6"
 googleapis-common-protos = ">=1.52,<2.0"
-opentelemetry-api = ">=1.12,<2.0"
-opentelemetry-proto = "1.15.0"
-opentelemetry-sdk = ">=1.12,<2.0"
+opentelemetry-api = ">=1.15,<2.0"
+opentelemetry-exporter-otlp-proto-common = "1.25.0"
+opentelemetry-proto = "1.25.0"
+opentelemetry-sdk = ">=1.25.0,<1.26.0"
 requests = ">=2.7,<3.0"
 
-[package.extras]
-test = ["responses (==0.22.0)"]
-
 [[package]]
 name = "opentelemetry-instrumentation"
-version = "0.36b0"
+version = "0.46b0"
 description = "Instrumentation Tools & Auto Instrumentation for OpenTelemetry Python"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_instrumentation-0.36b0-py3-none-any.whl", hash = "sha256:83ba4ae7d5292b5b33e0f851cc5c76d8f91196b9b3527800fc13855c33383ac2"},
-    {file = "opentelemetry_instrumentation-0.36b0.tar.gz", hash = "sha256:e3ddac9b3b93408ef26c8ecbf38f717042977e16381bb4cd329a5b4cf16998cf"},
+    {file = "opentelemetry_instrumentation-0.46b0-py3-none-any.whl", hash = "sha256:89cd721b9c18c014ca848ccd11181e6b3fd3f6c7669e35d59c48dc527408c18b"},
+    {file = "opentelemetry_instrumentation-0.46b0.tar.gz", hash = "sha256:974e0888fb2a1e01c38fbacc9483d024bb1132aad92d6d24e2e5543887a7adda"},
 ]
 
 [package.dependencies]
@@ -1175,35 +1673,33 @@ wrapt = ">=1.0.0,<2.0.0"
 
 [[package]]
 name = "opentelemetry-instrumentation-grpc"
-version = "0.36b0"
+version = "0.46b0"
 description = "OpenTelemetry gRPC instrumentation"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_instrumentation_grpc-0.36b0-py3-none-any.whl", hash = "sha256:eaa246ed2083c97b13bab2555cb9d170e8433230a31476c4cab8a17fa03380a4"},
-    {file = "opentelemetry_instrumentation_grpc-0.36b0.tar.gz", hash = "sha256:dc89447c9eb6ea868970f6c13b4ffdac182cdd5a41dd215a0f5393ca6375be55"},
+    {file = "opentelemetry_instrumentation_grpc-0.46b0-py3-none-any.whl", hash = "sha256:cccfb28db07c28849709f2dcf330237fae0fca9f86971bfce27b28bb9a8b0577"},
+    {file = "opentelemetry_instrumentation_grpc-0.46b0.tar.gz", hash = "sha256:9c5738592cf82672805099826b676d352324b54e03f9ac72a1368ba0605d6ff9"},
 ]
 
 [package.dependencies]
 opentelemetry-api = ">=1.12,<2.0"
-opentelemetry-instrumentation = "0.36b0"
-opentelemetry-sdk = ">=1.12,<2.0"
-opentelemetry-semantic-conventions = "0.36b0"
+opentelemetry-instrumentation = "0.46b0"
+opentelemetry-semantic-conventions = "0.46b0"
 wrapt = ">=1.0.0,<2.0.0"
 
 [package.extras]
 instruments = ["grpcio (>=1.27,<2.0)"]
-test = ["opentelemetry-instrumentation-grpc[instruments]", "opentelemetry-sdk (>=1.12,<2.0)", "opentelemetry-test-utils (==0.36b0)", "protobuf (>=3.13,<4.0)"]
 
 [[package]]
 name = "opentelemetry-proto"
-version = "1.15.0"
+version = "1.25.0"
 description = "OpenTelemetry Python Proto"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_proto-1.15.0-py3-none-any.whl", hash = "sha256:044b6d044b4d10530f250856f933442b8753a17f94ae37c207607f733fb9a844"},
-    {file = "opentelemetry_proto-1.15.0.tar.gz", hash = "sha256:9c4008e40ac8cab359daac283fbe7002c5c29c77ea2674ad5626a249e64e0101"},
+    {file = "opentelemetry_proto-1.25.0-py3-none-any.whl", hash = "sha256:f07e3341c78d835d9b86665903b199893befa5e98866f63d22b00d0b7ca4972f"},
+    {file = "opentelemetry_proto-1.25.0.tar.gz", hash = "sha256:35b6ef9dc4a9f7853ecc5006738ad40443701e52c26099e197895cbda8b815a3"},
 ]
 
 [package.dependencies]
@@ -1211,319 +1707,521 @@ protobuf = ">=3.19,<5.0"
 
 [[package]]
 name = "opentelemetry-sdk"
-version = "1.15.0"
+version = "1.25.0"
 description = "OpenTelemetry Python SDK"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_sdk-1.15.0-py3-none-any.whl", hash = "sha256:555c533e9837766119bbccc7a80458c9971d853a6f1da683a2246cd5e53b4645"},
-    {file = "opentelemetry_sdk-1.15.0.tar.gz", hash = "sha256:98dbffcfeebcbff12c0c974292d6ea603180a145904cf838b1fe4d5c99078425"},
+    {file = "opentelemetry_sdk-1.25.0-py3-none-any.whl", hash = "sha256:d97ff7ec4b351692e9d5a15af570c693b8715ad78b8aafbec5c7100fe966b4c9"},
+    {file = "opentelemetry_sdk-1.25.0.tar.gz", hash = "sha256:ce7fc319c57707ef5bf8b74fb9f8ebdb8bfafbe11898410e0d2a761d08a98ec7"},
 ]
 
 [package.dependencies]
-opentelemetry-api = "1.15.0"
-opentelemetry-semantic-conventions = "0.36b0"
-setuptools = ">=16.0"
+opentelemetry-api = "1.25.0"
+opentelemetry-semantic-conventions = "0.46b0"
 typing-extensions = ">=3.7.4"
 
 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.36b0"
+version = "0.46b0"
 description = "OpenTelemetry Semantic Conventions"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_semantic_conventions-0.36b0-py3-none-any.whl", hash = "sha256:adc05635e87b9d3e007c9f530eed487fc3ef2177d02f82f674f28ebf9aff8243"},
-    {file = "opentelemetry_semantic_conventions-0.36b0.tar.gz", hash = "sha256:829dc221795467d98b773c04096e29be038d77526dc8d6ac76f546fb6279bf01"},
+    {file = "opentelemetry_semantic_conventions-0.46b0-py3-none-any.whl", hash = "sha256:6daef4ef9fa51d51855d9f8e0ccd3a1bd59e0e545abe99ac6203804e36ab3e07"},
+    {file = "opentelemetry_semantic_conventions-0.46b0.tar.gz", hash = "sha256:fbc982ecbb6a6e90869b15c1673be90bd18c8a56ff1cffc0864e38e2edffaefa"},
 ]
 
+[package.dependencies]
+opentelemetry-api = "1.25.0"
+
+[[package]]
+name = "outlines"
+version = "0.0.34"
+description = "Probabilistic Generative Model Programming"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "outlines-0.0.34-py3-none-any.whl", hash = "sha256:911588a7e64a4f193b97fb4c501d98ccfd4e95a98f6a3ada67a280bf0c373c50"},
+    {file = "outlines-0.0.34.tar.gz", hash = "sha256:594e7204c770b47a62eb5c2ba7d25ea0ab2e16882b5f04556712a0228d3d3309"},
+]
+
+[package.dependencies]
+cloudpickle = "*"
+diskcache = "*"
+interegular = "*"
+jinja2 = "*"
+joblib = "*"
+jsonschema = "*"
+lark = "*"
+nest-asyncio = "*"
+numba = "*"
+numpy = "*"
+pydantic = ">=2.0"
+referencing = "*"
+requests = "*"
+scipy = "*"
+torch = ">=2.1.0"
+transformers = "*"
+
+[package.extras]
+serve = ["fastapi", "pydantic (>=2.0)", "ray (==2.9.0)", "uvicorn", "vllm (>=0.3.0)"]
+test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python (>=0.2.42)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"]
+
 [[package]]
 name = "packaging"
-version = "23.1"
+version = "24.0"
 description = "Core utilities for Python packages"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"},
-    {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
+    {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"},
+    {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"},
 ]
 
 [[package]]
 name = "pandas"
-version = "2.0.3"
+version = "2.2.2"
 description = "Powerful data structures for data analysis, time series, and statistics"
 optional = true
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 files = [
-    {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"},
-    {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"},
-    {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"},
-    {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"},
-    {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"},
-    {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"},
-    {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"},
-    {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"},
-    {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"},
-    {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"},
-    {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"},
-    {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"},
-    {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"},
-    {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"},
-    {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"},
-    {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"},
-    {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"},
-    {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"},
-    {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"},
-    {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"},
-    {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"},
-    {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"},
-    {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"},
-    {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"},
-    {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"},
+    {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"},
+    {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"},
+    {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"},
+    {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"},
+    {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"},
+    {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"},
+    {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"},
+    {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"},
+    {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"},
+    {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"},
+    {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"},
+    {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"},
+    {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"},
+    {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"},
+    {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"},
+    {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"},
+    {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"},
+    {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"},
+    {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"},
+    {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"},
+    {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"},
+    {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"},
+    {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"},
+    {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"},
+    {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"},
+    {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"},
+    {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"},
+    {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"},
+    {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"},
 ]
 
 [package.dependencies]
 numpy = [
-    {version = ">=1.20.3", markers = "python_version < \"3.10\""},
-    {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
-    {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
+    {version = ">=1.22.4", markers = "python_version < \"3.11\""},
+    {version = ">=1.23.2", markers = "python_version == \"3.11\""},
+    {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
 ]
 python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
-tzdata = ">=2022.1"
+tzdata = ">=2022.7"
 
 [package.extras]
-all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"]
-aws = ["s3fs (>=2021.08.0)"]
-clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"]
-compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"]
-computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"]
-excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"]
-feather = ["pyarrow (>=7.0.0)"]
-fss = ["fsspec (>=2021.07.0)"]
-gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"]
-hdf5 = ["tables (>=3.6.1)"]
-html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"]
-mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"]
-output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"]
-parquet = ["pyarrow (>=7.0.0)"]
-performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"]
-plot = ["matplotlib (>=3.6.1)"]
-postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"]
-spss = ["pyreadstat (>=1.1.2)"]
-sql-other = ["SQLAlchemy (>=1.4.16)"]
-test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
-xml = ["lxml (>=4.6.3)"]
+all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"]
+aws = ["s3fs (>=2022.11.0)"]
+clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"]
+compression = ["zstandard (>=0.19.0)"]
+computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"]
+consortium-standard = ["dataframe-api-compat (>=0.1.7)"]
+excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"]
+feather = ["pyarrow (>=10.0.1)"]
+fss = ["fsspec (>=2022.11.0)"]
+gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"]
+hdf5 = ["tables (>=3.8.0)"]
+html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"]
+mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"]
+output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"]
+parquet = ["pyarrow (>=10.0.1)"]
+performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"]
+plot = ["matplotlib (>=3.6.3)"]
+postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"]
+pyarrow = ["pyarrow (>=10.0.1)"]
+spss = ["pyreadstat (>=1.2.0)"]
+sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"]
+test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"]
+xml = ["lxml (>=4.9.2)"]
 
 [[package]]
 name = "peft"
-version = "0.4.0"
+version = "0.10.0"
 description = "Parameter-Efficient Fine-Tuning (PEFT)"
-optional = false
+optional = true
 python-versions = ">=3.8.0"
 files = [
-    {file = "peft-0.4.0-py3-none-any.whl", hash = "sha256:2cf992772a6d703814477e0bdcdadd68cb8ea388111ce2d793dd2ff0e438f357"},
-    {file = "peft-0.4.0.tar.gz", hash = "sha256:e768fa22d6e9f32aa7e891f0d06f355960278ca4dc0cdd96bff71f6f06269207"},
+    {file = "peft-0.10.0-py3-none-any.whl", hash = "sha256:d5249c97e818d3e31f92553c73c2953acd0ec12649b8b749afff7152cbc86cbb"},
+    {file = "peft-0.10.0.tar.gz", hash = "sha256:36a7628c15f88d37abb26cfc74c22468f9037ee02e9c9b65de943cfe7c672049"},
 ]
 
 [package.dependencies]
-accelerate = "*"
+accelerate = ">=0.21.0"
+huggingface-hub = ">=0.17.0"
 numpy = ">=1.17"
 packaging = ">=20.0"
 psutil = "*"
 pyyaml = "*"
 safetensors = "*"
 torch = ">=1.13.0"
+tqdm = "*"
 transformers = "*"
 
 [package.extras]
-dev = ["black (>=22.0,<23.0)", "hf-doc-builder", "ruff (>=0.0.241)", "urllib3 (<=2.0.0)"]
-docs-specific = ["hf-doc-builder"]
-quality = ["black (>=22.0,<23.0)", "ruff (>=0.0.241)", "urllib3 (<=2.0.0)"]
-test = ["black (>=22.0,<23.0)", "datasets", "diffusers", "hf-doc-builder", "parameterized", "pytest", "pytest-cov", "pytest-xdist", "ruff (>=0.0.241)", "urllib3 (<=2.0.0)"]
+dev = ["black", "hf-doc-builder", "ruff (>=0.2.1,<0.3.0)"]
+docs-specific = ["black", "hf-doc-builder"]
+quality = ["black", "hf-doc-builder", "ruff (>=0.2.1,<0.3.0)"]
+test = ["black", "datasets", "diffusers (<0.21.0)", "hf-doc-builder", "parameterized", "pytest", "pytest-cov", "pytest-xdist", "ruff (>=0.2.1,<0.3.0)", "scipy"]
 
 [[package]]
 name = "pillow"
-version = "10.0.0"
+version = "10.3.0"
 description = "Python Imaging Library (Fork)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "Pillow-10.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f62406a884ae75fb2f818694469519fb685cc7eaff05d3451a9ebe55c646891"},
-    {file = "Pillow-10.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d5db32e2a6ccbb3d34d87c87b432959e0db29755727afb37290e10f6e8e62614"},
-    {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edf4392b77bdc81f36e92d3a07a5cd072f90253197f4a52a55a8cec48a12483b"},
-    {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:520f2a520dc040512699f20fa1c363eed506e94248d71f85412b625026f6142c"},
-    {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:8c11160913e3dd06c8ffdb5f233a4f254cb449f4dfc0f8f4549eda9e542c93d1"},
-    {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a74ba0c356aaa3bb8e3eb79606a87669e7ec6444be352870623025d75a14a2bf"},
-    {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d5d0dae4cfd56969d23d94dc8e89fb6a217be461c69090768227beb8ed28c0a3"},
-    {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22c10cc517668d44b211717fd9775799ccec4124b9a7f7b3635fc5386e584992"},
-    {file = "Pillow-10.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:dffe31a7f47b603318c609f378ebcd57f1554a3a6a8effbc59c3c69f804296de"},
-    {file = "Pillow-10.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:9fb218c8a12e51d7ead2a7c9e101a04982237d4855716af2e9499306728fb485"},
-    {file = "Pillow-10.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d35e3c8d9b1268cbf5d3670285feb3528f6680420eafe35cccc686b73c1e330f"},
-    {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ed64f9ca2f0a95411e88a4efbd7a29e5ce2cea36072c53dd9d26d9c76f753b3"},
-    {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b6eb5502f45a60a3f411c63187db83a3d3107887ad0d036c13ce836f8a36f1d"},
-    {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c1fbe7621c167ecaa38ad29643d77a9ce7311583761abf7836e1510c580bf3dd"},
-    {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cd25d2a9d2b36fcb318882481367956d2cf91329f6892fe5d385c346c0649629"},
-    {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3b08d4cc24f471b2c8ca24ec060abf4bebc6b144cb89cba638c720546b1cf538"},
-    {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d737a602fbd82afd892ca746392401b634e278cb65d55c4b7a8f48e9ef8d008d"},
-    {file = "Pillow-10.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3a82c40d706d9aa9734289740ce26460a11aeec2d9c79b7af87bb35f0073c12f"},
-    {file = "Pillow-10.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:bc2ec7c7b5d66b8ec9ce9f720dbb5fa4bace0f545acd34870eff4a369b44bf37"},
-    {file = "Pillow-10.0.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:d80cf684b541685fccdd84c485b31ce73fc5c9b5d7523bf1394ce134a60c6883"},
-    {file = "Pillow-10.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76de421f9c326da8f43d690110f0e79fe3ad1e54be811545d7d91898b4c8493e"},
-    {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81ff539a12457809666fef6624684c008e00ff6bf455b4b89fd00a140eecd640"},
-    {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce543ed15570eedbb85df19b0a1a7314a9c8141a36ce089c0a894adbfccb4568"},
-    {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:685ac03cc4ed5ebc15ad5c23bc555d68a87777586d970c2c3e216619a5476223"},
-    {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d72e2ecc68a942e8cf9739619b7f408cc7b272b279b56b2c83c6123fcfa5cdff"},
-    {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d50b6aec14bc737742ca96e85d6d0a5f9bfbded018264b3b70ff9d8c33485551"},
-    {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:00e65f5e822decd501e374b0650146063fbb30a7264b4d2744bdd7b913e0cab5"},
-    {file = "Pillow-10.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:f31f9fdbfecb042d046f9d91270a0ba28368a723302786c0009ee9b9f1f60199"},
-    {file = "Pillow-10.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:1ce91b6ec08d866b14413d3f0bbdea7e24dfdc8e59f562bb77bc3fe60b6144ca"},
-    {file = "Pillow-10.0.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:349930d6e9c685c089284b013478d6f76e3a534e36ddfa912cde493f235372f3"},
-    {file = "Pillow-10.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3a684105f7c32488f7153905a4e3015a3b6c7182e106fe3c37fbb5ef3e6994c3"},
-    {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4f69b3700201b80bb82c3a97d5e9254084f6dd5fb5b16fc1a7b974260f89f43"},
-    {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f07ea8d2f827d7d2a49ecf1639ec02d75ffd1b88dcc5b3a61bbb37a8759ad8d"},
-    {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:040586f7d37b34547153fa383f7f9aed68b738992380ac911447bb78f2abe530"},
-    {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f88a0b92277de8e3ca715a0d79d68dc82807457dae3ab8699c758f07c20b3c51"},
-    {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c7cf14a27b0d6adfaebb3ae4153f1e516df54e47e42dcc073d7b3d76111a8d86"},
-    {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3400aae60685b06bb96f99a21e1ada7bc7a413d5f49bce739828ecd9391bb8f7"},
-    {file = "Pillow-10.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:dbc02381779d412145331789b40cc7b11fdf449e5d94f6bc0b080db0a56ea3f0"},
-    {file = "Pillow-10.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:9211e7ad69d7c9401cfc0e23d49b69ca65ddd898976d660a2fa5904e3d7a9baa"},
-    {file = "Pillow-10.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:faaf07ea35355b01a35cb442dd950d8f1bb5b040a7787791a535de13db15ed90"},
-    {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9f72a021fbb792ce98306ffb0c348b3c9cb967dce0f12a49aa4c3d3fdefa967"},
-    {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f7c16705f44e0504a3a2a14197c1f0b32a95731d251777dcb060aa83022cb2d"},
-    {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:76edb0a1fa2b4745fb0c99fb9fb98f8b180a1bbceb8be49b087e0b21867e77d3"},
-    {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:368ab3dfb5f49e312231b6f27b8820c823652b7cd29cfbd34090565a015e99ba"},
-    {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:608bfdee0d57cf297d32bcbb3c728dc1da0907519d1784962c5f0c68bb93e5a3"},
-    {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5c6e3df6bdd396749bafd45314871b3d0af81ff935b2d188385e970052091017"},
-    {file = "Pillow-10.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:7be600823e4c8631b74e4a0d38384c73f680e6105a7d3c6824fcf226c178c7e6"},
-    {file = "Pillow-10.0.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:92be919bbc9f7d09f7ae343c38f5bb21c973d2576c1d45600fce4b74bafa7ac0"},
-    {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8182b523b2289f7c415f589118228d30ac8c355baa2f3194ced084dac2dbba"},
-    {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:38250a349b6b390ee6047a62c086d3817ac69022c127f8a5dc058c31ccef17f3"},
-    {file = "Pillow-10.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:88af2003543cc40c80f6fca01411892ec52b11021b3dc22ec3bc9d5afd1c5334"},
-    {file = "Pillow-10.0.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c189af0545965fa8d3b9613cfdb0cd37f9d71349e0f7750e1fd704648d475ed2"},
-    {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce7b031a6fc11365970e6a5686d7ba8c63e4c1cf1ea143811acbb524295eabed"},
-    {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:db24668940f82321e746773a4bc617bfac06ec831e5c88b643f91f122a785684"},
-    {file = "Pillow-10.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:efe8c0681042536e0d06c11f48cebe759707c9e9abf880ee213541c5b46c5bf3"},
-    {file = "Pillow-10.0.0.tar.gz", hash = "sha256:9c82b5b3e043c7af0d95792d0d20ccf68f61a1fec6b3530e718b688422727396"},
+    {file = "pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:90b9e29824800e90c84e4022dd5cc16eb2d9605ee13f05d47641eb183cd73d45"},
+    {file = "pillow-10.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2c405445c79c3f5a124573a051062300936b0281fee57637e706453e452746c"},
+    {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78618cdbccaa74d3f88d0ad6cb8ac3007f1a6fa5c6f19af64b55ca170bfa1edf"},
+    {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261ddb7ca91fcf71757979534fb4c128448b5b4c55cb6152d280312062f69599"},
+    {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ce49c67f4ea0609933d01c0731b34b8695a7a748d6c8d186f95e7d085d2fe475"},
+    {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf"},
+    {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d33891be6df59d93df4d846640f0e46f1a807339f09e79a8040bc887bdcd7ed3"},
+    {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b50811d664d392f02f7761621303eba9d1b056fb1868c8cdf4231279645c25f5"},
+    {file = "pillow-10.3.0-cp310-cp310-win32.whl", hash = "sha256:ca2870d5d10d8726a27396d3ca4cf7976cec0f3cb706debe88e3a5bd4610f7d2"},
+    {file = "pillow-10.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:f0d0591a0aeaefdaf9a5e545e7485f89910c977087e7de2b6c388aec32011e9f"},
+    {file = "pillow-10.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:ccce24b7ad89adb5a1e34a6ba96ac2530046763912806ad4c247356a8f33a67b"},
+    {file = "pillow-10.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:5f77cf66e96ae734717d341c145c5949c63180842a545c47a0ce7ae52ca83795"},
+    {file = "pillow-10.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4b878386c4bf293578b48fc570b84ecfe477d3b77ba39a6e87150af77f40c57"},
+    {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27"},
+    {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9797a6c8fe16f25749b371c02e2ade0efb51155e767a971c61734b1bf6293994"},
+    {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9e91179a242bbc99be65e139e30690e081fe6cb91a8e77faf4c409653de39451"},
+    {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b87bd9d81d179bd8ab871603bd80d8645729939f90b71e62914e816a76fc6bd"},
+    {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:81d09caa7b27ef4e61cb7d8fbf1714f5aec1c6b6c5270ee53504981e6e9121ad"},
+    {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c"},
+    {file = "pillow-10.3.0-cp311-cp311-win32.whl", hash = "sha256:7161ec49ef0800947dc5570f86568a7bb36fa97dd09e9827dc02b718c5643f09"},
+    {file = "pillow-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:8eb0908e954d093b02a543dc963984d6e99ad2b5e36503d8a0aaf040505f747d"},
+    {file = "pillow-10.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e6f7d1c414191c1199f8996d3f2282b9ebea0945693fb67392c75a3a320941f"},
+    {file = "pillow-10.3.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:e46f38133e5a060d46bd630faa4d9fa0202377495df1f068a8299fd78c84de84"},
+    {file = "pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:50b8eae8f7334ec826d6eeffaeeb00e36b5e24aa0b9df322c247539714c6df19"},
+    {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d3bea1c75f8c53ee4d505c3e67d8c158ad4df0d83170605b50b64025917f338"},
+    {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19aeb96d43902f0a783946a0a87dbdad5c84c936025b8419da0a0cd7724356b1"},
+    {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74d28c17412d9caa1066f7a31df8403ec23d5268ba46cd0ad2c50fb82ae40462"},
+    {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a"},
+    {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d886f5d353333b4771d21267c7ecc75b710f1a73d72d03ca06df49b09015a9ef"},
+    {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b5ec25d8b17217d635f8935dbc1b9aa5907962fae29dff220f2659487891cd3"},
+    {file = "pillow-10.3.0-cp312-cp312-win32.whl", hash = "sha256:51243f1ed5161b9945011a7360e997729776f6e5d7005ba0c6879267d4c5139d"},
+    {file = "pillow-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:412444afb8c4c7a6cc11a47dade32982439925537e483be7c0ae0cf96c4f6a0b"},
+    {file = "pillow-10.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:798232c92e7665fe82ac085f9d8e8ca98826f8e27859d9a96b41d519ecd2e49a"},
+    {file = "pillow-10.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:4eaa22f0d22b1a7e93ff0a596d57fdede2e550aecffb5a1ef1106aaece48e96b"},
+    {file = "pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd5e14fbf22a87321b24c88669aad3a51ec052eb145315b3da3b7e3cc105b9a2"},
+    {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1530e8f3a4b965eb6a7785cf17a426c779333eb62c9a7d1bbcf3ffd5bf77a4aa"},
+    {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d512aafa1d32efa014fa041d38868fda85028e3f930a96f85d49c7d8ddc0383"},
+    {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:339894035d0ede518b16073bdc2feef4c991ee991a29774b33e515f1d308e08d"},
+    {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:aa7e402ce11f0885305bfb6afb3434b3cd8f53b563ac065452d9d5654c7b86fd"},
+    {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0ea2a783a2bdf2a561808fe4a7a12e9aa3799b701ba305de596bc48b8bdfce9d"},
+    {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c78e1b00a87ce43bb37642c0812315b411e856a905d58d597750eb79802aaaa3"},
+    {file = "pillow-10.3.0-cp38-cp38-win32.whl", hash = "sha256:72d622d262e463dfb7595202d229f5f3ab4b852289a1cd09650362db23b9eb0b"},
+    {file = "pillow-10.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:2034f6759a722da3a3dbd91a81148cf884e91d1b747992ca288ab88c1de15999"},
+    {file = "pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2ed854e716a89b1afcedea551cd85f2eb2a807613752ab997b9974aaa0d56936"},
+    {file = "pillow-10.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dc1a390a82755a8c26c9964d457d4c9cbec5405896cba94cf51f36ea0d855002"},
+    {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4203efca580f0dd6f882ca211f923168548f7ba334c189e9eab1178ab840bf60"},
+    {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3102045a10945173d38336f6e71a8dc71bcaeed55c3123ad4af82c52807b9375"},
+    {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6fb1b30043271ec92dc65f6d9f0b7a830c210b8a96423074b15c7bc999975f57"},
+    {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1dfc94946bc60ea375cc39cff0b8da6c7e5f8fcdc1d946beb8da5c216156ddd8"},
+    {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b09b86b27a064c9624d0a6c54da01c1beaf5b6cadfa609cf63789b1d08a797b9"},
+    {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3b2348a78bc939b4fed6552abfd2e7988e0f81443ef3911a4b8498ca084f6eb"},
+    {file = "pillow-10.3.0-cp39-cp39-win32.whl", hash = "sha256:45ebc7b45406febf07fef35d856f0293a92e7417ae7933207e90bf9090b70572"},
+    {file = "pillow-10.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:0ba26351b137ca4e0db0342d5d00d2e355eb29372c05afd544ebf47c0956ffeb"},
+    {file = "pillow-10.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:50fd3f6b26e3441ae07b7c979309638b72abc1a25da31a81a7fbd9495713ef4f"},
+    {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6b02471b72526ab8a18c39cb7967b72d194ec53c1fd0a70b050565a0f366d355"},
+    {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8ab74c06ffdab957d7670c2a5a6e1a70181cd10b727cd788c4dd9005b6a8acd9"},
+    {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2"},
+    {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ec1e921fd07c7cda7962bad283acc2f2a9ccc1b971ee4b216b75fad6f0463"},
+    {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c8e73e99da7db1b4cad7f8d682cf6abad7844da39834c288fbfa394a47bbced"},
+    {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:16563993329b79513f59142a6b02055e10514c1a8e86dca8b48a893e33cf91e3"},
+    {file = "pillow-10.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd78700f5788ae180b5ee8902c6aea5a5726bac7c364b202b4b3e3ba2d293170"},
+    {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:aff76a55a8aa8364d25400a210a65ff59d0168e0b4285ba6bf2bd83cf675ba32"},
+    {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b7bc2176354defba3edc2b9a777744462da2f8e921fbaf61e52acb95bafa9828"},
+    {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:793b4e24db2e8742ca6423d3fde8396db336698c55cd34b660663ee9e45ed37f"},
+    {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93480005693d247f8346bc8ee28c72a2191bdf1f6b5db469c096c0c867ac015"},
+    {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83341b89884e2b2e55886e8fbbf37c3fa5efd6c8907124aeb72f285ae5696e5"},
+    {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1a1d1915db1a4fdb2754b9de292642a39a7fb28f1736699527bb649484fb966a"},
+    {file = "pillow-10.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a0eaa93d054751ee9964afa21c06247779b90440ca41d184aeb5d410f20ff591"},
+    {file = "pillow-10.3.0.tar.gz", hash = "sha256:9d2455fbf44c914840c793e89aa82d0e1763a14253a000743719ae5946814b2d"},
 ]
 
 [package.extras]
 docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
+fpx = ["olefile"]
+mic = ["olefile"]
 tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
+typing = ["typing-extensions"]
+xmp = ["defusedxml"]
 
 [[package]]
 name = "pluggy"
-version = "1.3.0"
+version = "1.5.0"
 description = "plugin and hook calling mechanisms for python"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"},
-    {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"},
+    {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
+    {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
 ]
 
 [package.extras]
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "prometheus-client"
+version = "0.20.0"
+description = "Python client for the Prometheus monitoring system."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "prometheus_client-0.20.0-py3-none-any.whl", hash = "sha256:cde524a85bce83ca359cc837f28b8c0db5cac7aa653a588fd7e84ba061c329e7"},
+    {file = "prometheus_client-0.20.0.tar.gz", hash = "sha256:287629d00b147a32dcb2be0b9df905da599b2d82f80377083ec8463309a4bb89"},
+]
+
+[package.extras]
+twisted = ["twisted"]
+
 [[package]]
 name = "protobuf"
-version = "4.24.2"
+version = "4.25.3"
 description = ""
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "protobuf-4.24.2-cp310-abi3-win32.whl", hash = "sha256:58e12d2c1aa428ece2281cef09bbaa6938b083bcda606db3da4e02e991a0d924"},
-    {file = "protobuf-4.24.2-cp310-abi3-win_amd64.whl", hash = "sha256:77700b55ba41144fc64828e02afb41901b42497b8217b558e4a001f18a85f2e3"},
-    {file = "protobuf-4.24.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:237b9a50bd3b7307d0d834c1b0eb1a6cd47d3f4c2da840802cd03ea288ae8880"},
-    {file = "protobuf-4.24.2-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:25ae91d21e3ce8d874211110c2f7edd6384816fb44e06b2867afe35139e1fd1c"},
-    {file = "protobuf-4.24.2-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:c00c3c7eb9ad3833806e21e86dca448f46035242a680f81c3fe068ff65e79c74"},
-    {file = "protobuf-4.24.2-cp37-cp37m-win32.whl", hash = "sha256:4e69965e7e54de4db989289a9b971a099e626f6167a9351e9d112221fc691bc1"},
-    {file = "protobuf-4.24.2-cp37-cp37m-win_amd64.whl", hash = "sha256:c5cdd486af081bf752225b26809d2d0a85e575b80a84cde5172a05bbb1990099"},
-    {file = "protobuf-4.24.2-cp38-cp38-win32.whl", hash = "sha256:6bd26c1fa9038b26c5c044ee77e0ecb18463e957fefbaeb81a3feb419313a54e"},
-    {file = "protobuf-4.24.2-cp38-cp38-win_amd64.whl", hash = "sha256:bb7aa97c252279da65584af0456f802bd4b2de429eb945bbc9b3d61a42a8cd16"},
-    {file = "protobuf-4.24.2-cp39-cp39-win32.whl", hash = "sha256:2b23bd6e06445699b12f525f3e92a916f2dcf45ffba441026357dea7fa46f42b"},
-    {file = "protobuf-4.24.2-cp39-cp39-win_amd64.whl", hash = "sha256:839952e759fc40b5d46be319a265cf94920174d88de31657d5622b5d8d6be5cd"},
-    {file = "protobuf-4.24.2-py3-none-any.whl", hash = "sha256:3b7b170d3491ceed33f723bbf2d5a260f8a4e23843799a3906f16ef736ef251e"},
-    {file = "protobuf-4.24.2.tar.gz", hash = "sha256:7fda70797ddec31ddfa3576cbdcc3ddbb6b3078b737a1a87ab9136af0570cd6e"},
+    {file = "protobuf-4.25.3-cp310-abi3-win32.whl", hash = "sha256:d4198877797a83cbfe9bffa3803602bbe1625dc30d8a097365dbc762e5790faa"},
+    {file = "protobuf-4.25.3-cp310-abi3-win_amd64.whl", hash = "sha256:209ba4cc916bab46f64e56b85b090607a676f66b473e6b762e6f1d9d591eb2e8"},
+    {file = "protobuf-4.25.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f1279ab38ecbfae7e456a108c5c0681e4956d5b1090027c1de0f934dfdb4b35c"},
+    {file = "protobuf-4.25.3-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:e7cb0ae90dd83727f0c0718634ed56837bfeeee29a5f82a7514c03ee1364c019"},
+    {file = "protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:7c8daa26095f82482307bc717364e7c13f4f1c99659be82890dcfc215194554d"},
+    {file = "protobuf-4.25.3-cp38-cp38-win32.whl", hash = "sha256:f4f118245c4a087776e0a8408be33cf09f6c547442c00395fbfb116fac2f8ac2"},
+    {file = "protobuf-4.25.3-cp38-cp38-win_amd64.whl", hash = "sha256:c053062984e61144385022e53678fbded7aea14ebb3e0305ae3592fb219ccfa4"},
+    {file = "protobuf-4.25.3-cp39-cp39-win32.whl", hash = "sha256:19b270aeaa0099f16d3ca02628546b8baefe2955bbe23224aaf856134eccf1e4"},
+    {file = "protobuf-4.25.3-cp39-cp39-win_amd64.whl", hash = "sha256:e3c97a1555fd6388f857770ff8b9703083de6bf1f9274a002a332d65fbb56c8c"},
+    {file = "protobuf-4.25.3-py3-none-any.whl", hash = "sha256:f0700d54bcf45424477e46a9f0944155b46fb0639d69728739c0e47bab83f2b9"},
+    {file = "protobuf-4.25.3.tar.gz", hash = "sha256:25b5d0b42fd000320bd7830b349e3b696435f3b329810427a6bcce6a5492cc5c"},
 ]
 
 [[package]]
 name = "psutil"
-version = "5.9.5"
+version = "5.9.8"
 description = "Cross-platform lib for process and system monitoring in Python."
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+optional = true
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
 files = [
-    {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"},
-    {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"},
-    {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"},
-    {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"},
-    {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"},
-    {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"},
-    {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"},
-    {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"},
-    {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"},
-    {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"},
-    {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"},
-    {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"},
-    {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"},
-    {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"},
+    {file = "psutil-5.9.8-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:26bd09967ae00920df88e0352a91cff1a78f8d69b3ecabbfe733610c0af486c8"},
+    {file = "psutil-5.9.8-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:05806de88103b25903dff19bb6692bd2e714ccf9e668d050d144012055cbca73"},
+    {file = "psutil-5.9.8-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:611052c4bc70432ec770d5d54f64206aa7203a101ec273a0cd82418c86503bb7"},
+    {file = "psutil-5.9.8-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:50187900d73c1381ba1454cf40308c2bf6f34268518b3f36a9b663ca87e65e36"},
+    {file = "psutil-5.9.8-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:02615ed8c5ea222323408ceba16c60e99c3f91639b07da6373fb7e6539abc56d"},
+    {file = "psutil-5.9.8-cp27-none-win32.whl", hash = "sha256:36f435891adb138ed3c9e58c6af3e2e6ca9ac2f365efe1f9cfef2794e6c93b4e"},
+    {file = "psutil-5.9.8-cp27-none-win_amd64.whl", hash = "sha256:bd1184ceb3f87651a67b2708d4c3338e9b10c5df903f2e3776b62303b26cb631"},
+    {file = "psutil-5.9.8-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81"},
+    {file = "psutil-5.9.8-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421"},
+    {file = "psutil-5.9.8-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4"},
+    {file = "psutil-5.9.8-cp36-cp36m-win32.whl", hash = "sha256:7d79560ad97af658a0f6adfef8b834b53f64746d45b403f225b85c5c2c140eee"},
+    {file = "psutil-5.9.8-cp36-cp36m-win_amd64.whl", hash = "sha256:27cc40c3493bb10de1be4b3f07cae4c010ce715290a5be22b98493509c6299e2"},
+    {file = "psutil-5.9.8-cp37-abi3-win32.whl", hash = "sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0"},
+    {file = "psutil-5.9.8-cp37-abi3-win_amd64.whl", hash = "sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf"},
+    {file = "psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8"},
+    {file = "psutil-5.9.8.tar.gz", hash = "sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c"},
 ]
 
 [package.extras]
 test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
 
+[[package]]
+name = "py-cpuinfo"
+version = "9.0.0"
+description = "Get CPU info with pure Python"
+optional = false
+python-versions = "*"
+files = [
+    {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"},
+    {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"},
+]
+
 [[package]]
 name = "pyarrow"
-version = "13.0.0"
+version = "16.1.0"
 description = "Python library for Apache Arrow"
 optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "pyarrow-13.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:1afcc2c33f31f6fb25c92d50a86b7a9f076d38acbcb6f9e74349636109550148"},
-    {file = "pyarrow-13.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:70fa38cdc66b2fc1349a082987f2b499d51d072faaa6b600f71931150de2e0e3"},
-    {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd57b13a6466822498238877892a9b287b0a58c2e81e4bdb0b596dbb151cbb73"},
-    {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8ce69f7bf01de2e2764e14df45b8404fc6f1a5ed9871e8e08a12169f87b7a26"},
-    {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:588f0d2da6cf1b1680974d63be09a6530fd1bd825dc87f76e162404779a157dc"},
-    {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6241afd72b628787b4abea39e238e3ff9f34165273fad306c7acf780dd850956"},
-    {file = "pyarrow-13.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:fda7857e35993673fcda603c07d43889fca60a5b254052a462653f8656c64f44"},
-    {file = "pyarrow-13.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:aac0ae0146a9bfa5e12d87dda89d9ef7c57a96210b899459fc2f785303dcbb67"},
-    {file = "pyarrow-13.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d7759994217c86c161c6a8060509cfdf782b952163569606bb373828afdd82e8"},
-    {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:868a073fd0ff6468ae7d869b5fc1f54de5c4255b37f44fb890385eb68b68f95d"},
-    {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51be67e29f3cfcde263a113c28e96aa04362ed8229cb7c6e5f5c719003659d33"},
-    {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d1b4e7176443d12610874bb84d0060bf080f000ea9ed7c84b2801df851320295"},
-    {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:69b6f9a089d116a82c3ed819eea8fe67dae6105f0d81eaf0fdd5e60d0c6e0944"},
-    {file = "pyarrow-13.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ab1268db81aeb241200e321e220e7cd769762f386f92f61b898352dd27e402ce"},
-    {file = "pyarrow-13.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:ee7490f0f3f16a6c38f8c680949551053c8194e68de5046e6c288e396dccee80"},
-    {file = "pyarrow-13.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e3ad79455c197a36eefbd90ad4aa832bece7f830a64396c15c61a0985e337287"},
-    {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68fcd2dc1b7d9310b29a15949cdd0cb9bc34b6de767aff979ebf546020bf0ba0"},
-    {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc6fd330fd574c51d10638e63c0d00ab456498fc804c9d01f2a61b9264f2c5b2"},
-    {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:e66442e084979a97bb66939e18f7b8709e4ac5f887e636aba29486ffbf373763"},
-    {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:0f6eff839a9e40e9c5610d3ff8c5bdd2f10303408312caf4c8003285d0b49565"},
-    {file = "pyarrow-13.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b30a27f1cddf5c6efcb67e598d7823a1e253d743d92ac32ec1eb4b6a1417867"},
-    {file = "pyarrow-13.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:09552dad5cf3de2dc0aba1c7c4b470754c69bd821f5faafc3d774bedc3b04bb7"},
-    {file = "pyarrow-13.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3896ae6c205d73ad192d2fc1489cd0edfab9f12867c85b4c277af4d37383c18c"},
-    {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6647444b21cb5e68b593b970b2a9a07748dd74ea457c7dadaa15fd469c48ada1"},
-    {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47663efc9c395e31d09c6aacfa860f4473815ad6804311c5433f7085415d62a7"},
-    {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:b9ba6b6d34bd2563345488cf444510588ea42ad5613df3b3509f48eb80250afd"},
-    {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:d00d374a5625beeb448a7fa23060df79adb596074beb3ddc1838adb647b6ef09"},
-    {file = "pyarrow-13.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:c51afd87c35c8331b56f796eff954b9c7f8d4b7fef5903daf4e05fcf017d23a8"},
-    {file = "pyarrow-13.0.0.tar.gz", hash = "sha256:83333726e83ed44b0ac94d8d7a21bbdee4a05029c3b1e8db58a863eec8fd8a33"},
+    {file = "pyarrow-16.1.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:17e23b9a65a70cc733d8b738baa6ad3722298fa0c81d88f63ff94bf25eaa77b9"},
+    {file = "pyarrow-16.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4740cc41e2ba5d641071d0ab5e9ef9b5e6e8c7611351a5cb7c1d175eaf43674a"},
+    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98100e0268d04e0eec47b73f20b39c45b4006f3c4233719c3848aa27a03c1aef"},
+    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f68f409e7b283c085f2da014f9ef81e885d90dcd733bd648cfba3ef265961848"},
+    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a8914cd176f448e09746037b0c6b3a9d7688cef451ec5735094055116857580c"},
+    {file = "pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:48be160782c0556156d91adbdd5a4a7e719f8d407cb46ae3bb4eaee09b3111bd"},
+    {file = "pyarrow-16.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9cf389d444b0f41d9fe1444b70650fea31e9d52cfcb5f818b7888b91b586efff"},
+    {file = "pyarrow-16.1.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:d0ebea336b535b37eee9eee31761813086d33ed06de9ab6fc6aaa0bace7b250c"},
+    {file = "pyarrow-16.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e73cfc4a99e796727919c5541c65bb88b973377501e39b9842ea71401ca6c1c"},
+    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9251264247ecfe93e5f5a0cd43b8ae834f1e61d1abca22da55b20c788417f6"},
+    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddf5aace92d520d3d2a20031d8b0ec27b4395cab9f74e07cc95edf42a5cc0147"},
+    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:25233642583bf658f629eb230b9bb79d9af4d9f9229890b3c878699c82f7d11e"},
+    {file = "pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a33a64576fddfbec0a44112eaf844c20853647ca833e9a647bfae0582b2ff94b"},
+    {file = "pyarrow-16.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:185d121b50836379fe012753cf15c4ba9638bda9645183ab36246923875f8d1b"},
+    {file = "pyarrow-16.1.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:2e51ca1d6ed7f2e9d5c3c83decf27b0d17bb207a7dea986e8dc3e24f80ff7d6f"},
+    {file = "pyarrow-16.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:06ebccb6f8cb7357de85f60d5da50e83507954af617d7b05f48af1621d331c9a"},
+    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b04707f1979815f5e49824ce52d1dceb46e2f12909a48a6a753fe7cafbc44a0c"},
+    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d32000693deff8dc5df444b032b5985a48592c0697cb6e3071a5d59888714e2"},
+    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8785bb10d5d6fd5e15d718ee1d1f914fe768bf8b4d1e5e9bf253de8a26cb1628"},
+    {file = "pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:e1369af39587b794873b8a307cc6623a3b1194e69399af0efd05bb202195a5a7"},
+    {file = "pyarrow-16.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:febde33305f1498f6df85e8020bca496d0e9ebf2093bab9e0f65e2b4ae2b3444"},
+    {file = "pyarrow-16.1.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:b5f5705ab977947a43ac83b52ade3b881eb6e95fcc02d76f501d549a210ba77f"},
+    {file = "pyarrow-16.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0d27bf89dfc2576f6206e9cd6cf7a107c9c06dc13d53bbc25b0bd4556f19cf5f"},
+    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d07de3ee730647a600037bc1d7b7994067ed64d0eba797ac74b2bc77384f4c2"},
+    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbef391b63f708e103df99fbaa3acf9f671d77a183a07546ba2f2c297b361e83"},
+    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19741c4dbbbc986d38856ee7ddfdd6a00fc3b0fc2d928795b95410d38bb97d15"},
+    {file = "pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f2c5fb249caa17b94e2b9278b36a05ce03d3180e6da0c4c3b3ce5b2788f30eed"},
+    {file = "pyarrow-16.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:e6b6d3cd35fbb93b70ade1336022cc1147b95ec6af7d36906ca7fe432eb09710"},
+    {file = "pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:18da9b76a36a954665ccca8aa6bd9f46c1145f79c0bb8f4f244f5f8e799bca55"},
+    {file = "pyarrow-16.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:99f7549779b6e434467d2aa43ab2b7224dd9e41bdde486020bae198978c9e05e"},
+    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f07fdffe4fd5b15f5ec15c8b64584868d063bc22b86b46c9695624ca3505b7b4"},
+    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfe389a08ea374972bd4065d5f25d14e36b43ebc22fc75f7b951f24378bf0b5"},
+    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b20bd67c94b3a2ea0a749d2a5712fc845a69cb5d52e78e6449bbd295611f3aa"},
+    {file = "pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:ba8ac20693c0bb0bf4b238751d4409e62852004a8cf031c73b0e0962b03e45e3"},
+    {file = "pyarrow-16.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:31a1851751433d89a986616015841977e0a188662fcffd1a5677453f1df2de0a"},
+    {file = "pyarrow-16.1.0.tar.gz", hash = "sha256:15fbb22ea96d11f0b5768504a3f961edab25eaf4197c341720c4a387f6c60315"},
 ]
 
 [package.dependencies]
 numpy = ">=1.16.6"
 
+[[package]]
+name = "pydantic"
+version = "2.7.3"
+description = "Data validation using Python type hints"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic-2.7.3-py3-none-any.whl", hash = "sha256:ea91b002777bf643bb20dd717c028ec43216b24a6001a280f83877fd2655d0b4"},
+    {file = "pydantic-2.7.3.tar.gz", hash = "sha256:c46c76a40bb1296728d7a8b99aa73dd70a48c3510111ff290034f860c99c419e"},
+]
+
+[package.dependencies]
+annotated-types = ">=0.4.0"
+pydantic-core = "2.18.4"
+typing-extensions = ">=4.6.1"
+
+[package.extras]
+email = ["email-validator (>=2.0.0)"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.18.4"
+description = "Core functionality for Pydantic validation and serialization"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"},
+    {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"},
+    {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"},
+    {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"},
+    {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"},
+    {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"},
+    {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"},
+    {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"},
+    {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"},
+    {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"},
+    {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"},
+    {file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"},
+    {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"},
+    {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"},
+    {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"},
+    {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"},
+    {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"},
+    {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"},
+    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"},
+    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"},
+    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"},
+    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"},
+    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"},
+    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"},
+    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"},
+    {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"},
+    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"},
+    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"},
+    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"},
+    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"},
+    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"},
+    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"},
+    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"},
+    {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"},
+    {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"},
+]
+
+[package.dependencies]
+typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
+
 [[package]]
 name = "pytest"
-version = "7.4.0"
+version = "7.4.4"
 description = "pytest: simple powerful testing with Python"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"},
-    {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"},
+    {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
+    {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
 ]
 
 [package.dependencies]
@@ -1539,13 +2237,13 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no
 
 [[package]]
 name = "python-dateutil"
-version = "2.8.2"
+version = "2.9.0.post0"
 description = "Extensions to the standard Python datetime module"
 optional = true
 python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
 files = [
-    {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
-    {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"},
+    {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
+    {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
 ]
 
 [package.dependencies]
@@ -1553,13 +2251,13 @@ six = ">=1.5"
 
 [[package]]
 name = "pytz"
-version = "2023.3"
+version = "2024.1"
 description = "World timezone definitions, modern and historical"
 optional = true
 python-versions = "*"
 files = [
-    {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"},
-    {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"},
+    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
+    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
 ]
 
 [[package]]
@@ -1587,6 +2285,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -1621,112 +2320,118 @@ files = [
     {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
 ]
 
+[[package]]
+name = "referencing"
+version = "0.35.1"
+description = "JSON Referencing + Python"
+optional = true
+python-versions = ">=3.8"
+files = [
+    {file = "referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de"},
+    {file = "referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c"},
+]
+
+[package.dependencies]
+attrs = ">=22.2.0"
+rpds-py = ">=0.7.0"
+
 [[package]]
 name = "regex"
-version = "2023.8.8"
+version = "2024.5.15"
 description = "Alternative regular expression module, to replace re."
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.8"
 files = [
-    {file = "regex-2023.8.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:88900f521c645f784260a8d346e12a1590f79e96403971241e64c3a265c8ecdb"},
-    {file = "regex-2023.8.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3611576aff55918af2697410ff0293d6071b7e00f4b09e005d614686ac4cd57c"},
-    {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8a0ccc8f2698f120e9e5742f4b38dc944c38744d4bdfc427616f3a163dd9de5"},
-    {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c662a4cbdd6280ee56f841f14620787215a171c4e2d1744c9528bed8f5816c96"},
-    {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cf0633e4a1b667bfe0bb10b5e53fe0d5f34a6243ea2530eb342491f1adf4f739"},
-    {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:551ad543fa19e94943c5b2cebc54c73353ffff08228ee5f3376bd27b3d5b9800"},
-    {file = "regex-2023.8.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54de2619f5ea58474f2ac211ceea6b615af2d7e4306220d4f3fe690c91988a61"},
-    {file = "regex-2023.8.8-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5ec4b3f0aebbbe2fc0134ee30a791af522a92ad9f164858805a77442d7d18570"},
-    {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3ae646c35cb9f820491760ac62c25b6d6b496757fda2d51be429e0e7b67ae0ab"},
-    {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca339088839582d01654e6f83a637a4b8194d0960477b9769d2ff2cfa0fa36d2"},
-    {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:d9b6627408021452dcd0d2cdf8da0534e19d93d070bfa8b6b4176f99711e7f90"},
-    {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:bd3366aceedf274f765a3a4bc95d6cd97b130d1dda524d8f25225d14123c01db"},
-    {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7aed90a72fc3654fba9bc4b7f851571dcc368120432ad68b226bd593f3f6c0b7"},
-    {file = "regex-2023.8.8-cp310-cp310-win32.whl", hash = "sha256:80b80b889cb767cc47f31d2b2f3dec2db8126fbcd0cff31b3925b4dc6609dcdb"},
-    {file = "regex-2023.8.8-cp310-cp310-win_amd64.whl", hash = "sha256:b82edc98d107cbc7357da7a5a695901b47d6eb0420e587256ba3ad24b80b7d0b"},
-    {file = "regex-2023.8.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1e7d84d64c84ad97bf06f3c8cb5e48941f135ace28f450d86af6b6512f1c9a71"},
-    {file = "regex-2023.8.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ce0f9fbe7d295f9922c0424a3637b88c6c472b75eafeaff6f910494a1fa719ef"},
-    {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06c57e14ac723b04458df5956cfb7e2d9caa6e9d353c0b4c7d5d54fcb1325c46"},
-    {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7a9aaa5a1267125eef22cef3b63484c3241aaec6f48949b366d26c7250e0357"},
-    {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b7408511fca48a82a119d78a77c2f5eb1b22fe88b0d2450ed0756d194fe7a9a"},
-    {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14dc6f2d88192a67d708341f3085df6a4f5a0c7b03dec08d763ca2cd86e9f559"},
-    {file = "regex-2023.8.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48c640b99213643d141550326f34f0502fedb1798adb3c9eb79650b1ecb2f177"},
-    {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0085da0f6c6393428bf0d9c08d8b1874d805bb55e17cb1dfa5ddb7cfb11140bf"},
-    {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:964b16dcc10c79a4a2be9f1273fcc2684a9eedb3906439720598029a797b46e6"},
-    {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7ce606c14bb195b0e5108544b540e2c5faed6843367e4ab3deb5c6aa5e681208"},
-    {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:40f029d73b10fac448c73d6eb33d57b34607f40116e9f6e9f0d32e9229b147d7"},
-    {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3b8e6ea6be6d64104d8e9afc34c151926f8182f84e7ac290a93925c0db004bfd"},
-    {file = "regex-2023.8.8-cp311-cp311-win32.whl", hash = "sha256:942f8b1f3b223638b02df7df79140646c03938d488fbfb771824f3d05fc083a8"},
-    {file = "regex-2023.8.8-cp311-cp311-win_amd64.whl", hash = "sha256:51d8ea2a3a1a8fe4f67de21b8b93757005213e8ac3917567872f2865185fa7fb"},
-    {file = "regex-2023.8.8-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e951d1a8e9963ea51efd7f150450803e3b95db5939f994ad3d5edac2b6f6e2b4"},
-    {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:704f63b774218207b8ccc6c47fcef5340741e5d839d11d606f70af93ee78e4d4"},
-    {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22283c769a7b01c8ac355d5be0715bf6929b6267619505e289f792b01304d898"},
-    {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91129ff1bb0619bc1f4ad19485718cc623a2dc433dff95baadbf89405c7f6b57"},
-    {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de35342190deb7b866ad6ba5cbcccb2d22c0487ee0cbb251efef0843d705f0d4"},
-    {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b993b6f524d1e274a5062488a43e3f9f8764ee9745ccd8e8193df743dbe5ee61"},
-    {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3026cbcf11d79095a32d9a13bbc572a458727bd5b1ca332df4a79faecd45281c"},
-    {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:293352710172239bf579c90a9864d0df57340b6fd21272345222fb6371bf82b3"},
-    {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:d909b5a3fff619dc7e48b6b1bedc2f30ec43033ba7af32f936c10839e81b9217"},
-    {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:3d370ff652323c5307d9c8e4c62efd1956fb08051b0e9210212bc51168b4ff56"},
-    {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:b076da1ed19dc37788f6a934c60adf97bd02c7eea461b73730513921a85d4235"},
-    {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e9941a4ada58f6218694f382e43fdd256e97615db9da135e77359da257a7168b"},
-    {file = "regex-2023.8.8-cp36-cp36m-win32.whl", hash = "sha256:a8c65c17aed7e15a0c824cdc63a6b104dfc530f6fa8cb6ac51c437af52b481c7"},
-    {file = "regex-2023.8.8-cp36-cp36m-win_amd64.whl", hash = "sha256:aadf28046e77a72f30dcc1ab185639e8de7f4104b8cb5c6dfa5d8ed860e57236"},
-    {file = "regex-2023.8.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:423adfa872b4908843ac3e7a30f957f5d5282944b81ca0a3b8a7ccbbfaa06103"},
-    {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ae594c66f4a7e1ea67232a0846649a7c94c188d6c071ac0210c3e86a5f92109"},
-    {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e51c80c168074faa793685656c38eb7a06cbad7774c8cbc3ea05552d615393d8"},
-    {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:09b7f4c66aa9d1522b06e31a54f15581c37286237208df1345108fcf4e050c18"},
-    {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e73e5243af12d9cd6a9d6a45a43570dbe2e5b1cdfc862f5ae2b031e44dd95a8"},
-    {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:941460db8fe3bd613db52f05259c9336f5a47ccae7d7def44cc277184030a116"},
-    {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f0ccf3e01afeb412a1a9993049cb160d0352dba635bbca7762b2dc722aa5742a"},
-    {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:2e9216e0d2cdce7dbc9be48cb3eacb962740a09b011a116fd7af8c832ab116ca"},
-    {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:5cd9cd7170459b9223c5e592ac036e0704bee765706445c353d96f2890e816c8"},
-    {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4873ef92e03a4309b3ccd8281454801b291b689f6ad45ef8c3658b6fa761d7ac"},
-    {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:239c3c2a339d3b3ddd51c2daef10874410917cd2b998f043c13e2084cb191684"},
-    {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1005c60ed7037be0d9dea1f9c53cc42f836188227366370867222bda4c3c6bd7"},
-    {file = "regex-2023.8.8-cp37-cp37m-win32.whl", hash = "sha256:e6bd1e9b95bc5614a7a9c9c44fde9539cba1c823b43a9f7bc11266446dd568e3"},
-    {file = "regex-2023.8.8-cp37-cp37m-win_amd64.whl", hash = "sha256:9a96edd79661e93327cfeac4edec72a4046e14550a1d22aa0dd2e3ca52aec921"},
-    {file = "regex-2023.8.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675"},
-    {file = "regex-2023.8.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a2ad5add903eb7cdde2b7c64aaca405f3957ab34f16594d2b78d53b8b1a6a7d6"},
-    {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9233ac249b354c54146e392e8a451e465dd2d967fc773690811d3a8c240ac601"},
-    {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:920974009fb37b20d32afcdf0227a2e707eb83fe418713f7a8b7de038b870d0b"},
-    {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd2b6c5dfe0929b6c23dde9624483380b170b6e34ed79054ad131b20203a1a63"},
-    {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96979d753b1dc3b2169003e1854dc67bfc86edf93c01e84757927f810b8c3c93"},
-    {file = "regex-2023.8.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ae54a338191e1356253e7883d9d19f8679b6143703086245fb14d1f20196be9"},
-    {file = "regex-2023.8.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2162ae2eb8b079622176a81b65d486ba50b888271302190870b8cc488587d280"},
-    {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c884d1a59e69e03b93cf0dfee8794c63d7de0ee8f7ffb76e5f75be8131b6400a"},
-    {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf9273e96f3ee2ac89ffcb17627a78f78e7516b08f94dc435844ae72576a276e"},
-    {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:83215147121e15d5f3a45d99abeed9cf1fe16869d5c233b08c56cdf75f43a504"},
-    {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3f7454aa427b8ab9101f3787eb178057c5250478e39b99540cfc2b889c7d0586"},
-    {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0640913d2c1044d97e30d7c41728195fc37e54d190c5385eacb52115127b882"},
-    {file = "regex-2023.8.8-cp38-cp38-win32.whl", hash = "sha256:0c59122ceccb905a941fb23b087b8eafc5290bf983ebcb14d2301febcbe199c7"},
-    {file = "regex-2023.8.8-cp38-cp38-win_amd64.whl", hash = "sha256:c12f6f67495ea05c3d542d119d270007090bad5b843f642d418eb601ec0fa7be"},
-    {file = "regex-2023.8.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:82cd0a69cd28f6cc3789cc6adeb1027f79526b1ab50b1f6062bbc3a0ccb2dbc3"},
-    {file = "regex-2023.8.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bb34d1605f96a245fc39790a117ac1bac8de84ab7691637b26ab2c5efb8f228c"},
-    {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:987b9ac04d0b38ef4f89fbc035e84a7efad9cdd5f1e29024f9289182c8d99e09"},
-    {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dd6082f4e2aec9b6a0927202c85bc1b09dcab113f97265127c1dc20e2e32495"},
-    {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7eb95fe8222932c10d4436e7a6f7c99991e3fdd9f36c949eff16a69246dee2dc"},
-    {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7098c524ba9f20717a56a8d551d2ed491ea89cbf37e540759ed3b776a4f8d6eb"},
-    {file = "regex-2023.8.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b694430b3f00eb02c594ff5a16db30e054c1b9589a043fe9174584c6efa8033"},
-    {file = "regex-2023.8.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b2aeab3895d778155054abea5238d0eb9a72e9242bd4b43f42fd911ef9a13470"},
-    {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:988631b9d78b546e284478c2ec15c8a85960e262e247b35ca5eaf7ee22f6050a"},
-    {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:67ecd894e56a0c6108ec5ab1d8fa8418ec0cff45844a855966b875d1039a2e34"},
-    {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:14898830f0a0eb67cae2bbbc787c1a7d6e34ecc06fbd39d3af5fe29a4468e2c9"},
-    {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf"},
-    {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9691a549c19c22d26a4f3b948071e93517bdf86e41b81d8c6ac8a964bb71e5a6"},
-    {file = "regex-2023.8.8-cp39-cp39-win32.whl", hash = "sha256:6ab2ed84bf0137927846b37e882745a827458689eb969028af8032b1b3dac78e"},
-    {file = "regex-2023.8.8-cp39-cp39-win_amd64.whl", hash = "sha256:5543c055d8ec7801901e1193a51570643d6a6ab8751b1f7dd9af71af467538bb"},
-    {file = "regex-2023.8.8.tar.gz", hash = "sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e"},
+    {file = "regex-2024.5.15-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a81e3cfbae20378d75185171587cbf756015ccb14840702944f014e0d93ea09f"},
+    {file = "regex-2024.5.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b59138b219ffa8979013be7bc85bb60c6f7b7575df3d56dc1e403a438c7a3f6"},
+    {file = "regex-2024.5.15-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a0bd000c6e266927cb7a1bc39d55be95c4b4f65c5be53e659537537e019232b1"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5eaa7ddaf517aa095fa8da0b5015c44d03da83f5bd49c87961e3c997daed0de7"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba68168daedb2c0bab7fd7e00ced5ba90aebf91024dea3c88ad5063c2a562cca"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6e8d717bca3a6e2064fc3a08df5cbe366369f4b052dcd21b7416e6d71620dca1"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f9ebd0a36102fcad2f03696e8af4ae682793a5d30b46c647eaf280d6cfb32796"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9efa1a32ad3a3ea112224897cdaeb6aa00381627f567179c0314f7b65d354c62"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:1595f2d10dff3d805e054ebdc41c124753631b6a471b976963c7b28543cf13b0"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b802512f3e1f480f41ab5f2cfc0e2f761f08a1f41092d6718868082fc0d27143"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a0981022dccabca811e8171f913de05720590c915b033b7e601f35ce4ea7019f"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:19068a6a79cf99a19ccefa44610491e9ca02c2be3305c7760d3831d38a467a6f"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1b5269484f6126eee5e687785e83c6b60aad7663dafe842b34691157e5083e53"},
+    {file = "regex-2024.5.15-cp310-cp310-win32.whl", hash = "sha256:ada150c5adfa8fbcbf321c30c751dc67d2f12f15bd183ffe4ec7cde351d945b3"},
+    {file = "regex-2024.5.15-cp310-cp310-win_amd64.whl", hash = "sha256:ac394ff680fc46b97487941f5e6ae49a9f30ea41c6c6804832063f14b2a5a145"},
+    {file = "regex-2024.5.15-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f5b1dff3ad008dccf18e652283f5e5339d70bf8ba7c98bf848ac33db10f7bc7a"},
+    {file = "regex-2024.5.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c6a2b494a76983df8e3d3feea9b9ffdd558b247e60b92f877f93a1ff43d26656"},
+    {file = "regex-2024.5.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a32b96f15c8ab2e7d27655969a23895eb799de3665fa94349f3b2fbfd547236f"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10002e86e6068d9e1c91eae8295ef690f02f913c57db120b58fdd35a6bb1af35"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec54d5afa89c19c6dd8541a133be51ee1017a38b412b1321ccb8d6ddbeb4cf7d"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10e4ce0dca9ae7a66e6089bb29355d4432caed736acae36fef0fdd7879f0b0cb"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e507ff1e74373c4d3038195fdd2af30d297b4f0950eeda6f515ae3d84a1770f"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1f059a4d795e646e1c37665b9d06062c62d0e8cc3c511fe01315973a6542e40"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0721931ad5fe0dda45d07f9820b90b2148ccdd8e45bb9e9b42a146cb4f695649"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:833616ddc75ad595dee848ad984d067f2f31be645d603e4d158bba656bbf516c"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:287eb7f54fc81546346207c533ad3c2c51a8d61075127d7f6d79aaf96cdee890"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:19dfb1c504781a136a80ecd1fff9f16dddf5bb43cec6871778c8a907a085bb3d"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:119af6e56dce35e8dfb5222573b50c89e5508d94d55713c75126b753f834de68"},
+    {file = "regex-2024.5.15-cp311-cp311-win32.whl", hash = "sha256:1c1c174d6ec38d6c8a7504087358ce9213d4332f6293a94fbf5249992ba54efa"},
+    {file = "regex-2024.5.15-cp311-cp311-win_amd64.whl", hash = "sha256:9e717956dcfd656f5055cc70996ee2cc82ac5149517fc8e1b60261b907740201"},
+    {file = "regex-2024.5.15-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:632b01153e5248c134007209b5c6348a544ce96c46005d8456de1d552455b014"},
+    {file = "regex-2024.5.15-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e64198f6b856d48192bf921421fdd8ad8eb35e179086e99e99f711957ffedd6e"},
+    {file = "regex-2024.5.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68811ab14087b2f6e0fc0c2bae9ad689ea3584cad6917fc57be6a48bbd012c49"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8ec0c2fea1e886a19c3bee0cd19d862b3aa75dcdfb42ebe8ed30708df64687a"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d0c0c0003c10f54a591d220997dd27d953cd9ccc1a7294b40a4be5312be8797b"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2431b9e263af1953c55abbd3e2efca67ca80a3de8a0437cb58e2421f8184717a"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a605586358893b483976cffc1723fb0f83e526e8f14c6e6614e75919d9862cf"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391d7f7f1e409d192dba8bcd42d3e4cf9e598f3979cdaed6ab11288da88cb9f2"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9ff11639a8d98969c863d4617595eb5425fd12f7c5ef6621a4b74b71ed8726d5"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4eee78a04e6c67e8391edd4dad3279828dd66ac4b79570ec998e2155d2e59fd5"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8fe45aa3f4aa57faabbc9cb46a93363edd6197cbc43523daea044e9ff2fea83e"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:d0a3d8d6acf0c78a1fff0e210d224b821081330b8524e3e2bc5a68ef6ab5803d"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c486b4106066d502495b3025a0a7251bf37ea9540433940a23419461ab9f2a80"},
+    {file = "regex-2024.5.15-cp312-cp312-win32.whl", hash = "sha256:c49e15eac7c149f3670b3e27f1f28a2c1ddeccd3a2812cba953e01be2ab9b5fe"},
+    {file = "regex-2024.5.15-cp312-cp312-win_amd64.whl", hash = "sha256:673b5a6da4557b975c6c90198588181029c60793835ce02f497ea817ff647cb2"},
+    {file = "regex-2024.5.15-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:87e2a9c29e672fc65523fb47a90d429b70ef72b901b4e4b1bd42387caf0d6835"},
+    {file = "regex-2024.5.15-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c3bea0ba8b73b71b37ac833a7f3fd53825924165da6a924aec78c13032f20850"},
+    {file = "regex-2024.5.15-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bfc4f82cabe54f1e7f206fd3d30fda143f84a63fe7d64a81558d6e5f2e5aaba9"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5bb9425fe881d578aeca0b2b4b3d314ec88738706f66f219c194d67179337cb"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:64c65783e96e563103d641760664125e91bd85d8e49566ee560ded4da0d3e704"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cf2430df4148b08fb4324b848672514b1385ae3807651f3567871f130a728cc3"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5397de3219a8b08ae9540c48f602996aa6b0b65d5a61683e233af8605c42b0f2"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:455705d34b4154a80ead722f4f185b04c4237e8e8e33f265cd0798d0e44825fa"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b2b6f1b3bb6f640c1a92be3bbfbcb18657b125b99ecf141fb3310b5282c7d4ed"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:3ad070b823ca5890cab606c940522d05d3d22395d432f4aaaf9d5b1653e47ced"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5b5467acbfc153847d5adb21e21e29847bcb5870e65c94c9206d20eb4e99a384"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:e6662686aeb633ad65be2a42b4cb00178b3fbf7b91878f9446075c404ada552f"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:2b4c884767504c0e2401babe8b5b7aea9148680d2e157fa28f01529d1f7fcf67"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:3cd7874d57f13bf70078f1ff02b8b0aa48d5b9ed25fc48547516c6aba36f5741"},
+    {file = "regex-2024.5.15-cp38-cp38-win32.whl", hash = "sha256:e4682f5ba31f475d58884045c1a97a860a007d44938c4c0895f41d64481edbc9"},
+    {file = "regex-2024.5.15-cp38-cp38-win_amd64.whl", hash = "sha256:d99ceffa25ac45d150e30bd9ed14ec6039f2aad0ffa6bb87a5936f5782fc1569"},
+    {file = "regex-2024.5.15-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:13cdaf31bed30a1e1c2453ef6015aa0983e1366fad2667657dbcac7b02f67133"},
+    {file = "regex-2024.5.15-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cac27dcaa821ca271855a32188aa61d12decb6fe45ffe3e722401fe61e323cd1"},
+    {file = "regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7dbe2467273b875ea2de38ded4eba86cbcbc9a1a6d0aa11dcf7bd2e67859c435"},
+    {file = "regex-2024.5.15-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64f18a9a3513a99c4bef0e3efd4c4a5b11228b48aa80743be822b71e132ae4f5"},
+    {file = "regex-2024.5.15-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d347a741ea871c2e278fde6c48f85136c96b8659b632fb57a7d1ce1872547600"},
+    {file = "regex-2024.5.15-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1878b8301ed011704aea4c806a3cadbd76f84dece1ec09cc9e4dc934cfa5d4da"},
+    {file = "regex-2024.5.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4babf07ad476aaf7830d77000874d7611704a7fcf68c9c2ad151f5d94ae4bfc4"},
+    {file = "regex-2024.5.15-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:35cb514e137cb3488bce23352af3e12fb0dbedd1ee6e60da053c69fb1b29cc6c"},
+    {file = "regex-2024.5.15-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cdd09d47c0b2efee9378679f8510ee6955d329424c659ab3c5e3a6edea696294"},
+    {file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:72d7a99cd6b8f958e85fc6ca5b37c4303294954eac1376535b03c2a43eb72629"},
+    {file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a094801d379ab20c2135529948cb84d417a2169b9bdceda2a36f5f10977ebc16"},
+    {file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c0c18345010870e58238790a6779a1219b4d97bd2e77e1140e8ee5d14df071aa"},
+    {file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:16093f563098448ff6b1fa68170e4acbef94e6b6a4e25e10eae8598bb1694b5d"},
+    {file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e38a7d4e8f633a33b4c7350fbd8bad3b70bf81439ac67ac38916c4a86b465456"},
+    {file = "regex-2024.5.15-cp39-cp39-win32.whl", hash = "sha256:71a455a3c584a88f654b64feccc1e25876066c4f5ef26cd6dd711308aa538694"},
+    {file = "regex-2024.5.15-cp39-cp39-win_amd64.whl", hash = "sha256:cab12877a9bdafde5500206d1020a584355a97884dfd388af3699e9137bf7388"},
+    {file = "regex-2024.5.15.tar.gz", hash = "sha256:d3ee02d9e5f482cc8309134a91eeaacbdd2261ba111b0fef3748eeb4913e6a2c"},
 ]
 
 [[package]]
 name = "requests"
-version = "2.31.0"
+version = "2.32.3"
 description = "Python HTTP for Humans."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
-    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
+    {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
+    {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
 ]
 
 [package.dependencies]
@@ -1740,125 +2445,276 @@ socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
 [[package]]
-name = "safetensors"
-version = "0.3.3"
-description = "Fast and Safe Tensor serialization"
-optional = false
-python-versions = "*"
+name = "rpds-py"
+version = "0.18.1"
+description = "Python bindings to Rust's persistent data structures (rpds)"
+optional = true
+python-versions = ">=3.8"
 files = [
-    {file = "safetensors-0.3.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:92e4d0c8b2836120fddd134474c5bda8963f322333941f8b9f643e5b24f041eb"},
-    {file = "safetensors-0.3.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:3dcadb6153c42addc9c625a622ebde9293fabe1973f9ef31ba10fb42c16e8536"},
-    {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:08f26b61e1b0a14dc959aa9d568776bd038805f611caef1de04a80c468d4a7a4"},
-    {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:17f41344d9a075f2f21b289a49a62e98baff54b5754240ba896063bce31626bf"},
-    {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:f1045f798e1a16a6ced98d6a42ec72936d367a2eec81dc5fade6ed54638cd7d2"},
-    {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:eaf0e4bc91da13f21ac846a39429eb3f3b7ed06295a32321fa3eb1a59b5c70f3"},
-    {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25149180d4dc8ca48bac2ac3852a9424b466e36336a39659b35b21b2116f96fc"},
-    {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9e943bf78c39de8865398a71818315e7d5d1af93c7b30d4da3fc852e62ad9bc"},
-    {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cccfcac04a010354e87c7a2fe16a1ff004fc4f6e7ef8efc966ed30122ce00bc7"},
-    {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a07121f427e646a50d18c1be0fa1a2cbf6398624c31149cd7e6b35486d72189e"},
-    {file = "safetensors-0.3.3-cp310-cp310-win32.whl", hash = "sha256:a85e29cbfddfea86453cc0f4889b4bcc6b9c155be9a60e27be479a34e199e7ef"},
-    {file = "safetensors-0.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:e13adad4a3e591378f71068d14e92343e626cf698ff805f61cdb946e684a218e"},
-    {file = "safetensors-0.3.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:cbc3312f134baf07334dd517341a4b470b2931f090bd9284888acb7dfaf4606f"},
-    {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:d15030af39d5d30c22bcbc6d180c65405b7ea4c05b7bab14a570eac7d7d43722"},
-    {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:f84a74cbe9859b28e3d6d7715ac1dd3097bebf8d772694098f6d42435245860c"},
-    {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:10d637423d98ab2e6a4ad96abf4534eb26fcaf8ca3115623e64c00759374e90d"},
-    {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:3b46f5de8b44084aff2e480874c550c399c730c84b2e8ad1bddb062c94aa14e9"},
-    {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e76da691a82dfaf752854fa6d17c8eba0c8466370c5ad8cf1bfdf832d3c7ee17"},
-    {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4e342fd54e66aa9512dd13e410f791e47aa4feeb5f4c9a20882c72f3d272f29"},
-    {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:178fd30b5dc73bce14a39187d948cedd0e5698e2f055b7ea16b5a96c9b17438e"},
-    {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e8fdf7407dba44587ed5e79d5de3533d242648e1f2041760b21474bd5ea5c8c"},
-    {file = "safetensors-0.3.3-cp311-cp311-win32.whl", hash = "sha256:7d3b744cee8d7a46ffa68db1a2ff1a1a432488e3f7a5a97856fe69e22139d50c"},
-    {file = "safetensors-0.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f579877d30feec9b6ba409d05fa174633a4fc095675a4a82971d831a8bb60b97"},
-    {file = "safetensors-0.3.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:2fff5b19a1b462c17322998b2f4b8bce43c16fe208968174d2f3a1446284ceed"},
-    {file = "safetensors-0.3.3-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:41adb1d39e8aad04b16879e3e0cbcb849315999fad73bc992091a01e379cb058"},
-    {file = "safetensors-0.3.3-cp37-cp37m-macosx_12_0_x86_64.whl", hash = "sha256:0f2b404250b3b877b11d34afcc30d80e7035714a1116a3df56acaca6b6c00096"},
-    {file = "safetensors-0.3.3-cp37-cp37m-macosx_13_0_x86_64.whl", hash = "sha256:b43956ef20e9f4f2e648818a9e7b3499edd6b753a0f5526d4f6a6826fbee8446"},
-    {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d61a99b34169981f088ccfbb2c91170843efc869a0a0532f422db7211bf4f474"},
-    {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0008aab36cd20e9a051a68563c6f80d40f238c2611811d7faa5a18bf3fd3984"},
-    {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:93d54166072b143084fdcd214a080a088050c1bb1651016b55942701b31334e4"},
-    {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c32ee08f61cea56a5d62bbf94af95df6040c8ab574afffaeb7b44ae5da1e9e3"},
-    {file = "safetensors-0.3.3-cp37-cp37m-win32.whl", hash = "sha256:351600f367badd59f7bfe86d317bb768dd8c59c1561c6fac43cafbd9c1af7827"},
-    {file = "safetensors-0.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:034717e297849dae1af0a7027a14b8647bd2e272c24106dced64d83e10d468d1"},
-    {file = "safetensors-0.3.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8530399666748634bc0b301a6a5523756931b0c2680d188e743d16304afe917a"},
-    {file = "safetensors-0.3.3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:9d741c1f1621e489ba10aa3d135b54202684f6e205df52e219d5eecd673a80c9"},
-    {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:0c345fd85b4d2093a5109596ff4cd9dfc2e84992e881b4857fbc4a93a3b89ddb"},
-    {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:69ccee8d05f55cdf76f7e6c87d2bdfb648c16778ef8acfd2ecc495e273e9233e"},
-    {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:c08a9a4b7a4ca389232fa8d097aebc20bbd4f61e477abc7065b5c18b8202dede"},
-    {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:a002868d2e3f49bbe81bee2655a411c24fa1f8e68b703dec6629cb989d6ae42e"},
-    {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3bd2704cb41faa44d3ec23e8b97330346da0395aec87f8eaf9c9e2c086cdbf13"},
-    {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b2951bf3f0ad63df5e6a95263652bd6c194a6eb36fd4f2d29421cd63424c883"},
-    {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07114cec116253ca2e7230fdea30acf76828f21614afd596d7b5438a2f719bd8"},
-    {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ab43aeeb9eadbb6b460df3568a662e6f1911ecc39387f8752afcb6a7d96c087"},
-    {file = "safetensors-0.3.3-cp38-cp38-win32.whl", hash = "sha256:f2f59fce31dd3429daca7269a6b06f65e6547a0c248f5116976c3f1e9b73f251"},
-    {file = "safetensors-0.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:c31ca0d8610f57799925bf08616856b39518ab772c65093ef1516762e796fde4"},
-    {file = "safetensors-0.3.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:59a596b3225c96d59af412385981f17dd95314e3fffdf359c7e3f5bb97730a19"},
-    {file = "safetensors-0.3.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:82a16e92210a6221edd75ab17acdd468dd958ef5023d9c6c1289606cc30d1479"},
-    {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:98a929e763a581f516373ef31983ed1257d2d0da912a8e05d5cd12e9e441c93a"},
-    {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:12b83f1986cd16ea0454c636c37b11e819d60dd952c26978310a0835133480b7"},
-    {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:f439175c827c2f1bbd54df42789c5204a10983a30bc4242bc7deaf854a24f3f0"},
-    {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:0085be33b8cbcb13079b3a8e131656e05b0bc5e6970530d4c24150f7afd76d70"},
-    {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3ec70c87b1e910769034206ad5efc051069b105aac1687f6edcd02526767f4"},
-    {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f490132383e5e490e710608f4acffcb98ed37f91b885c7217d3f9f10aaff9048"},
-    {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79d1b6c7ed5596baf79c80fbce5198c3cdcc521ae6a157699f427aba1a90082d"},
-    {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad3cc8006e7a86ee7c88bd2813ec59cd7cc75b03e6fa4af89b9c7b235b438d68"},
-    {file = "safetensors-0.3.3-cp39-cp39-win32.whl", hash = "sha256:ab29f54c6b8c301ca05fa014728996bd83aac6e21528f893aaf8945c71f42b6d"},
-    {file = "safetensors-0.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:0fa82004eae1a71e2aa29843ef99de9350e459a0fc2f65fc6ee0da9690933d2d"},
-    {file = "safetensors-0.3.3.tar.gz", hash = "sha256:edb7072d788c4f929d0f5735d3a2fb51e5a27f833587828583b7f5747af1a2b8"},
+    {file = "rpds_py-0.18.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:d31dea506d718693b6b2cffc0648a8929bdc51c70a311b2770f09611caa10d53"},
+    {file = "rpds_py-0.18.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:732672fbc449bab754e0b15356c077cc31566df874964d4801ab14f71951ea80"},
+    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a98a1f0552b5f227a3d6422dbd61bc6f30db170939bd87ed14f3c339aa6c7c9"},
+    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f1944ce16401aad1e3f7d312247b3d5de7981f634dc9dfe90da72b87d37887d"},
+    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38e14fb4e370885c4ecd734f093a2225ee52dc384b86fa55fe3f74638b2cfb09"},
+    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08d74b184f9ab6289b87b19fe6a6d1a97fbfea84b8a3e745e87a5de3029bf944"},
+    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d70129cef4a8d979caa37e7fe957202e7eee8ea02c5e16455bc9808a59c6b2f0"},
+    {file = "rpds_py-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce0bb20e3a11bd04461324a6a798af34d503f8d6f1aa3d2aa8901ceaf039176d"},
+    {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:81c5196a790032e0fc2464c0b4ab95f8610f96f1f2fa3d4deacce6a79852da60"},
+    {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f3027be483868c99b4985fda802a57a67fdf30c5d9a50338d9db646d590198da"},
+    {file = "rpds_py-0.18.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d44607f98caa2961bab4fa3c4309724b185b464cdc3ba6f3d7340bac3ec97cc1"},
+    {file = "rpds_py-0.18.1-cp310-none-win32.whl", hash = "sha256:c273e795e7a0f1fddd46e1e3cb8be15634c29ae8ff31c196debb620e1edb9333"},
+    {file = "rpds_py-0.18.1-cp310-none-win_amd64.whl", hash = "sha256:8352f48d511de5f973e4f2f9412736d7dea76c69faa6d36bcf885b50c758ab9a"},
+    {file = "rpds_py-0.18.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6b5ff7e1d63a8281654b5e2896d7f08799378e594f09cf3674e832ecaf396ce8"},
+    {file = "rpds_py-0.18.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8927638a4d4137a289e41d0fd631551e89fa346d6dbcfc31ad627557d03ceb6d"},
+    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:154bf5c93d79558b44e5b50cc354aa0459e518e83677791e6adb0b039b7aa6a7"},
+    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07f2139741e5deb2c5154a7b9629bc5aa48c766b643c1a6750d16f865a82c5fc"},
+    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c7672e9fba7425f79019db9945b16e308ed8bc89348c23d955c8c0540da0a07"},
+    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:489bdfe1abd0406eba6b3bb4fdc87c7fa40f1031de073d0cfb744634cc8fa261"},
+    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c20f05e8e3d4fc76875fc9cb8cf24b90a63f5a1b4c5b9273f0e8225e169b100"},
+    {file = "rpds_py-0.18.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:967342e045564cef76dfcf1edb700b1e20838d83b1aa02ab313e6a497cf923b8"},
+    {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cc7c1a47f3a63282ab0f422d90ddac4aa3034e39fc66a559ab93041e6505da7"},
+    {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f7afbfee1157e0f9376c00bb232e80a60e59ed716e3211a80cb8506550671e6e"},
+    {file = "rpds_py-0.18.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9e6934d70dc50f9f8ea47081ceafdec09245fd9f6032669c3b45705dea096b88"},
+    {file = "rpds_py-0.18.1-cp311-none-win32.whl", hash = "sha256:c69882964516dc143083d3795cb508e806b09fc3800fd0d4cddc1df6c36e76bb"},
+    {file = "rpds_py-0.18.1-cp311-none-win_amd64.whl", hash = "sha256:70a838f7754483bcdc830444952fd89645569e7452e3226de4a613a4c1793fb2"},
+    {file = "rpds_py-0.18.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3dd3cd86e1db5aadd334e011eba4e29d37a104b403e8ca24dcd6703c68ca55b3"},
+    {file = "rpds_py-0.18.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:05f3d615099bd9b13ecf2fc9cf2d839ad3f20239c678f461c753e93755d629ee"},
+    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35b2b771b13eee8729a5049c976197ff58a27a3829c018a04341bcf1ae409b2b"},
+    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ee17cd26b97d537af8f33635ef38be873073d516fd425e80559f4585a7b90c43"},
+    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b646bf655b135ccf4522ed43d6902af37d3f5dbcf0da66c769a2b3938b9d8184"},
+    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19ba472b9606c36716062c023afa2484d1e4220548751bda14f725a7de17b4f6"},
+    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e30ac5e329098903262dc5bdd7e2086e0256aa762cc8b744f9e7bf2a427d3f8"},
+    {file = "rpds_py-0.18.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d58ad6317d188c43750cb76e9deacf6051d0f884d87dc6518e0280438648a9ac"},
+    {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e1735502458621921cee039c47318cb90b51d532c2766593be6207eec53e5c4c"},
+    {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f5bab211605d91db0e2995a17b5c6ee5edec1270e46223e513eaa20da20076ac"},
+    {file = "rpds_py-0.18.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2fc24a329a717f9e2448f8cd1f960f9dac4e45b6224d60734edeb67499bab03a"},
+    {file = "rpds_py-0.18.1-cp312-none-win32.whl", hash = "sha256:1805d5901779662d599d0e2e4159d8a82c0b05faa86ef9222bf974572286b2b6"},
+    {file = "rpds_py-0.18.1-cp312-none-win_amd64.whl", hash = "sha256:720edcb916df872d80f80a1cc5ea9058300b97721efda8651efcd938a9c70a72"},
+    {file = "rpds_py-0.18.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:c827576e2fa017a081346dce87d532a5310241648eb3700af9a571a6e9fc7e74"},
+    {file = "rpds_py-0.18.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:aa3679e751408d75a0b4d8d26d6647b6d9326f5e35c00a7ccd82b78ef64f65f8"},
+    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0abeee75434e2ee2d142d650d1e54ac1f8b01e6e6abdde8ffd6eeac6e9c38e20"},
+    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed402d6153c5d519a0faf1bb69898e97fb31613b49da27a84a13935ea9164dfc"},
+    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:338dee44b0cef8b70fd2ef54b4e09bb1b97fc6c3a58fea5db6cc083fd9fc2724"},
+    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7750569d9526199c5b97e5a9f8d96a13300950d910cf04a861d96f4273d5b104"},
+    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:607345bd5912aacc0c5a63d45a1f73fef29e697884f7e861094e443187c02be5"},
+    {file = "rpds_py-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:207c82978115baa1fd8d706d720b4a4d2b0913df1c78c85ba73fe6c5804505f0"},
+    {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:6d1e42d2735d437e7e80bab4d78eb2e459af48c0a46e686ea35f690b93db792d"},
+    {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5463c47c08630007dc0fe99fb480ea4f34a89712410592380425a9b4e1611d8e"},
+    {file = "rpds_py-0.18.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:06d218939e1bf2ca50e6b0ec700ffe755e5216a8230ab3e87c059ebb4ea06afc"},
+    {file = "rpds_py-0.18.1-cp38-none-win32.whl", hash = "sha256:312fe69b4fe1ffbe76520a7676b1e5ac06ddf7826d764cc10265c3b53f96dbe9"},
+    {file = "rpds_py-0.18.1-cp38-none-win_amd64.whl", hash = "sha256:9437ca26784120a279f3137ee080b0e717012c42921eb07861b412340f85bae2"},
+    {file = "rpds_py-0.18.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:19e515b78c3fc1039dd7da0a33c28c3154458f947f4dc198d3c72db2b6b5dc93"},
+    {file = "rpds_py-0.18.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a7b28c5b066bca9a4eb4e2f2663012debe680f097979d880657f00e1c30875a0"},
+    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:673fdbbf668dd958eff750e500495ef3f611e2ecc209464f661bc82e9838991e"},
+    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d960de62227635d2e61068f42a6cb6aae91a7fe00fca0e3aeed17667c8a34611"},
+    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:352a88dc7892f1da66b6027af06a2e7e5d53fe05924cc2cfc56495b586a10b72"},
+    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e0ee01ad8260184db21468a6e1c37afa0529acc12c3a697ee498d3c2c4dcaf3"},
+    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4c39ad2f512b4041343ea3c7894339e4ca7839ac38ca83d68a832fc8b3748ab"},
+    {file = "rpds_py-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aaa71ee43a703c321906813bb252f69524f02aa05bf4eec85f0c41d5d62d0f4c"},
+    {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6cd8098517c64a85e790657e7b1e509b9fe07487fd358e19431cb120f7d96338"},
+    {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:4adec039b8e2928983f885c53b7cc4cda8965b62b6596501a0308d2703f8af1b"},
+    {file = "rpds_py-0.18.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:32b7daaa3e9389db3695964ce8e566e3413b0c43e3394c05e4b243a4cd7bef26"},
+    {file = "rpds_py-0.18.1-cp39-none-win32.whl", hash = "sha256:2625f03b105328729f9450c8badda34d5243231eef6535f80064d57035738360"},
+    {file = "rpds_py-0.18.1-cp39-none-win_amd64.whl", hash = "sha256:bf18932d0003c8c4d51a39f244231986ab23ee057d235a12b2684ea26a353590"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cbfbea39ba64f5e53ae2915de36f130588bba71245b418060ec3330ebf85678e"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3d456ff2a6a4d2adcdf3c1c960a36f4fd2fec6e3b4902a42a384d17cf4e7a65"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7700936ef9d006b7ef605dc53aa364da2de5a3aa65516a1f3ce73bf82ecfc7ae"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:51584acc5916212e1bf45edd17f3a6b05fe0cbb40482d25e619f824dccb679de"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:942695a206a58d2575033ff1e42b12b2aece98d6003c6bc739fbf33d1773b12f"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b906b5f58892813e5ba5c6056d6a5ad08f358ba49f046d910ad992196ea61397"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6f8e3fecca256fefc91bb6765a693d96692459d7d4c644660a9fff32e517843"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7732770412bab81c5a9f6d20aeb60ae943a9b36dcd990d876a773526468e7163"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:bd1105b50ede37461c1d51b9698c4f4be6e13e69a908ab7751e3807985fc0346"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:618916f5535784960f3ecf8111581f4ad31d347c3de66d02e728de460a46303c"},
+    {file = "rpds_py-0.18.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:17c6d2155e2423f7e79e3bb18151c686d40db42d8645e7977442170c360194d4"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6c4c4c3f878df21faf5fac86eda32671c27889e13570645a9eea0a1abdd50922"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:fab6ce90574645a0d6c58890e9bcaac8d94dff54fb51c69e5522a7358b80ab64"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:531796fb842b53f2695e94dc338929e9f9dbf473b64710c28af5a160b2a8927d"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:740884bc62a5e2bbb31e584f5d23b32320fd75d79f916f15a788d527a5e83644"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:998125738de0158f088aef3cb264a34251908dd2e5d9966774fdab7402edfab7"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e2be6e9dd4111d5b31ba3b74d17da54a8319d8168890fbaea4b9e5c3de630ae5"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0cee71bc618cd93716f3c1bf56653740d2d13ddbd47673efa8bf41435a60daa"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2c3caec4ec5cd1d18e5dd6ae5194d24ed12785212a90b37f5f7f06b8bedd7139"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:27bba383e8c5231cd559affe169ca0b96ec78d39909ffd817f28b166d7ddd4d8"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:a888e8bdb45916234b99da2d859566f1e8a1d2275a801bb8e4a9644e3c7e7909"},
+    {file = "rpds_py-0.18.1-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6031b25fb1b06327b43d841f33842b383beba399884f8228a6bb3df3088485ff"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48c2faaa8adfacefcbfdb5f2e2e7bdad081e5ace8d182e5f4ade971f128e6bb3"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d85164315bd68c0806768dc6bb0429c6f95c354f87485ee3593c4f6b14def2bd"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6afd80f6c79893cfc0574956f78a0add8c76e3696f2d6a15bca2c66c415cf2d4"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa242ac1ff583e4ec7771141606aafc92b361cd90a05c30d93e343a0c2d82a89"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21be4770ff4e08698e1e8e0bce06edb6ea0626e7c8f560bc08222880aca6a6f"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c45a639e93a0c5d4b788b2613bd637468edd62f8f95ebc6fcc303d58ab3f0a8"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:910e71711d1055b2768181efa0a17537b2622afeb0424116619817007f8a2b10"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b9bb1f182a97880f6078283b3505a707057c42bf55d8fca604f70dedfdc0772a"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1d54f74f40b1f7aaa595a02ff42ef38ca654b1469bef7d52867da474243cc633"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:8d2e182c9ee01135e11e9676e9a62dfad791a7a467738f06726872374a83db49"},
+    {file = "rpds_py-0.18.1-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:636a15acc588f70fda1661234761f9ed9ad79ebed3f2125d44be0862708b666e"},
+    {file = "rpds_py-0.18.1.tar.gz", hash = "sha256:dc48b479d540770c811fbd1eb9ba2bb66951863e448efec2e2c102625328e92f"},
+]
+
+[[package]]
+name = "safetensors"
+version = "0.4.3"
+description = ""
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "safetensors-0.4.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:dcf5705cab159ce0130cd56057f5f3425023c407e170bca60b4868048bae64fd"},
+    {file = "safetensors-0.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bb4f8c5d0358a31e9a08daeebb68f5e161cdd4018855426d3f0c23bb51087055"},
+    {file = "safetensors-0.4.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70a5319ef409e7f88686a46607cbc3c428271069d8b770076feaf913664a07ac"},
+    {file = "safetensors-0.4.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fb9c65bd82f9ef3ce4970dc19ee86be5f6f93d032159acf35e663c6bea02b237"},
+    {file = "safetensors-0.4.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:edb5698a7bc282089f64c96c477846950358a46ede85a1c040e0230344fdde10"},
+    {file = "safetensors-0.4.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:efcc860be094b8d19ac61b452ec635c7acb9afa77beb218b1d7784c6d41fe8ad"},
+    {file = "safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376"},
+    {file = "safetensors-0.4.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5fc6775529fb9f0ce2266edd3e5d3f10aab068e49f765e11f6f2a63b5367021d"},
+    {file = "safetensors-0.4.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9c6ad011c1b4e3acff058d6b090f1da8e55a332fbf84695cf3100c649cc452d1"},
+    {file = "safetensors-0.4.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8c496c5401c1b9c46d41a7688e8ff5b0310a3b9bae31ce0f0ae870e1ea2b8caf"},
+    {file = "safetensors-0.4.3-cp310-none-win32.whl", hash = "sha256:38e2a8666178224a51cca61d3cb4c88704f696eac8f72a49a598a93bbd8a4af9"},
+    {file = "safetensors-0.4.3-cp310-none-win_amd64.whl", hash = "sha256:393e6e391467d1b2b829c77e47d726f3b9b93630e6a045b1d1fca67dc78bf632"},
+    {file = "safetensors-0.4.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:22f3b5d65e440cec0de8edaa672efa888030802e11c09b3d6203bff60ebff05a"},
+    {file = "safetensors-0.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c4fa560ebd4522adddb71dcd25d09bf211b5634003f015a4b815b7647d62ebe"},
+    {file = "safetensors-0.4.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9afd5358719f1b2cf425fad638fc3c887997d6782da317096877e5b15b2ce93"},
+    {file = "safetensors-0.4.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d8c5093206ef4b198600ae484230402af6713dab1bd5b8e231905d754022bec7"},
+    {file = "safetensors-0.4.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0b2104df1579d6ba9052c0ae0e3137c9698b2d85b0645507e6fd1813b70931a"},
+    {file = "safetensors-0.4.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8cf18888606dad030455d18f6c381720e57fc6a4170ee1966adb7ebc98d4d6a3"},
+    {file = "safetensors-0.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0bf4f9d6323d9f86eef5567eabd88f070691cf031d4c0df27a40d3b4aaee755b"},
+    {file = "safetensors-0.4.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:585c9ae13a205807b63bef8a37994f30c917ff800ab8a1ca9c9b5d73024f97ee"},
+    {file = "safetensors-0.4.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:faefeb3b81bdfb4e5a55b9bbdf3d8d8753f65506e1d67d03f5c851a6c87150e9"},
+    {file = "safetensors-0.4.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:befdf0167ad626f22f6aac6163477fcefa342224a22f11fdd05abb3995c1783c"},
+    {file = "safetensors-0.4.3-cp311-none-win32.whl", hash = "sha256:a7cef55929dcbef24af3eb40bedec35d82c3c2fa46338bb13ecf3c5720af8a61"},
+    {file = "safetensors-0.4.3-cp311-none-win_amd64.whl", hash = "sha256:840b7ac0eff5633e1d053cc9db12fdf56b566e9403b4950b2dc85393d9b88d67"},
+    {file = "safetensors-0.4.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:22d21760dc6ebae42e9c058d75aa9907d9f35e38f896e3c69ba0e7b213033856"},
+    {file = "safetensors-0.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d22c1a10dff3f64d0d68abb8298a3fd88ccff79f408a3e15b3e7f637ef5c980"},
+    {file = "safetensors-0.4.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1648568667f820b8c48317c7006221dc40aced1869908c187f493838a1362bc"},
+    {file = "safetensors-0.4.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:446e9fe52c051aeab12aac63d1017e0f68a02a92a027b901c4f8e931b24e5397"},
+    {file = "safetensors-0.4.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fef5d70683643618244a4f5221053567ca3e77c2531e42ad48ae05fae909f542"},
+    {file = "safetensors-0.4.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a1f4430cc0c9d6afa01214a4b3919d0a029637df8e09675ceef1ca3f0dfa0df"},
+    {file = "safetensors-0.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d603846a8585b9432a0fd415db1d4c57c0f860eb4aea21f92559ff9902bae4d"},
+    {file = "safetensors-0.4.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a844cdb5d7cbc22f5f16c7e2a0271170750763c4db08381b7f696dbd2c78a361"},
+    {file = "safetensors-0.4.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:88887f69f7a00cf02b954cdc3034ffb383b2303bc0ab481d4716e2da51ddc10e"},
+    {file = "safetensors-0.4.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee463219d9ec6c2be1d331ab13a8e0cd50d2f32240a81d498266d77d07b7e71e"},
+    {file = "safetensors-0.4.3-cp312-none-win32.whl", hash = "sha256:d0dd4a1db09db2dba0f94d15addc7e7cd3a7b0d393aa4c7518c39ae7374623c3"},
+    {file = "safetensors-0.4.3-cp312-none-win_amd64.whl", hash = "sha256:d14d30c25897b2bf19b6fb5ff7e26cc40006ad53fd4a88244fdf26517d852dd7"},
+    {file = "safetensors-0.4.3-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:d1456f814655b224d4bf6e7915c51ce74e389b413be791203092b7ff78c936dd"},
+    {file = "safetensors-0.4.3-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:455d538aa1aae4a8b279344a08136d3f16334247907b18a5c3c7fa88ef0d3c46"},
+    {file = "safetensors-0.4.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf476bca34e1340ee3294ef13e2c625833f83d096cfdf69a5342475602004f95"},
+    {file = "safetensors-0.4.3-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:02ef3a24face643456020536591fbd3c717c5abaa2737ec428ccbbc86dffa7a4"},
+    {file = "safetensors-0.4.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7de32d0d34b6623bb56ca278f90db081f85fb9c5d327e3c18fd23ac64f465768"},
+    {file = "safetensors-0.4.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a0deb16a1d3ea90c244ceb42d2c6c276059616be21a19ac7101aa97da448faf"},
+    {file = "safetensors-0.4.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c59d51f182c729f47e841510b70b967b0752039f79f1de23bcdd86462a9b09ee"},
+    {file = "safetensors-0.4.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1f598b713cc1a4eb31d3b3203557ac308acf21c8f41104cdd74bf640c6e538e3"},
+    {file = "safetensors-0.4.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5757e4688f20df083e233b47de43845d1adb7e17b6cf7da5f8444416fc53828d"},
+    {file = "safetensors-0.4.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:fe746d03ed8d193674a26105e4f0fe6c726f5bb602ffc695b409eaf02f04763d"},
+    {file = "safetensors-0.4.3-cp37-none-win32.whl", hash = "sha256:0d5ffc6a80f715c30af253e0e288ad1cd97a3d0086c9c87995e5093ebc075e50"},
+    {file = "safetensors-0.4.3-cp37-none-win_amd64.whl", hash = "sha256:a11c374eb63a9c16c5ed146457241182f310902bd2a9c18255781bb832b6748b"},
+    {file = "safetensors-0.4.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:b1e31be7945f66be23f4ec1682bb47faa3df34cb89fc68527de6554d3c4258a4"},
+    {file = "safetensors-0.4.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:03a4447c784917c9bf01d8f2ac5080bc15c41692202cd5f406afba16629e84d6"},
+    {file = "safetensors-0.4.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d244bcafeb1bc06d47cfee71727e775bca88a8efda77a13e7306aae3813fa7e4"},
+    {file = "safetensors-0.4.3-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:53c4879b9c6bd7cd25d114ee0ef95420e2812e676314300624594940a8d6a91f"},
+    {file = "safetensors-0.4.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:74707624b81f1b7f2b93f5619d4a9f00934d5948005a03f2c1845ffbfff42212"},
+    {file = "safetensors-0.4.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0d52c958dc210265157573f81d34adf54e255bc2b59ded6218500c9b15a750eb"},
+    {file = "safetensors-0.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f9568f380f513a60139971169c4a358b8731509cc19112369902eddb33faa4d"},
+    {file = "safetensors-0.4.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0d9cd8e1560dfc514b6d7859247dc6a86ad2f83151a62c577428d5102d872721"},
+    {file = "safetensors-0.4.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:89f9f17b0dacb913ed87d57afbc8aad85ea42c1085bd5de2f20d83d13e9fc4b2"},
+    {file = "safetensors-0.4.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1139eb436fd201c133d03c81209d39ac57e129f5e74e34bb9ab60f8d9b726270"},
+    {file = "safetensors-0.4.3-cp38-none-win32.whl", hash = "sha256:d9c289f140a9ae4853fc2236a2ffc9a9f2d5eae0cb673167e0f1b8c18c0961ac"},
+    {file = "safetensors-0.4.3-cp38-none-win_amd64.whl", hash = "sha256:622afd28968ef3e9786562d352659a37de4481a4070f4ebac883f98c5836563e"},
+    {file = "safetensors-0.4.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:8651c7299cbd8b4161a36cd6a322fa07d39cd23535b144d02f1c1972d0c62f3c"},
+    {file = "safetensors-0.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e375d975159ac534c7161269de24ddcd490df2157b55c1a6eeace6cbb56903f0"},
+    {file = "safetensors-0.4.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:084fc436e317f83f7071fc6a62ca1c513b2103db325cd09952914b50f51cf78f"},
+    {file = "safetensors-0.4.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:41a727a7f5e6ad9f1db6951adee21bbdadc632363d79dc434876369a17de6ad6"},
+    {file = "safetensors-0.4.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7dbbde64b6c534548696808a0e01276d28ea5773bc9a2dfb97a88cd3dffe3df"},
+    {file = "safetensors-0.4.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bbae3b4b9d997971431c346edbfe6e41e98424a097860ee872721e176040a893"},
+    {file = "safetensors-0.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01e4b22e3284cd866edeabe4f4d896229495da457229408d2e1e4810c5187121"},
+    {file = "safetensors-0.4.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dd37306546b58d3043eb044c8103a02792cc024b51d1dd16bd3dd1f334cb3ed"},
+    {file = "safetensors-0.4.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d8815b5e1dac85fc534a97fd339e12404db557878c090f90442247e87c8aeaea"},
+    {file = "safetensors-0.4.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e011cc162503c19f4b1fd63dfcddf73739c7a243a17dac09b78e57a00983ab35"},
+    {file = "safetensors-0.4.3-cp39-none-win32.whl", hash = "sha256:01feb3089e5932d7e662eda77c3ecc389f97c0883c4a12b5cfdc32b589a811c3"},
+    {file = "safetensors-0.4.3-cp39-none-win_amd64.whl", hash = "sha256:3f9cdca09052f585e62328c1c2923c70f46814715c795be65f0b93f57ec98a02"},
+    {file = "safetensors-0.4.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:1b89381517891a7bb7d1405d828b2bf5d75528299f8231e9346b8eba092227f9"},
+    {file = "safetensors-0.4.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:cd6fff9e56df398abc5866b19a32124815b656613c1c5ec0f9350906fd798aac"},
+    {file = "safetensors-0.4.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:840caf38d86aa7014fe37ade5d0d84e23dcfbc798b8078015831996ecbc206a3"},
+    {file = "safetensors-0.4.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9650713b2cfa9537a2baf7dd9fee458b24a0aaaa6cafcea8bdd5fb2b8efdc34"},
+    {file = "safetensors-0.4.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e4119532cd10dba04b423e0f86aecb96cfa5a602238c0aa012f70c3a40c44b50"},
+    {file = "safetensors-0.4.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e066e8861eef6387b7c772344d1fe1f9a72800e04ee9a54239d460c400c72aab"},
+    {file = "safetensors-0.4.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:90964917f5b0fa0fa07e9a051fbef100250c04d150b7026ccbf87a34a54012e0"},
+    {file = "safetensors-0.4.3-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c41e1893d1206aa7054029681778d9a58b3529d4c807002c156d58426c225173"},
+    {file = "safetensors-0.4.3-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7613a119a71a497d012ccc83775c308b9c1dab454806291427f84397d852fd"},
+    {file = "safetensors-0.4.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9bac020faba7f5dc481e881b14b6425265feabb5bfc552551d21189c0eddc3"},
+    {file = "safetensors-0.4.3-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:420a98f593ff9930f5822560d14c395ccbc57342ddff3b463bc0b3d6b1951550"},
+    {file = "safetensors-0.4.3-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f5e6883af9a68c0028f70a4c19d5a6ab6238a379be36ad300a22318316c00cb0"},
+    {file = "safetensors-0.4.3-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:cdd0a3b5da66e7f377474599814dbf5cbf135ff059cc73694de129b58a5e8a2c"},
+    {file = "safetensors-0.4.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9bfb92f82574d9e58401d79c70c716985dc049b635fef6eecbb024c79b2c46ad"},
+    {file = "safetensors-0.4.3-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:3615a96dd2dcc30eb66d82bc76cda2565f4f7bfa89fcb0e31ba3cea8a1a9ecbb"},
+    {file = "safetensors-0.4.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:868ad1b6fc41209ab6bd12f63923e8baeb1a086814cb2e81a65ed3d497e0cf8f"},
+    {file = "safetensors-0.4.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7ffba80aa49bd09195145a7fd233a7781173b422eeb995096f2b30591639517"},
+    {file = "safetensors-0.4.3-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0acbe31340ab150423347e5b9cc595867d814244ac14218932a5cf1dd38eb39"},
+    {file = "safetensors-0.4.3-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:19bbdf95de2cf64f25cd614c5236c8b06eb2cfa47cbf64311f4b5d80224623a3"},
+    {file = "safetensors-0.4.3-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b852e47eb08475c2c1bd8131207b405793bfc20d6f45aff893d3baaad449ed14"},
+    {file = "safetensors-0.4.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5d07cbca5b99babb692d76d8151bec46f461f8ad8daafbfd96b2fca40cadae65"},
+    {file = "safetensors-0.4.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1ab6527a20586d94291c96e00a668fa03f86189b8a9defa2cdd34a1a01acc7d5"},
+    {file = "safetensors-0.4.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02318f01e332cc23ffb4f6716e05a492c5f18b1d13e343c49265149396284a44"},
+    {file = "safetensors-0.4.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec4b52ce9a396260eb9731eb6aea41a7320de22ed73a1042c2230af0212758ce"},
+    {file = "safetensors-0.4.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:018b691383026a2436a22b648873ed11444a364324e7088b99cd2503dd828400"},
+    {file = "safetensors-0.4.3-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:309b10dbcab63269ecbf0e2ca10ce59223bb756ca5d431ce9c9eeabd446569da"},
+    {file = "safetensors-0.4.3-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b277482120df46e27a58082df06a15aebda4481e30a1c21eefd0921ae7e03f65"},
+    {file = "safetensors-0.4.3.tar.gz", hash = "sha256:2f85fc50c4e07a21e95c24e07460fe6f7e2859d0ce88092838352b798ce711c2"},
 ]
 
 [package.extras]
-all = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "flax (>=0.6.3)", "h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "isort (>=5.5.4)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)", "tensorflow (==2.11.0)", "torch (>=1.10)"]
-dev = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "flax (>=0.6.3)", "h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "isort (>=5.5.4)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)", "tensorflow (==2.11.0)", "torch (>=1.10)"]
-jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)"]
+all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", "safetensors[quality]", "safetensors[testing]", "safetensors[torch]"]
+dev = ["safetensors[all]"]
+jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "safetensors[numpy]"]
+mlx = ["mlx (>=0.0.9)"]
 numpy = ["numpy (>=1.21.6)"]
-paddlepaddle = ["numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)"]
-pinned-tf = ["tensorflow (==2.11.0)"]
+paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"]
+pinned-tf = ["safetensors[numpy]", "tensorflow (==2.11.0)"]
 quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"]
-tensorflow = ["numpy (>=1.21.6)", "tensorflow (>=2.11.0)"]
-testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "numpy (>=1.21.6)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)"]
-torch = ["numpy (>=1.21.6)", "torch (>=1.10)"]
+tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"]
+testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"]
+torch = ["safetensors[numpy]", "torch (>=1.10)"]
 
 [[package]]
 name = "scipy"
-version = "1.11.2"
+version = "1.13.1"
 description = "Fundamental algorithms for scientific computing in Python"
 optional = false
-python-versions = "<3.13,>=3.9"
+python-versions = ">=3.9"
 files = [
-    {file = "scipy-1.11.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2b997a5369e2d30c97995dcb29d638701f8000d04df01b8e947f206e5d0ac788"},
-    {file = "scipy-1.11.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:95763fbda1206bec41157582bea482f50eb3702c85fffcf6d24394b071c0e87a"},
-    {file = "scipy-1.11.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e367904a0fec76433bf3fbf3e85bf60dae8e9e585ffd21898ab1085a29a04d16"},
-    {file = "scipy-1.11.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d690e1ca993c8f7ede6d22e5637541217fc6a4d3f78b3672a6fe454dbb7eb9a7"},
-    {file = "scipy-1.11.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d2b813bfbe8dec6a75164523de650bad41f4405d35b0fa24c2c28ae07fcefb20"},
-    {file = "scipy-1.11.2-cp310-cp310-win_amd64.whl", hash = "sha256:afdb0d983f6135d50770dd979df50bf1c7f58b5b33e0eb8cf5c73c70600eae1d"},
-    {file = "scipy-1.11.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8d9886f44ef8c9e776cb7527fb01455bf4f4a46c455c4682edc2c2cc8cd78562"},
-    {file = "scipy-1.11.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1342ca385c673208f32472830c10110a9dcd053cf0c4b7d4cd7026d0335a6c1d"},
-    {file = "scipy-1.11.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b133f237bd8ba73bad51bc12eb4f2d84cbec999753bf25ba58235e9fc2096d80"},
-    {file = "scipy-1.11.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aeb87661de987f8ec56fa6950863994cd427209158255a389fc5aea51fa7055"},
-    {file = "scipy-1.11.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:90d3b1364e751d8214e325c371f0ee0dd38419268bf4888b2ae1040a6b266b2a"},
-    {file = "scipy-1.11.2-cp311-cp311-win_amd64.whl", hash = "sha256:f73102f769ee06041a3aa26b5841359b1a93cc364ce45609657751795e8f4a4a"},
-    {file = "scipy-1.11.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fa4909c6c20c3d91480533cddbc0e7c6d849e7d9ded692918c76ce5964997898"},
-    {file = "scipy-1.11.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ac74b1512d38718fb6a491c439aa7b3605b96b1ed3be6599c17d49d6c60fca18"},
-    {file = "scipy-1.11.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8425fa963a32936c9773ee3ce44a765d8ff67eed5f4ac81dc1e4a819a238ee9"},
-    {file = "scipy-1.11.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:542a757e2a6ec409e71df3d8fd20127afbbacb1c07990cb23c5870c13953d899"},
-    {file = "scipy-1.11.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ea932570b1c2a30edafca922345854ff2cd20d43cd9123b6dacfdecebfc1a80b"},
-    {file = "scipy-1.11.2-cp312-cp312-win_amd64.whl", hash = "sha256:4447ad057d7597476f9862ecbd9285bbf13ba9d73ce25acfa4e4b11c6801b4c9"},
-    {file = "scipy-1.11.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b0620240ef445b5ddde52460e6bc3483b7c9c750275369379e5f609a1050911c"},
-    {file = "scipy-1.11.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:f28f1f6cfeb48339c192efc6275749b2a25a7e49c4d8369a28b6591da02fbc9a"},
-    {file = "scipy-1.11.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:214cdf04bbae7a54784f8431f976704ed607c4bc69ba0d5d5d6a9df84374df76"},
-    {file = "scipy-1.11.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10eb6af2f751aa3424762948e5352f707b0dece77288206f227864ddf675aca0"},
-    {file = "scipy-1.11.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0f3261f14b767b316d7137c66cc4f33a80ea05841b9c87ad83a726205b901423"},
-    {file = "scipy-1.11.2-cp39-cp39-win_amd64.whl", hash = "sha256:2c91cf049ffb5575917f2a01da1da082fd24ed48120d08a6e7297dfcac771dcd"},
-    {file = "scipy-1.11.2.tar.gz", hash = "sha256:b29318a5e39bd200ca4381d80b065cdf3076c7d7281c5e36569e99273867f61d"},
+    {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"},
+    {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"},
+    {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989"},
+    {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f"},
+    {file = "scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94"},
+    {file = "scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54"},
+    {file = "scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9"},
+    {file = "scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326"},
+    {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299"},
+    {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa"},
+    {file = "scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59"},
+    {file = "scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b"},
+    {file = "scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1"},
+    {file = "scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d"},
+    {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627"},
+    {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884"},
+    {file = "scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16"},
+    {file = "scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949"},
+    {file = "scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5"},
+    {file = "scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24"},
+    {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004"},
+    {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d"},
+    {file = "scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c"},
+    {file = "scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2"},
+    {file = "scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c"},
 ]
 
 [package.dependencies]
-numpy = ">=1.21.6,<1.28.0"
+numpy = ">=1.22.4,<2.3"
 
 [package.extras]
-dev = ["click", "cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"]
-doc = ["jupytext", "matplotlib (>2)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-design (>=0.2.0)"]
-test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
+dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"]
+doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"]
+test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
 
 [[package]]
 name = "sentencepiece"
@@ -1916,19 +2772,18 @@ files = [
 
 [[package]]
 name = "setuptools"
-version = "68.1.2"
+version = "70.0.0"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-68.1.2-py3-none-any.whl", hash = "sha256:3d8083eed2d13afc9426f227b24fd1659489ec107c0e86cec2ffdde5c92e790b"},
-    {file = "setuptools-68.1.2.tar.gz", hash = "sha256:3d4dfa6d95f1b101d695a6160a7626e15583af71a5f52176efa5d39a054d475d"},
+    {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
+    {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
 ]
 
 [package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5,<=7.1.2)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
-testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
 name = "six"
@@ -1943,82 +2798,158 @@ files = [
 
 [[package]]
 name = "sympy"
-version = "1.12"
+version = "1.12.1"
 description = "Computer algebra system (CAS) in Python"
-optional = false
+optional = true
 python-versions = ">=3.8"
 files = [
-    {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"},
-    {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"},
+    {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"},
+    {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"},
 ]
 
 [package.dependencies]
-mpmath = ">=0.19"
+mpmath = ">=1.1.0,<1.4.0"
+
+[[package]]
+name = "tbb"
+version = "2021.12.0"
+description = "Intel® oneAPI Threading Building Blocks (oneTBB)"
+optional = true
+python-versions = "*"
+files = [
+    {file = "tbb-2021.12.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:f2cc9a7f8ababaa506cbff796ce97c3bf91062ba521e15054394f773375d81d8"},
+    {file = "tbb-2021.12.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:a925e9a7c77d3a46ae31c34b0bb7f801c4118e857d137b68f68a8e458fcf2bd7"},
+    {file = "tbb-2021.12.0-py3-none-win32.whl", hash = "sha256:b1725b30c174048edc8be70bd43bb95473f396ce895d91151a474d0fa9f450a8"},
+    {file = "tbb-2021.12.0-py3-none-win_amd64.whl", hash = "sha256:fc2772d850229f2f3df85f1109c4844c495a2db7433d38200959ee9265b34789"},
+]
 
 [[package]]
 name = "texttable"
-version = "1.6.7"
+version = "1.7.0"
 description = "module to create simple ASCII tables"
 optional = true
 python-versions = "*"
 files = [
-    {file = "texttable-1.6.7-py2.py3-none-any.whl", hash = "sha256:b7b68139aa8a6339d2c320ca8b1dc42d13a7831a346b446cb9eb385f0c76310c"},
-    {file = "texttable-1.6.7.tar.gz", hash = "sha256:290348fb67f7746931bcdfd55ac7584ecd4e5b0846ab164333f0794b121760f2"},
+    {file = "texttable-1.7.0-py2.py3-none-any.whl", hash = "sha256:72227d592c82b3d7f672731ae73e4d1f88cd8e2ef5b075a7a7f01a23a3743917"},
+    {file = "texttable-1.7.0.tar.gz", hash = "sha256:2d2068fb55115807d3ac77a4ca68fa48803e84ebb0ee2340f858107a36522638"},
 ]
 
 [[package]]
 name = "tokenizers"
-version = "0.13.3"
-description = "Fast and Customizable Tokenizers"
+version = "0.19.1"
+description = ""
 optional = false
-python-versions = "*"
+python-versions = ">=3.7"
 files = [
-    {file = "tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:f3835c5be51de8c0a092058a4d4380cb9244fb34681fd0a295fbf0a52a5fdf33"},
-    {file = "tokenizers-0.13.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4ef4c3e821730f2692489e926b184321e887f34fb8a6b80b8096b966ba663d07"},
-    {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5fd1a6a25353e9aa762e2aae5a1e63883cad9f4e997c447ec39d071020459bc"},
-    {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee0b1b311d65beab83d7a41c56a1e46ab732a9eed4460648e8eb0bd69fc2d059"},
-    {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ef4215284df1277dadbcc5e17d4882bda19f770d02348e73523f7e7d8b8d396"},
-    {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4d53976079cff8a033f778fb9adca2d9d69d009c02fa2d71a878b5f3963ed30"},
-    {file = "tokenizers-0.13.3-cp310-cp310-win32.whl", hash = "sha256:1f0e3b4c2ea2cd13238ce43548959c118069db7579e5d40ec270ad77da5833ce"},
-    {file = "tokenizers-0.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:89649c00d0d7211e8186f7a75dfa1db6996f65edce4b84821817eadcc2d3c79e"},
-    {file = "tokenizers-0.13.3-cp311-cp311-macosx_10_11_universal2.whl", hash = "sha256:56b726e0d2bbc9243872b0144515ba684af5b8d8cd112fb83ee1365e26ec74c8"},
-    {file = "tokenizers-0.13.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc5c022ce692e1f499d745af293ab9ee6f5d92538ed2faf73f9708c89ee59ce6"},
-    {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f55c981ac44ba87c93e847c333e58c12abcbb377a0c2f2ef96e1a266e4184ff2"},
-    {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f247eae99800ef821a91f47c5280e9e9afaeed9980fc444208d5aa6ba69ff148"},
-    {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b3e3215d048e94f40f1c95802e45dcc37c5b05eb46280fc2ccc8cd351bff839"},
-    {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ba2b0bf01777c9b9bc94b53764d6684554ce98551fec496f71bc5be3a03e98b"},
-    {file = "tokenizers-0.13.3-cp311-cp311-win32.whl", hash = "sha256:cc78d77f597d1c458bf0ea7c2a64b6aa06941c7a99cb135b5969b0278824d808"},
-    {file = "tokenizers-0.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:ecf182bf59bd541a8876deccf0360f5ae60496fd50b58510048020751cf1724c"},
-    {file = "tokenizers-0.13.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:0527dc5436a1f6bf2c0327da3145687d3bcfbeab91fed8458920093de3901b44"},
-    {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07cbb2c307627dc99b44b22ef05ff4473aa7c7cc1fec8f0a8b37d8a64b1a16d2"},
-    {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4560dbdeaae5b7ee0d4e493027e3de6d53c991b5002d7ff95083c99e11dd5ac0"},
-    {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64064bd0322405c9374305ab9b4c07152a1474370327499911937fd4a76d004b"},
-    {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8c6e2ab0f2e3d939ca66aa1d596602105fe33b505cd2854a4c1717f704c51de"},
-    {file = "tokenizers-0.13.3-cp37-cp37m-win32.whl", hash = "sha256:6cc29d410768f960db8677221e497226e545eaaea01aa3613fa0fdf2cc96cff4"},
-    {file = "tokenizers-0.13.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fc2a7fdf864554a0dacf09d32e17c0caa9afe72baf9dd7ddedc61973bae352d8"},
-    {file = "tokenizers-0.13.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8791dedba834c1fc55e5f1521be325ea3dafb381964be20684b92fdac95d79b7"},
-    {file = "tokenizers-0.13.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:d607a6a13718aeb20507bdf2b96162ead5145bbbfa26788d6b833f98b31b26e1"},
-    {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3791338f809cd1bf8e4fee6b540b36822434d0c6c6bc47162448deee3f77d425"},
-    {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2f35f30e39e6aab8716f07790f646bdc6e4a853816cc49a95ef2a9016bf9ce6"},
-    {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:310204dfed5aa797128b65d63538a9837cbdd15da2a29a77d67eefa489edda26"},
-    {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0f9b92ea052305166559f38498b3b0cae159caea712646648aaa272f7160963"},
-    {file = "tokenizers-0.13.3-cp38-cp38-win32.whl", hash = "sha256:9a3fa134896c3c1f0da6e762d15141fbff30d094067c8f1157b9fdca593b5806"},
-    {file = "tokenizers-0.13.3-cp38-cp38-win_amd64.whl", hash = "sha256:8e7b0cdeace87fa9e760e6a605e0ae8fc14b7d72e9fc19c578116f7287bb873d"},
-    {file = "tokenizers-0.13.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:00cee1e0859d55507e693a48fa4aef07060c4bb6bd93d80120e18fea9371c66d"},
-    {file = "tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a23ff602d0797cea1d0506ce69b27523b07e70f6dda982ab8cf82402de839088"},
-    {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70ce07445050b537d2696022dafb115307abdffd2a5c106f029490f84501ef97"},
-    {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:280ffe95f50eaaf655b3a1dc7ff1d9cf4777029dbbc3e63a74e65a056594abc3"},
-    {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97acfcec592f7e9de8cadcdcda50a7134423ac8455c0166b28c9ff04d227b371"},
-    {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd7730c98a3010cd4f523465867ff95cd9d6430db46676ce79358f65ae39797b"},
-    {file = "tokenizers-0.13.3-cp39-cp39-win32.whl", hash = "sha256:48625a108029cb1ddf42e17a81b5a3230ba6888a70c9dc14e81bc319e812652d"},
-    {file = "tokenizers-0.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:bc0a6f1ba036e482db6453571c9e3e60ecd5489980ffd95d11dc9f960483d783"},
-    {file = "tokenizers-0.13.3.tar.gz", hash = "sha256:2e546dbb68b623008a5442353137fbb0123d311a6d7ba52f2667c8862a75af2e"},
+    {file = "tokenizers-0.19.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:952078130b3d101e05ecfc7fc3640282d74ed26bcf691400f872563fca15ac97"},
+    {file = "tokenizers-0.19.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:82c8b8063de6c0468f08e82c4e198763e7b97aabfe573fd4cf7b33930ca4df77"},
+    {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f03727225feaf340ceeb7e00604825addef622d551cbd46b7b775ac834c1e1c4"},
+    {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:453e4422efdfc9c6b6bf2eae00d5e323f263fff62b29a8c9cd526c5003f3f642"},
+    {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:02e81bf089ebf0e7f4df34fa0207519f07e66d8491d963618252f2e0729e0b46"},
+    {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b07c538ba956843833fee1190cf769c60dc62e1cf934ed50d77d5502194d63b1"},
+    {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e28cab1582e0eec38b1f38c1c1fb2e56bce5dc180acb1724574fc5f47da2a4fe"},
+    {file = "tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e"},
+    {file = "tokenizers-0.19.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7fb297edec6c6841ab2e4e8f357209519188e4a59b557ea4fafcf4691d1b4c98"},
+    {file = "tokenizers-0.19.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2e8a3dd055e515df7054378dc9d6fa8c8c34e1f32777fb9a01fea81496b3f9d3"},
+    {file = "tokenizers-0.19.1-cp310-none-win32.whl", hash = "sha256:7ff898780a155ea053f5d934925f3902be2ed1f4d916461e1a93019cc7250837"},
+    {file = "tokenizers-0.19.1-cp310-none-win_amd64.whl", hash = "sha256:bea6f9947e9419c2fda21ae6c32871e3d398cba549b93f4a65a2d369662d9403"},
+    {file = "tokenizers-0.19.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:5c88d1481f1882c2e53e6bb06491e474e420d9ac7bdff172610c4f9ad3898059"},
+    {file = "tokenizers-0.19.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ddf672ed719b4ed82b51499100f5417d7d9f6fb05a65e232249268f35de5ed14"},
+    {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:dadc509cc8a9fe460bd274c0e16ac4184d0958117cf026e0ea8b32b438171594"},
+    {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfedf31824ca4915b511b03441784ff640378191918264268e6923da48104acc"},
+    {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac11016d0a04aa6487b1513a3a36e7bee7eec0e5d30057c9c0408067345c48d2"},
+    {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76951121890fea8330d3a0df9a954b3f2a37e3ec20e5b0530e9a0044ca2e11fe"},
+    {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b342d2ce8fc8d00f376af068e3274e2e8649562e3bc6ae4a67784ded6b99428d"},
+    {file = "tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d16ff18907f4909dca9b076b9c2d899114dd6abceeb074eca0c93e2353f943aa"},
+    {file = "tokenizers-0.19.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:706a37cc5332f85f26efbe2bdc9ef8a9b372b77e4645331a405073e4b3a8c1c6"},
+    {file = "tokenizers-0.19.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:16baac68651701364b0289979ecec728546133e8e8fe38f66fe48ad07996b88b"},
+    {file = "tokenizers-0.19.1-cp311-none-win32.whl", hash = "sha256:9ed240c56b4403e22b9584ee37d87b8bfa14865134e3e1c3fb4b2c42fafd3256"},
+    {file = "tokenizers-0.19.1-cp311-none-win_amd64.whl", hash = "sha256:ad57d59341710b94a7d9dbea13f5c1e7d76fd8d9bcd944a7a6ab0b0da6e0cc66"},
+    {file = "tokenizers-0.19.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:621d670e1b1c281a1c9698ed89451395d318802ff88d1fc1accff0867a06f153"},
+    {file = "tokenizers-0.19.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d924204a3dbe50b75630bd16f821ebda6a5f729928df30f582fb5aade90c818a"},
+    {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:4f3fefdc0446b1a1e6d81cd4c07088ac015665d2e812f6dbba4a06267d1a2c95"},
+    {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9620b78e0b2d52ef07b0d428323fb34e8ea1219c5eac98c2596311f20f1f9266"},
+    {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04ce49e82d100594715ac1b2ce87d1a36e61891a91de774755f743babcd0dd52"},
+    {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5c2ff13d157afe413bf7e25789879dd463e5a4abfb529a2d8f8473d8042e28f"},
+    {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3174c76efd9d08f836bfccaca7cfec3f4d1c0a4cf3acbc7236ad577cc423c840"},
+    {file = "tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c9d5b6c0e7a1e979bec10ff960fae925e947aab95619a6fdb4c1d8ff3708ce3"},
+    {file = "tokenizers-0.19.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a179856d1caee06577220ebcfa332af046d576fb73454b8f4d4b0ba8324423ea"},
+    {file = "tokenizers-0.19.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:952b80dac1a6492170f8c2429bd11fcaa14377e097d12a1dbe0ef2fb2241e16c"},
+    {file = "tokenizers-0.19.1-cp312-none-win32.whl", hash = "sha256:01d62812454c188306755c94755465505836fd616f75067abcae529c35edeb57"},
+    {file = "tokenizers-0.19.1-cp312-none-win_amd64.whl", hash = "sha256:b70bfbe3a82d3e3fb2a5e9b22a39f8d1740c96c68b6ace0086b39074f08ab89a"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:bb9dfe7dae85bc6119d705a76dc068c062b8b575abe3595e3c6276480e67e3f1"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:1f0360cbea28ea99944ac089c00de7b2e3e1c58f479fb8613b6d8d511ce98267"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:71e3ec71f0e78780851fef28c2a9babe20270404c921b756d7c532d280349214"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b82931fa619dbad979c0ee8e54dd5278acc418209cc897e42fac041f5366d626"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e8ff5b90eabdcdaa19af697885f70fe0b714ce16709cf43d4952f1f85299e73a"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e742d76ad84acbdb1a8e4694f915fe59ff6edc381c97d6dfdd054954e3478ad4"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d8c5d59d7b59885eab559d5bc082b2985555a54cda04dda4c65528d90ad252ad"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b2da5c32ed869bebd990c9420df49813709e953674c0722ff471a116d97b22d"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:638e43936cc8b2cbb9f9d8dde0fe5e7e30766a3318d2342999ae27f68fdc9bd6"},
+    {file = "tokenizers-0.19.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:78e769eb3b2c79687d9cb0f89ef77223e8e279b75c0a968e637ca7043a84463f"},
+    {file = "tokenizers-0.19.1-cp37-none-win32.whl", hash = "sha256:72791f9bb1ca78e3ae525d4782e85272c63faaef9940d92142aa3eb79f3407a3"},
+    {file = "tokenizers-0.19.1-cp37-none-win_amd64.whl", hash = "sha256:f3bbb7a0c5fcb692950b041ae11067ac54826204318922da754f908d95619fbc"},
+    {file = "tokenizers-0.19.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:07f9295349bbbcedae8cefdbcfa7f686aa420be8aca5d4f7d1ae6016c128c0c5"},
+    {file = "tokenizers-0.19.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:10a707cc6c4b6b183ec5dbfc5c34f3064e18cf62b4a938cb41699e33a99e03c1"},
+    {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6309271f57b397aa0aff0cbbe632ca9d70430839ca3178bf0f06f825924eca22"},
+    {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ad23d37d68cf00d54af184586d79b84075ada495e7c5c0f601f051b162112dc"},
+    {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:427c4f0f3df9109314d4f75b8d1f65d9477033e67ffaec4bca53293d3aca286d"},
+    {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e83a31c9cf181a0a3ef0abad2b5f6b43399faf5da7e696196ddd110d332519ee"},
+    {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c27b99889bd58b7e301468c0838c5ed75e60c66df0d4db80c08f43462f82e0d3"},
+    {file = "tokenizers-0.19.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac0b0eb952412b0b196ca7a40e7dce4ed6f6926489313414010f2e6b9ec2adf"},
+    {file = "tokenizers-0.19.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8a6298bde623725ca31c9035a04bf2ef63208d266acd2bed8c2cb7d2b7d53ce6"},
+    {file = "tokenizers-0.19.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:08a44864e42fa6d7d76d7be4bec62c9982f6f6248b4aa42f7302aa01e0abfd26"},
+    {file = "tokenizers-0.19.1-cp38-none-win32.whl", hash = "sha256:1de5bc8652252d9357a666e609cb1453d4f8e160eb1fb2830ee369dd658e8975"},
+    {file = "tokenizers-0.19.1-cp38-none-win_amd64.whl", hash = "sha256:0bcce02bf1ad9882345b34d5bd25ed4949a480cf0e656bbd468f4d8986f7a3f1"},
+    {file = "tokenizers-0.19.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:0b9394bd204842a2a1fd37fe29935353742be4a3460b6ccbaefa93f58a8df43d"},
+    {file = "tokenizers-0.19.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4692ab92f91b87769d950ca14dbb61f8a9ef36a62f94bad6c82cc84a51f76f6a"},
+    {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6258c2ef6f06259f70a682491c78561d492e885adeaf9f64f5389f78aa49a051"},
+    {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85cf76561fbd01e0d9ea2d1cbe711a65400092bc52b5242b16cfd22e51f0c58"},
+    {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:670b802d4d82bbbb832ddb0d41df7015b3e549714c0e77f9bed3e74d42400fbe"},
+    {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:85aa3ab4b03d5e99fdd31660872249df5e855334b6c333e0bc13032ff4469c4a"},
+    {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbf001afbbed111a79ca47d75941e9e5361297a87d186cbfc11ed45e30b5daba"},
+    {file = "tokenizers-0.19.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4c89aa46c269e4e70c4d4f9d6bc644fcc39bb409cb2a81227923404dd6f5227"},
+    {file = "tokenizers-0.19.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:39c1ec76ea1027438fafe16ecb0fb84795e62e9d643444c1090179e63808c69d"},
+    {file = "tokenizers-0.19.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c2a0d47a89b48d7daa241e004e71fb5a50533718897a4cd6235cb846d511a478"},
+    {file = "tokenizers-0.19.1-cp39-none-win32.whl", hash = "sha256:61b7fe8886f2e104d4caf9218b157b106207e0f2a4905c9c7ac98890688aabeb"},
+    {file = "tokenizers-0.19.1-cp39-none-win_amd64.whl", hash = "sha256:f97660f6c43efd3e0bfd3f2e3e5615bf215680bad6ee3d469df6454b8c6e8256"},
+    {file = "tokenizers-0.19.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3b11853f17b54c2fe47742c56d8a33bf49ce31caf531e87ac0d7d13d327c9334"},
+    {file = "tokenizers-0.19.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d26194ef6c13302f446d39972aaa36a1dda6450bc8949f5eb4c27f51191375bd"},
+    {file = "tokenizers-0.19.1-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e8d1ed93beda54bbd6131a2cb363a576eac746d5c26ba5b7556bc6f964425594"},
+    {file = "tokenizers-0.19.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca407133536f19bdec44b3da117ef0d12e43f6d4b56ac4c765f37eca501c7bda"},
+    {file = "tokenizers-0.19.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce05fde79d2bc2e46ac08aacbc142bead21614d937aac950be88dc79f9db9022"},
+    {file = "tokenizers-0.19.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:35583cd46d16f07c054efd18b5d46af4a2f070a2dd0a47914e66f3ff5efb2b1e"},
+    {file = "tokenizers-0.19.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:43350270bfc16b06ad3f6f07eab21f089adb835544417afda0f83256a8bf8b75"},
+    {file = "tokenizers-0.19.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b4399b59d1af5645bcee2072a463318114c39b8547437a7c2d6a186a1b5a0e2d"},
+    {file = "tokenizers-0.19.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6852c5b2a853b8b0ddc5993cd4f33bfffdca4fcc5d52f89dd4b8eada99379285"},
+    {file = "tokenizers-0.19.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bcd266ae85c3d39df2f7e7d0e07f6c41a55e9a3123bb11f854412952deacd828"},
+    {file = "tokenizers-0.19.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ecb2651956eea2aa0a2d099434134b1b68f1c31f9a5084d6d53f08ed43d45ff2"},
+    {file = "tokenizers-0.19.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:b279ab506ec4445166ac476fb4d3cc383accde1ea152998509a94d82547c8e2a"},
+    {file = "tokenizers-0.19.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:89183e55fb86e61d848ff83753f64cded119f5d6e1f553d14ffee3700d0a4a49"},
+    {file = "tokenizers-0.19.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2edbc75744235eea94d595a8b70fe279dd42f3296f76d5a86dde1d46e35f574"},
+    {file = "tokenizers-0.19.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:0e64bfde9a723274e9a71630c3e9494ed7b4c0f76a1faacf7fe294cd26f7ae7c"},
+    {file = "tokenizers-0.19.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0b5ca92bfa717759c052e345770792d02d1f43b06f9e790ca0a1db62838816f3"},
+    {file = "tokenizers-0.19.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f8a20266e695ec9d7a946a019c1d5ca4eddb6613d4f466888eee04f16eedb85"},
+    {file = "tokenizers-0.19.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63c38f45d8f2a2ec0f3a20073cccb335b9f99f73b3c69483cd52ebc75369d8a1"},
+    {file = "tokenizers-0.19.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dd26e3afe8a7b61422df3176e06664503d3f5973b94f45d5c45987e1cb711876"},
+    {file = "tokenizers-0.19.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:eddd5783a4a6309ce23432353cdb36220e25cbb779bfa9122320666508b44b88"},
+    {file = "tokenizers-0.19.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:56ae39d4036b753994476a1b935584071093b55c7a72e3b8288e68c313ca26e7"},
+    {file = "tokenizers-0.19.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:f9939ca7e58c2758c01b40324a59c034ce0cebad18e0d4563a9b1beab3018243"},
+    {file = "tokenizers-0.19.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6c330c0eb815d212893c67a032e9dc1b38a803eccb32f3e8172c19cc69fbb439"},
+    {file = "tokenizers-0.19.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec11802450a2487cdf0e634b750a04cbdc1c4d066b97d94ce7dd2cb51ebb325b"},
+    {file = "tokenizers-0.19.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2b718f316b596f36e1dae097a7d5b91fc5b85e90bf08b01ff139bd8953b25af"},
+    {file = "tokenizers-0.19.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:ed69af290c2b65169f0ba9034d1dc39a5db9459b32f1dd8b5f3f32a3fcf06eab"},
+    {file = "tokenizers-0.19.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f8a9c828277133af13f3859d1b6bf1c3cb6e9e1637df0e45312e6b7c2e622b1f"},
+    {file = "tokenizers-0.19.1.tar.gz", hash = "sha256:ee59e6680ed0fdbe6b724cf38bd70400a0c1dd623b07ac729087270caeac88e3"},
 ]
 
+[package.dependencies]
+huggingface-hub = ">=0.16.4,<1.0"
+
 [package.extras]
-dev = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"]
+dev = ["tokenizers[testing]"]
 docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"]
-testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"]
+testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"]
 
 [[package]]
 name = "tomli"
@@ -2033,52 +2964,67 @@ files = [
 
 [[package]]
 name = "torch"
-version = "2.0.1"
+version = "2.3.0"
 description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration"
-optional = false
+optional = true
 python-versions = ">=3.8.0"
 files = [
-    {file = "torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:8ced00b3ba471856b993822508f77c98f48a458623596a4c43136158781e306a"},
-    {file = "torch-2.0.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:359bfaad94d1cda02ab775dc1cc386d585712329bb47b8741607ef6ef4950747"},
-    {file = "torch-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7c84e44d9002182edd859f3400deaa7410f5ec948a519cc7ef512c2f9b34d2c4"},
-    {file = "torch-2.0.1-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:567f84d657edc5582d716900543e6e62353dbe275e61cdc36eda4929e46df9e7"},
-    {file = "torch-2.0.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:787b5a78aa7917465e9b96399b883920c88a08f4eb63b5a5d2d1a16e27d2f89b"},
-    {file = "torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e617b1d0abaf6ced02dbb9486803abfef0d581609b09641b34fa315c9c40766d"},
-    {file = "torch-2.0.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b6019b1de4978e96daa21d6a3ebb41e88a0b474898fe251fd96189587408873e"},
-    {file = "torch-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbd68cbd1cd9da32fe5d294dd3411509b3d841baecb780b38b3b7b06c7754434"},
-    {file = "torch-2.0.1-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:ef654427d91600129864644e35deea761fb1fe131710180b952a6f2e2207075e"},
-    {file = "torch-2.0.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:25aa43ca80dcdf32f13da04c503ec7afdf8e77e3a0183dd85cd3e53b2842e527"},
-    {file = "torch-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5ef3ea3d25441d3957348f7e99c7824d33798258a2bf5f0f0277cbcadad2e20d"},
-    {file = "torch-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0882243755ff28895e8e6dc6bc26ebcf5aa0911ed81b2a12f241fc4b09075b13"},
-    {file = "torch-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:f66aa6b9580a22b04d0af54fcd042f52406a8479e2b6a550e3d9f95963e168c8"},
-    {file = "torch-2.0.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:1adb60d369f2650cac8e9a95b1d5758e25d526a34808f7448d0bd599e4ae9072"},
-    {file = "torch-2.0.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:1bcffc16b89e296826b33b98db5166f990e3b72654a2b90673e817b16c50e32b"},
-    {file = "torch-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e10e1597f2175365285db1b24019eb6f04d53dcd626c735fc502f1e8b6be9875"},
-    {file = "torch-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:423e0ae257b756bb45a4b49072046772d1ad0c592265c5080070e0767da4e490"},
-    {file = "torch-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:8742bdc62946c93f75ff92da00e3803216c6cce9b132fbca69664ca38cfb3e18"},
-    {file = "torch-2.0.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:c62df99352bd6ee5a5a8d1832452110435d178b5164de450831a3a8cc14dc680"},
-    {file = "torch-2.0.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:671a2565e3f63b8fe8e42ae3e36ad249fe5e567435ea27b94edaa672a7d0c416"},
+    {file = "torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:d8ea5a465dbfd8501f33c937d1f693176c9aef9d1c1b0ca1d44ed7b0a18c52ac"},
+    {file = "torch-2.3.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:09c81c5859a5b819956c6925a405ef1cdda393c9d8a01ce3851453f699d3358c"},
+    {file = "torch-2.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:1bf023aa20902586f614f7682fedfa463e773e26c58820b74158a72470259459"},
+    {file = "torch-2.3.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:758ef938de87a2653bba74b91f703458c15569f1562bf4b6c63c62d9c5a0c1f5"},
+    {file = "torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:493d54ee2f9df100b5ce1d18c96dbb8d14908721f76351e908c9d2622773a788"},
+    {file = "torch-2.3.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:bce43af735c3da16cc14c7de2be7ad038e2fbf75654c2e274e575c6c05772ace"},
+    {file = "torch-2.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:729804e97b7cf19ae9ab4181f91f5e612af07956f35c8b2c8e9d9f3596a8e877"},
+    {file = "torch-2.3.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:d24e328226d8e2af7cf80fcb1d2f1d108e0de32777fab4aaa2b37b9765d8be73"},
+    {file = "torch-2.3.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:b0de2bdc0486ea7b14fc47ff805172df44e421a7318b7c4d92ef589a75d27410"},
+    {file = "torch-2.3.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:a306c87a3eead1ed47457822c01dfbd459fe2920f2d38cbdf90de18f23f72542"},
+    {file = "torch-2.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9b98bf1a3c8af2d4c41f0bf1433920900896c446d1ddc128290ff146d1eb4bd"},
+    {file = "torch-2.3.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:dca986214267b34065a79000cee54232e62b41dff1ec2cab9abc3fc8b3dee0ad"},
+    {file = "torch-2.3.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:20572f426965dd8a04e92a473d7e445fa579e09943cc0354f3e6fef6130ce061"},
+    {file = "torch-2.3.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e65ba85ae292909cde0dde6369826d51165a3fc8823dc1854cd9432d7f79b932"},
+    {file = "torch-2.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:5515503a193781fd1b3f5c474e89c9dfa2faaa782b2795cc4a7ab7e67de923f6"},
+    {file = "torch-2.3.0-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:6ae9f64b09516baa4ef890af0672dc981c20b1f0d829ce115d4420a247e88fba"},
+    {file = "torch-2.3.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:cd0dc498b961ab19cb3f8dbf0c6c50e244f2f37dbfa05754ab44ea057c944ef9"},
+    {file = "torch-2.3.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:e05f836559251e4096f3786ee99f4a8cbe67bc7fbedba8ad5e799681e47c5e80"},
+    {file = "torch-2.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:4fb27b35dbb32303c2927da86e27b54a92209ddfb7234afb1949ea2b3effffea"},
+    {file = "torch-2.3.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:760f8bedff506ce9e6e103498f9b1e9e15809e008368594c3a66bf74a8a51380"},
 ]
 
 [package.dependencies]
 filelock = "*"
+fsspec = "*"
 jinja2 = "*"
+mkl = {version = ">=2021.1.1,<=2021.4.0", markers = "platform_system == \"Windows\""}
 networkx = "*"
+nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cudnn-cu12 = {version = "8.9.2.26", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
+nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""}
 sympy = "*"
-typing-extensions = "*"
+triton = {version = "2.3.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.12\""}
+typing-extensions = ">=4.8.0"
 
 [package.extras]
 opt-einsum = ["opt-einsum (>=3.3)"]
+optree = ["optree (>=0.9.1)"]
 
 [[package]]
 name = "tqdm"
-version = "4.66.1"
+version = "4.66.4"
 description = "Fast, Extensible Progress Meter"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"},
-    {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"},
+    {file = "tqdm-4.66.4-py3-none-any.whl", hash = "sha256:b75ca56b413b030bc3f00af51fd2c1a1a5eac6a0c1cca83cbb37a5c52abce644"},
+    {file = "tqdm-4.66.4.tar.gz", hash = "sha256:e4d936c9de8727928f3be6079590e97d9abfe8d39a590be678eb5919ffc186bb"},
 ]
 
 [package.dependencies]
@@ -2092,72 +3038,92 @@ telegram = ["requests"]
 
 [[package]]
 name = "transformers"
-version = "4.32.1"
+version = "4.41.2"
 description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "transformers-4.32.1-py3-none-any.whl", hash = "sha256:b930d3dbd907a3f300cf49e54d63a56f8a0ab16b01a2c2a61ecff37c6de1da08"},
-    {file = "transformers-4.32.1.tar.gz", hash = "sha256:1edc8ae1de357d97c3d36b04412aa63d55e6fc0c4b39b419a7d380ed947d2252"},
+    {file = "transformers-4.41.2-py3-none-any.whl", hash = "sha256:05555d20e43f808de1ef211ab64803cdb513170cef70d29a888b589caebefc67"},
+    {file = "transformers-4.41.2.tar.gz", hash = "sha256:80a4db216533d573e9cc7388646c31ed9480918feb7c55eb211249cb23567f87"},
 ]
 
 [package.dependencies]
 filelock = "*"
-huggingface-hub = ">=0.15.1,<1.0"
+huggingface-hub = ">=0.23.0,<1.0"
 numpy = ">=1.17"
 packaging = ">=20.0"
 pyyaml = ">=5.1"
 regex = "!=2019.12.17"
 requests = "*"
-safetensors = ">=0.3.1"
-tokenizers = ">=0.11.1,<0.11.3 || >0.11.3,<0.14"
+safetensors = ">=0.4.1"
+tokenizers = ">=0.19,<0.20"
 tqdm = ">=4.27"
 
 [package.extras]
-accelerate = ["accelerate (>=0.20.3)"]
-agents = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.9,!=1.12.0)"]
-all = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision"]
+accelerate = ["accelerate (>=0.21.0)"]
+agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"]
+all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision"]
 audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 codecarbon = ["codecarbon (==1.2.0)"]
-deepspeed = ["accelerate (>=0.20.3)", "deepspeed (>=0.9.3)"]
-deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "timeout-decorator"]
-dev = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
-dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "urllib3 (<2.0.0)"]
-dev-torch = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
-docs = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "torchaudio", "torchvision"]
-docs-specific = ["hf-doc-builder"]
-fairscale = ["fairscale (>0.3)"]
-flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"]
+deepspeed = ["accelerate (>=0.21.0)", "deepspeed (>=0.9.3)"]
+deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
+dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.19,<0.20)", "urllib3 (<2.0.0)"]
+dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.19,<0.20)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
+flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
 flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 ftfy = ["ftfy"]
-integrations = ["optuna", "ray[tune]", "sigopt"]
+integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"]
 ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"]
 modelcreation = ["cookiecutter (==1.7.3)"]
-natten = ["natten (>=0.14.6)"]
+natten = ["natten (>=0.14.6,<0.15.0)"]
 onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"]
 onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
 optuna = ["optuna"]
-quality = ["GitPython (<3.1.19)", "black (>=23.1,<24.0)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (>=0.0.241,<=0.0.259)", "urllib3 (<2.0.0)"]
-ray = ["ray[tune]"]
+quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<2.0.0)"]
+ray = ["ray[tune] (>=2.7.0)"]
 retrieval = ["datasets (!=2.5.0)", "faiss-cpu"]
 sagemaker = ["sagemaker (>=2.31.0)"]
 sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"]
-serving = ["fastapi", "pydantic (<2)", "starlette", "uvicorn"]
+serving = ["fastapi", "pydantic", "starlette", "uvicorn"]
 sigopt = ["sigopt"]
 sklearn = ["scikit-learn"]
 speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
-testing = ["GitPython (<3.1.19)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "timeout-decorator"]
-tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx"]
-tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.14)", "tensorflow-text (<2.14)", "tf2onnx"]
+testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
+tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
+tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
 tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
 timm = ["timm"]
-tokenizers = ["tokenizers (>=0.11.1,!=0.11.3,<0.14)"]
-torch = ["accelerate (>=0.20.3)", "torch (>=1.9,!=1.12.0)"]
+tokenizers = ["tokenizers (>=0.19,<0.20)"]
+torch = ["accelerate (>=0.21.0)", "torch"]
 torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
-torch-vision = ["Pillow (<10.0.0)", "torchvision"]
-torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.9,!=1.12.0)", "tqdm (>=4.27)"]
+torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
+torchhub = ["filelock", "huggingface-hub (>=0.23.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.19,<0.20)", "torch", "tqdm (>=4.27)"]
 video = ["av (==9.2.0)", "decord (==0.6.0)"]
-vision = ["Pillow (<10.0.0)"]
+vision = ["Pillow (>=10.0.1,<=15.0)"]
+
+[[package]]
+name = "triton"
+version = "2.3.0"
+description = "A language and compiler for custom Deep Learning operations"
+optional = true
+python-versions = "*"
+files = [
+    {file = "triton-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ce4b8ff70c48e47274c66f269cce8861cf1dc347ceeb7a67414ca151b1822d8"},
+    {file = "triton-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c3d9607f85103afdb279938fc1dd2a66e4f5999a58eb48a346bd42738f986dd"},
+    {file = "triton-2.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:218d742e67480d9581bafb73ed598416cc8a56f6316152e5562ee65e33de01c0"},
+    {file = "triton-2.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:381ec6b3dac06922d3e4099cfc943ef032893b25415de295e82b1a82b0359d2c"},
+    {file = "triton-2.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:038e06a09c06a164fef9c48de3af1e13a63dc1ba3c792871e61a8e79720ea440"},
+    {file = "triton-2.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d8f636e0341ac348899a47a057c3daea99ea7db31528a225a3ba4ded28ccc65"},
+]
+
+[package.dependencies]
+filelock = "*"
+
+[package.extras]
+build = ["cmake (>=3.20)", "lit"]
+tests = ["autopep8", "flake8", "isort", "numpy", "pytest", "scipy (>=1.7.1)", "torch"]
+tutorials = ["matplotlib", "pandas", "tabulate", "torch"]
 
 [[package]]
 name = "typer"
@@ -2181,40 +3147,40 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.
 
 [[package]]
 name = "typing-extensions"
-version = "4.7.1"
-description = "Backported and Experimental Type Hints for Python 3.7+"
+version = "4.12.1"
+description = "Backported and Experimental Type Hints for Python 3.8+"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"},
-    {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"},
+    {file = "typing_extensions-4.12.1-py3-none-any.whl", hash = "sha256:6024b58b69089e5a89c347397254e35f1bf02a907728ec7fee9bf0fe837d203a"},
+    {file = "typing_extensions-4.12.1.tar.gz", hash = "sha256:915f5e35ff76f56588223f15fdd5938f9a1cf9195c0de25130c627e4d597f6d1"},
 ]
 
 [[package]]
 name = "tzdata"
-version = "2023.3"
+version = "2024.1"
 description = "Provider of IANA time zone data"
 optional = true
 python-versions = ">=2"
 files = [
-    {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"},
-    {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"},
+    {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"},
+    {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
 ]
 
 [[package]]
 name = "urllib3"
-version = "2.0.4"
+version = "2.2.1"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "urllib3-2.0.4-py3-none-any.whl", hash = "sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4"},
-    {file = "urllib3-2.0.4.tar.gz", hash = "sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11"},
+    {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
+    {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
 ]
 
 [package.extras]
 brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
-secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"]
+h2 = ["h2 (>=4,<5)"]
 socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
 zstd = ["zstandard (>=0.18.0)"]
 
@@ -2234,275 +3200,327 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]
 
 [[package]]
 name = "wrapt"
-version = "1.15.0"
+version = "1.16.0"
 description = "Module for decorators, wrappers and monkey patching."
 optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
+python-versions = ">=3.6"
 files = [
-    {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"},
-    {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29"},
-    {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2"},
-    {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46"},
-    {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c"},
-    {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09"},
-    {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079"},
-    {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e"},
-    {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a"},
-    {file = "wrapt-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923"},
-    {file = "wrapt-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee"},
-    {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727"},
-    {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7"},
-    {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0"},
-    {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec"},
-    {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90"},
-    {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975"},
-    {file = "wrapt-1.15.0-cp310-cp310-win32.whl", hash = "sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1"},
-    {file = "wrapt-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e"},
-    {file = "wrapt-1.15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7"},
-    {file = "wrapt-1.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72"},
-    {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb"},
-    {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e"},
-    {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c"},
-    {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3"},
-    {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92"},
-    {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98"},
-    {file = "wrapt-1.15.0-cp311-cp311-win32.whl", hash = "sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416"},
-    {file = "wrapt-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705"},
-    {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29"},
-    {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd"},
-    {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb"},
-    {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248"},
-    {file = "wrapt-1.15.0-cp35-cp35m-win32.whl", hash = "sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559"},
-    {file = "wrapt-1.15.0-cp35-cp35m-win_amd64.whl", hash = "sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639"},
-    {file = "wrapt-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba"},
-    {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752"},
-    {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364"},
-    {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475"},
-    {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8"},
-    {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418"},
-    {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2"},
-    {file = "wrapt-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1"},
-    {file = "wrapt-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420"},
-    {file = "wrapt-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317"},
-    {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e"},
-    {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e"},
-    {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0"},
-    {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019"},
-    {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034"},
-    {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653"},
-    {file = "wrapt-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0"},
-    {file = "wrapt-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e"},
-    {file = "wrapt-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145"},
-    {file = "wrapt-1.15.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f"},
-    {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd"},
-    {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b"},
-    {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f"},
-    {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6"},
-    {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094"},
-    {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7"},
-    {file = "wrapt-1.15.0-cp38-cp38-win32.whl", hash = "sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b"},
-    {file = "wrapt-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1"},
-    {file = "wrapt-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86"},
-    {file = "wrapt-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c"},
-    {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d"},
-    {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc"},
-    {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29"},
-    {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a"},
-    {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8"},
-    {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9"},
-    {file = "wrapt-1.15.0-cp39-cp39-win32.whl", hash = "sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff"},
-    {file = "wrapt-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6"},
-    {file = "wrapt-1.15.0-py3-none-any.whl", hash = "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640"},
-    {file = "wrapt-1.15.0.tar.gz", hash = "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a"},
+    {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"},
+    {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"},
+    {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"},
+    {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"},
+    {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"},
+    {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"},
+    {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"},
+    {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"},
+    {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"},
+    {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"},
+    {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"},
+    {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"},
+    {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"},
+    {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"},
+    {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"},
+    {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"},
+    {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"},
+    {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"},
+    {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"},
+    {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"},
+    {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"},
+    {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"},
+    {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"},
+    {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"},
+    {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"},
+    {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"},
+    {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"},
+    {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"},
+    {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"},
+    {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"},
+    {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"},
+    {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"},
+    {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"},
+    {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"},
+    {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"},
+    {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"},
+    {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"},
+    {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"},
+    {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"},
+    {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"},
+    {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"},
+    {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"},
+    {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"},
+    {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"},
+    {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"},
+    {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"},
+    {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"},
+    {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"},
+    {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"},
+    {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"},
+    {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"},
+    {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"},
+    {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"},
+    {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"},
+    {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"},
+    {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"},
+    {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"},
+    {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"},
+    {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"},
+    {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"},
+    {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"},
+    {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"},
+    {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"},
+    {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"},
+    {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"},
+    {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"},
+    {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"},
+    {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"},
+    {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"},
+    {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"},
 ]
 
 [[package]]
 name = "xxhash"
-version = "3.3.0"
+version = "3.4.1"
 description = "Python binding for xxHash"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "xxhash-3.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:70ef7288d1cb1ad16e02d101ea43bb0e392d985d60b9b0035aee80663530960d"},
-    {file = "xxhash-3.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:44ff8c673cab50be46784e0aec62aa6f0ca9ea765e2b0690e8945d0cd950dcaf"},
-    {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfebc90273ae2beb813d8118a2bfffb5a5a81ac054fbfd061ea18fd0a81db0ac"},
-    {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9084e68bedbd665c7e9241a7b597c28f4775edeb3941bf608ecb38732a5f8fb5"},
-    {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d72493a14a3e89564b1a6c7400b9b40621e8f4692410706ef27c66aeadc7b431"},
-    {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98779cbe9068dd7734cc3210693894d5cc9b156920e9c336f10fb99f46bebbd8"},
-    {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:499f8a12767dd28b98ab6b7c7da7d294564e4c9024a2aaa5d0b0b98a8bef2f92"},
-    {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4dabda7f42c548f98d8e07e390bda2953fc58302c0e07ded7b3fe0637e7ecd2f"},
-    {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c416409646c793c46370f0f1859253302ee70aeda5278c2a0ca41462f8ec1244"},
-    {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b8bd31aaad8a80a7302730676cec26bea3ef1fd9835875aa47fea073aca9fe05"},
-    {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:3af8e3bcd630f905efbdfe7a51b51fc1ca3c9dca8b155f841925f3ad41685d41"},
-    {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d86b79c707fc7025d967af71db652429a06a8179175e45bd2e9f17b8af6f5949"},
-    {file = "xxhash-3.3.0-cp310-cp310-win32.whl", hash = "sha256:98fe771f36ee9d3a1f5741424a956a2ba9651d9508a9f64a024b57f2cf796414"},
-    {file = "xxhash-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:0a65131f7f731ecf7e3dd27f09d877aff3000a79a446caaa2c0d8d0ec0bc7186"},
-    {file = "xxhash-3.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a9761e425e79d23797fa0bec2d781dbadb9fe5dcc2bf69030855f5e393c3bec8"},
-    {file = "xxhash-3.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d28c7ef1deb3c3ac5f5290176ca3d501daa97c2e1f7443bf5d8b61ac651794b2"},
-    {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:701b7cefffc25de1b7ddfae6505da70a3b3a11e312c2e2b33b09e180bbceb43d"},
-    {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b1644f8b8e19a242c3047a089541067248a651038cabb9fcab3c13eb1dfcd757"},
-    {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20e7d0e3488cc0f0dbe360731b7fe32e1f2df46bf2de2db3317d301efb93084c"},
-    {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:156c52eca2b20f9839723bef0b929a290f6c2f1c98ccb24e82f58f96f3c16007"},
-    {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d6ce4d3828d79044ed08994e196c20f69c18133ed8a4286afe3e98989adeeac"},
-    {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b85b63757ade2439c8d7d71842c40d42c0ab3b69279ed02afbd3b1635f7d2b4b"},
-    {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b2b9051e40b7b649a9a2a38fb223ca6a593d332012df885746b81968948f9435"},
-    {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:81b7ce050f26fc1daaaa0d24e320815306736d14608e1ba31920e693a7ca9afb"},
-    {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:7442500fcce71669953ca959682dcd47452bc3f9c95c8d88315874aeabec9f82"},
-    {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:36a05bf59a515cfb07f3f83373c527fff2ecaa77eaf30c968c788aea582070a1"},
-    {file = "xxhash-3.3.0-cp311-cp311-win32.whl", hash = "sha256:da16f9cd62c6fde74683be1b28c28ef865e706da13e3bee4ba836fcc520de0cc"},
-    {file = "xxhash-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:40fd49ef6964b1c90c0bea63cd184f6d0b36e59144a080e8b3ac2c4c06bf6bf2"},
-    {file = "xxhash-3.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:672c60cce1f8026ae32c651f877aa64f342876083a36a4b1ff91bc876aaf0e34"},
-    {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bb6c83d7a65dd3065566c77425ba72df96982174e8ef613d809052d68ae77ab"},
-    {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a4170f3016b621e3200ebfcc18de6f50eb8e8fc1303e16324b1f5625afd51b57"},
-    {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bfb9c45d502ab38c0f4edf98a678694ae0f345613ef4900ade98c71f64db4d78"},
-    {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48af026a2b1569666da42a478248a1f03f4e2350a34eb661afe3cb45429ca1d7"},
-    {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe627de8fe8ddfa8b6477bda4ae5d5843ad1a0c83601dcff72247039465cc901"},
-    {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:427fc60a188e345534f35b0aa76f7640c5ddf0354f1c9ad826a2bc086282982d"},
-    {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d80acb20c7f268fe3150ac0be6a6b798062af56a1795eef855b26c9eae11a99c"},
-    {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:e71100818943422d1fbbe460e7be7fc4f2d2ba9371b2a745eb09e29ef0493f4a"},
-    {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:e3b9bb5fdbe284c7b61c5d82c76688e52bbaf48ab1e53de98c072cc696fa331f"},
-    {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1e25f6c8c46cf1ed8237f610abb231093a748c97d6c2c092789a7cad7e7ef290"},
-    {file = "xxhash-3.3.0-cp37-cp37m-win32.whl", hash = "sha256:928208dfecc563be59ae91868d1658e78809cb1e6a0bd74960a96c915db6390c"},
-    {file = "xxhash-3.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:bd1b4531a66da6dde1974662c1fd6fb1a2f27e40542e3df5e5e5dbab8ea4aee7"},
-    {file = "xxhash-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:deebb296df92e082b6d0171a7d6227b503e2897cea4f8bdd3d708094974d4cf6"},
-    {file = "xxhash-3.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd96e9cb0e2baa294e6d572207d9731c3bb8e2511f1ff70f2bf17266b4488bd9"},
-    {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3756b44bf247e422a2e47a38f25d03cf4a5ed539fdc2be3c60043e872e6ff13d"},
-    {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69550c3c053b8f135ceac97b85dc1b2bc54b7613a966f550f32b43bed81c788a"},
-    {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fc8736fc3e0c5aad435520873b9d2e27ddcc5a830b07e00e9c4d3a61ded9675"},
-    {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80ead7774392efbd95f9f701155048f9ca26cf55133db6f5bb5a0ec69376bda5"},
-    {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b8737c9b3fd944d856faafa92c95f6198649ad57987935b6d965d086938be917"},
-    {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2c8e078d0b9f85212801c41bd9eec8122003929686b0ee33360ffbfdf1a189ab"},
-    {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f399269d20ef1dd910331f9ad49e8510c3ba2aa657b623293b536038f266a5c5"},
-    {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f3661decef5f9ff7ab50edbef463bf7dc717621b56755dbae5458a946a033b10"},
-    {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ec374d0f1e7d43ef48a4ff643600833d7a325ecc6933b4d6ad9282f55751cf7"},
-    {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:39a947ff02d9a85673f5ce1f6f34059e24c714a797440485bd81b2c3cb69a7ff"},
-    {file = "xxhash-3.3.0-cp38-cp38-win32.whl", hash = "sha256:4a4f0645a0ec03b229fb04f2e66bdbcb1ffd341a70d6c86c3ee015ffdcd70fad"},
-    {file = "xxhash-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:8af5a687c0fb4357c230eec8a57ca07d3172faa3cb69beb0cbad40672ae6fa4b"},
-    {file = "xxhash-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e5bfafda019ecc6202af6f3cf08220fa66af9612ba16ef831033ae3ac7bd1f89"},
-    {file = "xxhash-3.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d113b433bc817adf845689a051363777835577858263ec4325d1934fcb7e394"},
-    {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56aacf4bf65f575c0392be958aceff719d850950bb6af7d804b32d4bc293159c"},
-    {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0f5d3e4e0937dad05585e9bd772bbdf0ca40cd8b2f54789d7a1f3091b608118c"},
-    {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23605d7fc67bc7daa0d263b3a26de3375cfcc0b51ab7de5026625415c05b6fed"},
-    {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe525be0392d493558a2b10d764bcaae9850cc262b417176a8b001f16e085fc6"},
-    {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b234d08786884f5c8d55dfebb839cfbd846d812e3a052c39ca7e8ce7055fed68"},
-    {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b031395b4b9c3085d9ea1ce89896ab01a65fc63172b2bfda5dd318fefe5e2f93"},
-    {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:5afe44da46b48c75169e622a532dca3fe585343c0577cfd7c18ecd3f1200305d"},
-    {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c59f233f38b6a49d5e4ddf16be910a5bbf36a2989b6b2c8591853fb9f5a5e691"},
-    {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:ed016e278c5c4633270903c7cf3b9dfb0bd293b7335e43fe695cb95541da53c9"},
-    {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a8bd6612fb35487e9ab329bb37b3df44f58baf752010dde9282593edbfed7e7"},
-    {file = "xxhash-3.3.0-cp39-cp39-win32.whl", hash = "sha256:015a0498bde85364abc53fcc713af962dd4555391929736d9c0ff2c555436a03"},
-    {file = "xxhash-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:06a484097af32caf1cfffadd60c3ca140c9e52b40a551fb1f6f0fdfd6f7f8977"},
-    {file = "xxhash-3.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c3809740124bbc777d29e3ae53de24f4c13fd5e62878086a8feadf0dcb654a5"},
-    {file = "xxhash-3.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae092f0daaeece2acdd6ec46e2ab307d8d6f22b01ecca14dc6078844dbd88339"},
-    {file = "xxhash-3.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3498e72ff2610b049b97bb81d1ea6e7bfa5b7a45efb3f255d77ec2fa2bc91653"},
-    {file = "xxhash-3.3.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0004dded9d86f129961326e980420187640fb7ba65a184009429861c1d09df7"},
-    {file = "xxhash-3.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:41c8bfd27191928bae6fd2b66872965532267785094a03c0ee5f358d9dba51c2"},
-    {file = "xxhash-3.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:71db8498e329cef3588b0617f762a3fe31d899872e76a68ce2840e35a1318a5b"},
-    {file = "xxhash-3.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d1d24d71b6209bc0124286932c4f0660c1103cb996fe34cb374bc12ac251940"},
-    {file = "xxhash-3.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61004587a09b5b385e43d95ffe3a76c9d934dfd79ea38272d5c20ddfba8eab8f"},
-    {file = "xxhash-3.3.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f0c92e3fa826425c73acafb31e022a719c85423847a9433d3a9e61e4ac97543"},
-    {file = "xxhash-3.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:367e03f1484ce471c94e731b98f5e4a05b43e7188b16692998e1cc89fd1159a5"},
-    {file = "xxhash-3.3.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed04c47dfaab98fcda0b748af9ee6fe8c888a0a0fbd13720e0f0221671e387e1"},
-    {file = "xxhash-3.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cbfde62516435ca198220aff048a8793383cb7047c7b88714a061968bca786d"},
-    {file = "xxhash-3.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73682225faa973ee56743f0fcd36bfcbfec503be258e0e420fb34313f52f1e7b"},
-    {file = "xxhash-3.3.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d49efdce2086c2c506af20ed18a1115b40af7aad6d4ee27cb31d7c810585a3f2"},
-    {file = "xxhash-3.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:546a0bb8e5a657cadf0da290b30ccd561cb89c256a5421ab8d5eb12eaf087349"},
-    {file = "xxhash-3.3.0.tar.gz", hash = "sha256:c3f9e322b1ebeebd44e3d9d2d9b124e0c550c1ef41bd552afdcdd719516ee41a"},
+    {file = "xxhash-3.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:91dbfa55346ad3e18e738742236554531a621042e419b70ad8f3c1d9c7a16e7f"},
+    {file = "xxhash-3.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:665a65c2a48a72068fcc4d21721510df5f51f1142541c890491afc80451636d2"},
+    {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb11628470a6004dc71a09fe90c2f459ff03d611376c1debeec2d648f44cb693"},
+    {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5bef2a7dc7b4f4beb45a1edbba9b9194c60a43a89598a87f1a0226d183764189"},
+    {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0f7b2d547d72c7eda7aa817acf8791f0146b12b9eba1d4432c531fb0352228"},
+    {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00f2fdef6b41c9db3d2fc0e7f94cb3db86693e5c45d6de09625caad9a469635b"},
+    {file = "xxhash-3.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23cfd9ca09acaf07a43e5a695143d9a21bf00f5b49b15c07d5388cadf1f9ce11"},
+    {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6a9ff50a3cf88355ca4731682c168049af1ca222d1d2925ef7119c1a78e95b3b"},
+    {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f1d7c69a1e9ca5faa75546fdd267f214f63f52f12692f9b3a2f6467c9e67d5e7"},
+    {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:672b273040d5d5a6864a36287f3514efcd1d4b1b6a7480f294c4b1d1ee1b8de0"},
+    {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4178f78d70e88f1c4a89ff1ffe9f43147185930bb962ee3979dba15f2b1cc799"},
+    {file = "xxhash-3.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9804b9eb254d4b8cc83ab5a2002128f7d631dd427aa873c8727dba7f1f0d1c2b"},
+    {file = "xxhash-3.4.1-cp310-cp310-win32.whl", hash = "sha256:c09c49473212d9c87261d22c74370457cfff5db2ddfc7fd1e35c80c31a8c14ce"},
+    {file = "xxhash-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:ebbb1616435b4a194ce3466d7247df23499475c7ed4eb2681a1fa42ff766aff6"},
+    {file = "xxhash-3.4.1-cp310-cp310-win_arm64.whl", hash = "sha256:25dc66be3db54f8a2d136f695b00cfe88018e59ccff0f3b8f545869f376a8a46"},
+    {file = "xxhash-3.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58c49083801885273e262c0f5bbeac23e520564b8357fbb18fb94ff09d3d3ea5"},
+    {file = "xxhash-3.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b526015a973bfbe81e804a586b703f163861da36d186627e27524f5427b0d520"},
+    {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36ad4457644c91a966f6fe137d7467636bdc51a6ce10a1d04f365c70d6a16d7e"},
+    {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:248d3e83d119770f96003271fe41e049dd4ae52da2feb8f832b7a20e791d2920"},
+    {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2070b6d5bbef5ee031666cf21d4953c16e92c2f8a24a94b5c240f8995ba3b1d0"},
+    {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2746035f518f0410915e247877f7df43ef3372bf36cfa52cc4bc33e85242641"},
+    {file = "xxhash-3.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a8ba6181514681c2591840d5632fcf7356ab287d4aff1c8dea20f3c78097088"},
+    {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0aac5010869240e95f740de43cd6a05eae180c59edd182ad93bf12ee289484fa"},
+    {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4cb11d8debab1626181633d184b2372aaa09825bde709bf927704ed72765bed1"},
+    {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b29728cff2c12f3d9f1d940528ee83918d803c0567866e062683f300d1d2eff3"},
+    {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:a15cbf3a9c40672523bdb6ea97ff74b443406ba0ab9bca10ceccd9546414bd84"},
+    {file = "xxhash-3.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6e66df260fed01ed8ea790c2913271641c58481e807790d9fca8bfd5a3c13844"},
+    {file = "xxhash-3.4.1-cp311-cp311-win32.whl", hash = "sha256:e867f68a8f381ea12858e6d67378c05359d3a53a888913b5f7d35fbf68939d5f"},
+    {file = "xxhash-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:200a5a3ad9c7c0c02ed1484a1d838b63edcf92ff538770ea07456a3732c577f4"},
+    {file = "xxhash-3.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:1d03f1c0d16d24ea032e99f61c552cb2b77d502e545187338bea461fde253583"},
+    {file = "xxhash-3.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c4bbba9b182697a52bc0c9f8ec0ba1acb914b4937cd4a877ad78a3b3eeabefb3"},
+    {file = "xxhash-3.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9fd28a9da300e64e434cfc96567a8387d9a96e824a9be1452a1e7248b7763b78"},
+    {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6066d88c9329ab230e18998daec53d819daeee99d003955c8db6fc4971b45ca3"},
+    {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93805bc3233ad89abf51772f2ed3355097a5dc74e6080de19706fc447da99cd3"},
+    {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64da57d5ed586ebb2ecdde1e997fa37c27fe32fe61a656b77fabbc58e6fbff6e"},
+    {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a97322e9a7440bf3c9805cbaac090358b43f650516486746f7fa482672593df"},
+    {file = "xxhash-3.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bbe750d512982ee7d831838a5dee9e9848f3fb440e4734cca3f298228cc957a6"},
+    {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:fd79d4087727daf4d5b8afe594b37d611ab95dc8e29fe1a7517320794837eb7d"},
+    {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:743612da4071ff9aa4d055f3f111ae5247342931dedb955268954ef7201a71ff"},
+    {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:b41edaf05734092f24f48c0958b3c6cbaaa5b7e024880692078c6b1f8247e2fc"},
+    {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:a90356ead70d715fe64c30cd0969072de1860e56b78adf7c69d954b43e29d9fa"},
+    {file = "xxhash-3.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ac56eebb364e44c85e1d9e9cc5f6031d78a34f0092fea7fc80478139369a8b4a"},
+    {file = "xxhash-3.4.1-cp312-cp312-win32.whl", hash = "sha256:911035345932a153c427107397c1518f8ce456f93c618dd1c5b54ebb22e73747"},
+    {file = "xxhash-3.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:f31ce76489f8601cc7b8713201ce94b4bd7b7ce90ba3353dccce7e9e1fee71fa"},
+    {file = "xxhash-3.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:b5beb1c6a72fdc7584102f42c4d9df232ee018ddf806e8c90906547dfb43b2da"},
+    {file = "xxhash-3.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6d42b24d1496deb05dee5a24ed510b16de1d6c866c626c2beb11aebf3be278b9"},
+    {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b685fab18876b14a8f94813fa2ca80cfb5ab6a85d31d5539b7cd749ce9e3624"},
+    {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419ffe34c17ae2df019a4685e8d3934d46b2e0bbe46221ab40b7e04ed9f11137"},
+    {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0e041ce5714f95251a88670c114b748bca3bf80cc72400e9f23e6d0d59cf2681"},
+    {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc860d887c5cb2f524899fb8338e1bb3d5789f75fac179101920d9afddef284b"},
+    {file = "xxhash-3.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:312eba88ffe0a05e332e3a6f9788b73883752be63f8588a6dc1261a3eaaaf2b2"},
+    {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e01226b6b6a1ffe4e6bd6d08cfcb3ca708b16f02eb06dd44f3c6e53285f03e4f"},
+    {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:9f3025a0d5d8cf406a9313cd0d5789c77433ba2004b1c75439b67678e5136537"},
+    {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:6d3472fd4afef2a567d5f14411d94060099901cd8ce9788b22b8c6f13c606a93"},
+    {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:43984c0a92f06cac434ad181f329a1445017c33807b7ae4f033878d860a4b0f2"},
+    {file = "xxhash-3.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a55e0506fdb09640a82ec4f44171273eeabf6f371a4ec605633adb2837b5d9d5"},
+    {file = "xxhash-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:faec30437919555b039a8bdbaba49c013043e8f76c999670aef146d33e05b3a0"},
+    {file = "xxhash-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:c9e1b646af61f1fc7083bb7b40536be944f1ac67ef5e360bca2d73430186971a"},
+    {file = "xxhash-3.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:961d948b7b1c1b6c08484bbce3d489cdf153e4122c3dfb07c2039621243d8795"},
+    {file = "xxhash-3.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:719a378930504ab159f7b8e20fa2aa1896cde050011af838af7e7e3518dd82de"},
+    {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74fb5cb9406ccd7c4dd917f16630d2e5e8cbbb02fc2fca4e559b2a47a64f4940"},
+    {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5dab508ac39e0ab988039bc7f962c6ad021acd81fd29145962b068df4148c476"},
+    {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c59f3e46e7daf4c589e8e853d700ef6607afa037bfad32c390175da28127e8c"},
+    {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cc07256eff0795e0f642df74ad096f8c5d23fe66bc138b83970b50fc7f7f6c5"},
+    {file = "xxhash-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9f749999ed80f3955a4af0eb18bb43993f04939350b07b8dd2f44edc98ffee9"},
+    {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7688d7c02149a90a3d46d55b341ab7ad1b4a3f767be2357e211b4e893efbaaf6"},
+    {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a8b4977963926f60b0d4f830941c864bed16aa151206c01ad5c531636da5708e"},
+    {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8106d88da330f6535a58a8195aa463ef5281a9aa23b04af1848ff715c4398fb4"},
+    {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4c76a77dbd169450b61c06fd2d5d436189fc8ab7c1571d39265d4822da16df22"},
+    {file = "xxhash-3.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:11f11357c86d83e53719c592021fd524efa9cf024dc7cb1dfb57bbbd0d8713f2"},
+    {file = "xxhash-3.4.1-cp38-cp38-win32.whl", hash = "sha256:0c786a6cd74e8765c6809892a0d45886e7c3dc54de4985b4a5eb8b630f3b8e3b"},
+    {file = "xxhash-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:aabf37fb8fa27430d50507deeab2ee7b1bcce89910dd10657c38e71fee835594"},
+    {file = "xxhash-3.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6127813abc1477f3a83529b6bbcfeddc23162cece76fa69aee8f6a8a97720562"},
+    {file = "xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef2e194262f5db16075caea7b3f7f49392242c688412f386d3c7b07c7733a70a"},
+    {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71be94265b6c6590f0018bbf73759d21a41c6bda20409782d8117e76cd0dfa8b"},
+    {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10e0a619cdd1c0980e25eb04e30fe96cf8f4324758fa497080af9c21a6de573f"},
+    {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa122124d2e3bd36581dd78c0efa5f429f5220313479fb1072858188bc2d5ff1"},
+    {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17032f5a4fea0a074717fe33477cb5ee723a5f428de7563e75af64bfc1b1e10"},
+    {file = "xxhash-3.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca7783b20e3e4f3f52f093538895863f21d18598f9a48211ad757680c3bd006f"},
+    {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d77d09a1113899fad5f354a1eb4f0a9afcf58cefff51082c8ad643ff890e30cf"},
+    {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:21287bcdd299fdc3328cc0fbbdeaa46838a1c05391264e51ddb38a3f5b09611f"},
+    {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:dfd7a6cc483e20b4ad90224aeb589e64ec0f31e5610ab9957ff4314270b2bf31"},
+    {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:543c7fcbc02bbb4840ea9915134e14dc3dc15cbd5a30873a7a5bf66039db97ec"},
+    {file = "xxhash-3.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fe0a98d990e433013f41827b62be9ab43e3cf18e08b1483fcc343bda0d691182"},
+    {file = "xxhash-3.4.1-cp39-cp39-win32.whl", hash = "sha256:b9097af00ebf429cc7c0e7d2fdf28384e4e2e91008130ccda8d5ae653db71e54"},
+    {file = "xxhash-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:d699b921af0dcde50ab18be76c0d832f803034d80470703700cb7df0fbec2832"},
+    {file = "xxhash-3.4.1-cp39-cp39-win_arm64.whl", hash = "sha256:2be491723405e15cc099ade1280133ccfbf6322d2ef568494fb7d07d280e7eee"},
+    {file = "xxhash-3.4.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:431625fad7ab5649368c4849d2b49a83dc711b1f20e1f7f04955aab86cd307bc"},
+    {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc6dbd5fc3c9886a9e041848508b7fb65fd82f94cc793253990f81617b61fe49"},
+    {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ff8dbd0ec97aec842476cb8ccc3e17dd288cd6ce3c8ef38bff83d6eb927817"},
+    {file = "xxhash-3.4.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef73a53fe90558a4096e3256752268a8bdc0322f4692ed928b6cd7ce06ad4fe3"},
+    {file = "xxhash-3.4.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:450401f42bbd274b519d3d8dcf3c57166913381a3d2664d6609004685039f9d3"},
+    {file = "xxhash-3.4.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a162840cf4de8a7cd8720ff3b4417fbc10001eefdd2d21541a8226bb5556e3bb"},
+    {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b736a2a2728ba45017cb67785e03125a79d246462dfa892d023b827007412c52"},
+    {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d0ae4c2e7698adef58710d6e7a32ff518b66b98854b1c68e70eee504ad061d8"},
+    {file = "xxhash-3.4.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6322c4291c3ff174dcd104fae41500e75dad12be6f3085d119c2c8a80956c51"},
+    {file = "xxhash-3.4.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:dd59ed668801c3fae282f8f4edadf6dc7784db6d18139b584b6d9677ddde1b6b"},
+    {file = "xxhash-3.4.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:92693c487e39523a80474b0394645b393f0ae781d8db3474ccdcead0559ccf45"},
+    {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4603a0f642a1e8d7f3ba5c4c25509aca6a9c1cc16f85091004a7028607ead663"},
+    {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa45e8cbfbadb40a920fe9ca40c34b393e0b067082d94006f7f64e70c7490a6"},
+    {file = "xxhash-3.4.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:595b252943b3552de491ff51e5bb79660f84f033977f88f6ca1605846637b7c6"},
+    {file = "xxhash-3.4.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:562d8b8f783c6af969806aaacf95b6c7b776929ae26c0cd941d54644ea7ef51e"},
+    {file = "xxhash-3.4.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:41ddeae47cf2828335d8d991f2d2b03b0bdc89289dc64349d712ff8ce59d0647"},
+    {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c44d584afdf3c4dbb3277e32321d1a7b01d6071c1992524b6543025fb8f4206f"},
+    {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd7bddb3a5b86213cc3f2c61500c16945a1b80ecd572f3078ddbbe68f9dabdfb"},
+    {file = "xxhash-3.4.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ecb6c987b62437c2f99c01e97caf8d25660bf541fe79a481d05732e5236719c"},
+    {file = "xxhash-3.4.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:696b4e18b7023527d5c50ed0626ac0520edac45a50ec7cf3fc265cd08b1f4c03"},
+    {file = "xxhash-3.4.1.tar.gz", hash = "sha256:0379d6cf1ff987cd421609a264ce025e74f346e3e145dd106c0cc2e3ec3f99a9"},
 ]
 
 [[package]]
 name = "yarl"
-version = "1.9.2"
+version = "1.9.4"
 description = "Yet another URL library"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82"},
-    {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82aa6264b36c50acfb2424ad5ca537a2060ab6de158a5bd2a72a032cc75b9eb8"},
-    {file = "yarl-1.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c0c77533b5ed4bcc38e943178ccae29b9bcf48ffd1063f5821192f23a1bd27b9"},
-    {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee4afac41415d52d53a9833ebae7e32b344be72835bbb589018c9e938045a560"},
-    {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9bf345c3a4f5ba7f766430f97f9cc1320786f19584acc7086491f45524a551ac"},
-    {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a96c19c52ff442a808c105901d0bdfd2e28575b3d5f82e2f5fd67e20dc5f4ea"},
-    {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:891c0e3ec5ec881541f6c5113d8df0315ce5440e244a716b95f2525b7b9f3608"},
-    {file = "yarl-1.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c3a53ba34a636a256d767c086ceb111358876e1fb6b50dfc4d3f4951d40133d5"},
-    {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:566185e8ebc0898b11f8026447eacd02e46226716229cea8db37496c8cdd26e0"},
-    {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2b0738fb871812722a0ac2154be1f049c6223b9f6f22eec352996b69775b36d4"},
-    {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:32f1d071b3f362c80f1a7d322bfd7b2d11e33d2adf395cc1dd4df36c9c243095"},
-    {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e9fdc7ac0d42bc3ea78818557fab03af6181e076a2944f43c38684b4b6bed8e3"},
-    {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56ff08ab5df8429901ebdc5d15941b59f6253393cb5da07b4170beefcf1b2528"},
-    {file = "yarl-1.9.2-cp310-cp310-win32.whl", hash = "sha256:8ea48e0a2f931064469bdabca50c2f578b565fc446f302a79ba6cc0ee7f384d3"},
-    {file = "yarl-1.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:50f33040f3836e912ed16d212f6cc1efb3231a8a60526a407aeb66c1c1956dde"},
-    {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:646d663eb2232d7909e6601f1a9107e66f9791f290a1b3dc7057818fe44fc2b6"},
-    {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aff634b15beff8902d1f918012fc2a42e0dbae6f469fce134c8a0dc51ca423bb"},
-    {file = "yarl-1.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a83503934c6273806aed765035716216cc9ab4e0364f7f066227e1aaea90b8d0"},
-    {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b25322201585c69abc7b0e89e72790469f7dad90d26754717f3310bfe30331c2"},
-    {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22a94666751778629f1ec4280b08eb11815783c63f52092a5953faf73be24191"},
-    {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ec53a0ea2a80c5cd1ab397925f94bff59222aa3cf9c6da938ce05c9ec20428d"},
-    {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:159d81f22d7a43e6eabc36d7194cb53f2f15f498dbbfa8edc8a3239350f59fe7"},
-    {file = "yarl-1.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:832b7e711027c114d79dffb92576acd1bd2decc467dec60e1cac96912602d0e6"},
-    {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:95d2ecefbcf4e744ea952d073c6922e72ee650ffc79028eb1e320e732898d7e8"},
-    {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d4e2c6d555e77b37288eaf45b8f60f0737c9efa3452c6c44626a5455aeb250b9"},
-    {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:783185c75c12a017cc345015ea359cc801c3b29a2966c2655cd12b233bf5a2be"},
-    {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:b8cc1863402472f16c600e3e93d542b7e7542a540f95c30afd472e8e549fc3f7"},
-    {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:822b30a0f22e588b32d3120f6d41e4ed021806418b4c9f0bc3048b8c8cb3f92a"},
-    {file = "yarl-1.9.2-cp311-cp311-win32.whl", hash = "sha256:a60347f234c2212a9f0361955007fcf4033a75bf600a33c88a0a8e91af77c0e8"},
-    {file = "yarl-1.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:be6b3fdec5c62f2a67cb3f8c6dbf56bbf3f61c0f046f84645cd1ca73532ea051"},
-    {file = "yarl-1.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38a3928ae37558bc1b559f67410df446d1fbfa87318b124bf5032c31e3447b74"},
-    {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac9bb4c5ce3975aeac288cfcb5061ce60e0d14d92209e780c93954076c7c4367"},
-    {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3da8a678ca8b96c8606bbb8bfacd99a12ad5dd288bc6f7979baddd62f71c63ef"},
-    {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13414591ff516e04fcdee8dc051c13fd3db13b673c7a4cb1350e6b2ad9639ad3"},
-    {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf74d08542c3a9ea97bb8f343d4fcbd4d8f91bba5ec9d5d7f792dbe727f88938"},
-    {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e7221580dc1db478464cfeef9b03b95c5852cc22894e418562997df0d074ccc"},
-    {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:494053246b119b041960ddcd20fd76224149cfea8ed8777b687358727911dd33"},
-    {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:52a25809fcbecfc63ac9ba0c0fb586f90837f5425edfd1ec9f3372b119585e45"},
-    {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:e65610c5792870d45d7b68c677681376fcf9cc1c289f23e8e8b39c1485384185"},
-    {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:1b1bba902cba32cdec51fca038fd53f8beee88b77efc373968d1ed021024cc04"},
-    {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:662e6016409828ee910f5d9602a2729a8a57d74b163c89a837de3fea050c7582"},
-    {file = "yarl-1.9.2-cp37-cp37m-win32.whl", hash = "sha256:f364d3480bffd3aa566e886587eaca7c8c04d74f6e8933f3f2c996b7f09bee1b"},
-    {file = "yarl-1.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6a5883464143ab3ae9ba68daae8e7c5c95b969462bbe42e2464d60e7e2698368"},
-    {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5610f80cf43b6202e2c33ba3ec2ee0a2884f8f423c8f4f62906731d876ef4fac"},
-    {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9a4e67ad7b646cd6f0938c7ebfd60e481b7410f574c560e455e938d2da8e0f4"},
-    {file = "yarl-1.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:83fcc480d7549ccebe9415d96d9263e2d4226798c37ebd18c930fce43dfb9574"},
-    {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fcd436ea16fee7d4207c045b1e340020e58a2597301cfbcfdbe5abd2356c2fb"},
-    {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84e0b1599334b1e1478db01b756e55937d4614f8654311eb26012091be109d59"},
-    {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3458a24e4ea3fd8930e934c129b676c27452e4ebda80fbe47b56d8c6c7a63a9e"},
-    {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:838162460b3a08987546e881a2bfa573960bb559dfa739e7800ceeec92e64417"},
-    {file = "yarl-1.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4e2d08f07a3d7d3e12549052eb5ad3eab1c349c53ac51c209a0e5991bbada78"},
-    {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:de119f56f3c5f0e2fb4dee508531a32b069a5f2c6e827b272d1e0ff5ac040333"},
-    {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:149ddea5abf329752ea5051b61bd6c1d979e13fbf122d3a1f9f0c8be6cb6f63c"},
-    {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:674ca19cbee4a82c9f54e0d1eee28116e63bc6fd1e96c43031d11cbab8b2afd5"},
-    {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:9b3152f2f5677b997ae6c804b73da05a39daa6a9e85a512e0e6823d81cdad7cc"},
-    {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5415d5a4b080dc9612b1b63cba008db84e908b95848369aa1da3686ae27b6d2b"},
-    {file = "yarl-1.9.2-cp38-cp38-win32.whl", hash = "sha256:f7a3d8146575e08c29ed1cd287068e6d02f1c7bdff8970db96683b9591b86ee7"},
-    {file = "yarl-1.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:63c48f6cef34e6319a74c727376e95626f84ea091f92c0250a98e53e62c77c72"},
-    {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:75df5ef94c3fdc393c6b19d80e6ef1ecc9ae2f4263c09cacb178d871c02a5ba9"},
-    {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c027a6e96ef77d401d8d5a5c8d6bc478e8042f1e448272e8d9752cb0aff8b5c8"},
-    {file = "yarl-1.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3b078dbe227f79be488ffcfc7a9edb3409d018e0952cf13f15fd6512847f3f7"},
-    {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59723a029760079b7d991a401386390c4be5bfec1e7dd83e25a6a0881859e716"},
-    {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b03917871bf859a81ccb180c9a2e6c1e04d2f6a51d953e6a5cdd70c93d4e5a2a"},
-    {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c1012fa63eb6c032f3ce5d2171c267992ae0c00b9e164efe4d73db818465fac3"},
-    {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74dcbfe780e62f4b5a062714576f16c2f3493a0394e555ab141bf0d746bb955"},
-    {file = "yarl-1.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c56986609b057b4839968ba901944af91b8e92f1725d1a2d77cbac6972b9ed1"},
-    {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2c315df3293cd521033533d242d15eab26583360b58f7ee5d9565f15fee1bef4"},
-    {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b7232f8dfbd225d57340e441d8caf8652a6acd06b389ea2d3222b8bc89cbfca6"},
-    {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:53338749febd28935d55b41bf0bcc79d634881195a39f6b2f767870b72514caf"},
-    {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:066c163aec9d3d073dc9ffe5dd3ad05069bcb03fcaab8d221290ba99f9f69ee3"},
-    {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8288d7cd28f8119b07dd49b7230d6b4562f9b61ee9a4ab02221060d21136be80"},
-    {file = "yarl-1.9.2-cp39-cp39-win32.whl", hash = "sha256:b124e2a6d223b65ba8768d5706d103280914d61f5cae3afbc50fc3dfcc016623"},
-    {file = "yarl-1.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:61016e7d582bc46a5378ffdd02cd0314fb8ba52f40f9cf4d9a5e7dbef88dee18"},
-    {file = "yarl-1.9.2.tar.gz", hash = "sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"},
+    {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"},
+    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"},
+    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"},
+    {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"},
+    {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"},
+    {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"},
+    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"},
+    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"},
+    {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"},
+    {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"},
+    {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"},
+    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"},
+    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"},
+    {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"},
+    {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"},
+    {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"},
+    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"},
+    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"},
+    {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"},
+    {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"},
+    {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"},
+    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"},
+    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"},
+    {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"},
+    {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"},
+    {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"},
+    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"},
+    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"},
+    {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"},
+    {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"},
+    {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"},
+    {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"},
 ]
 
 [package.dependencies]
 idna = ">=2.0"
 multidict = ">=4.0"
 
+[[package]]
+name = "zipp"
+version = "3.19.1"
+description = "Backport of pathlib-compatible object wrapper for zip files"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"},
+    {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"},
+]
+
+[package.extras]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
+
 [extras]
 accelerate = ["accelerate"]
 bnb = ["bitsandbytes"]
+outlines = ["outlines"]
+peft = ["peft"]
 quantize = ["accelerate", "datasets", "texttable"]
+torch = ["torch"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "f2ef5f41a172d14985367a385ad6ce844c8c05b2d68d9ddcc11b41f581921c96"
+content-hash = "f62a7a74e1e1bcb3b7cb4f7da2b538065830748062a2b57fdbb4c76eae5abddc"
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 1babf749..7b5e83fb 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.0.3"
+version = "2.0.5-dev0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
 
@@ -9,36 +9,42 @@ text-generation-server = 'text_generation_server.cli:app'
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
-protobuf = "^4.21.7"
+protobuf = "^4.25.3"
 grpcio = "^1.51.1"
 grpcio-status = "^1.51.1"
 grpcio-reflection = "^1.51.1"
 grpc-interceptor = "^0.15.0"
 typer = "^0.6.1"
-accelerate = { version = "^0.20.0", optional = true }
-bitsandbytes = { version = "^0.41.1", optional = true }
-safetensors = "^0.3.2"
+accelerate = { version = "^0.29.1", optional = true }
+bitsandbytes = { version = "^0.43.0", optional = true }
+safetensors = "^0.4"
 loguru = "^0.6.0"
-opentelemetry-api = "^1.15.0"
-opentelemetry-exporter-otlp = "^1.15.0"
-opentelemetry-instrumentation-grpc = "^0.36b0"
+opentelemetry-api = "^1.25.0"
+opentelemetry-exporter-otlp = "^1.25.0"
+opentelemetry-instrumentation-grpc = "^0.46b0"
 hf-transfer = "^0.1.2"
 sentencepiece = "^0.1.97"
-tokenizers = "^0.13.3"
-huggingface-hub = "^0.16.4"
-transformers = "^4.32.1"
+tokenizers = "^0.19.1"
+huggingface-hub = "^0.23"
+transformers = "^4.41"
 einops = "^0.6.1"
 texttable = { version = "^1.6.7", optional = true }
 datasets = { version = "^2.14.0", optional = true }
-peft = "^0.4.0"
-torch = { version = "^2.0.1" }
+peft = { version = "^0.10", optional = true }
+torch = { version = "^2.3.0", optional = true }
 scipy = "^1.11.1"
 pillow = "^10.0.0"
+outlines= { version = "^0.0.34", optional = true }
+prometheus-client = "^0.20.0"
+py-cpuinfo = "^9.0.0"
 
 [tool.poetry.extras]
+torch = ["torch"]
 accelerate = ["accelerate"]
 bnb = ["bitsandbytes"]
+peft = ["peft"]
 quantize = ["texttable", "datasets", "accelerate"]
+outlines = ["outlines"]
 
 [tool.poetry.group.dev.dependencies]
 grpcio-tools = "^1.51.1"
@@ -47,12 +53,14 @@ pytest = "^7.3.0"
 
 [[tool.poetry.source]]
 name = "pytorch-gpu-src"
-url = "https://download.pytorch.org/whl/cu118"
+url = "https://download.pytorch.org/whl/cu121"
 priority = "explicit"
 
 [tool.pytest.ini_options]
 markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]
 
 [build-system]
-requires = ["poetry-core>=1.0.0"]
+requires = [
+    "poetry-core>=1.0.0",
+]
 build-backend = "poetry.core.masonry.api"
diff --git a/server/requirements.txt b/server/requirements.txt
deleted file mode 100644
index 1b038cca..00000000
--- a/server/requirements.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-accelerate==0.20.3 ; python_version >= "3.9" and python_version < "3.13"
-aiohttp==3.8.5 ; python_version >= "3.9" and python_version < "3.13"
-aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "3.13"
-async-timeout==4.0.3 ; python_version >= "3.9" and python_version < "3.13"
-attrs==23.1.0 ; python_version >= "3.9" and python_version < "3.13"
-backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
-bitsandbytes==0.41.1 ; python_version >= "3.9" and python_version < "3.13"
-certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
-charset-normalizer==3.2.0 ; python_version >= "3.9" and python_version < "3.13"
-click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
-colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
-datasets==2.14.4 ; python_version >= "3.9" and python_version < "3.13"
-deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
-dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13"
-einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-filelock==3.12.3 ; python_version >= "3.9" and python_version < "3.13"
-frozenlist==1.4.0 ; python_version >= "3.9" and python_version < "3.13"
-fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
-fsspec[http]==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
-googleapis-common-protos==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
-grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.57.0 ; python_version >= "3.9" and python_version < "3.13"
-hf-transfer==0.1.3 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
-idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
-jinja2==3.1.2 ; python_version >= "3.9" and python_version < "3.13"
-loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
-markupsafe==2.1.3 ; python_version >= "3.9" and python_version < "3.13"
-mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
-multidict==6.0.4 ; python_version >= "3.9" and python_version < "3.13"
-multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13"
-networkx==3.1 ; python_version >= "3.9" and python_version < "3.13"
-numpy==1.25.2 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-exporter-otlp==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-instrumentation-grpc==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
-opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
-packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
-pandas==2.0.3 ; python_version >= "3.9" and python_version < "3.13"
-peft==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
-pillow==10.0.0 ; python_version >= "3.9" and python_version < "3.13"
-protobuf==4.24.2 ; python_version >= "3.9" and python_version < "3.13"
-psutil==5.9.5 ; python_version >= "3.9" and python_version < "3.13"
-pyarrow==13.0.0 ; python_version >= "3.9" and python_version < "3.13"
-python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "3.13"
-pytz==2023.3 ; python_version >= "3.9" and python_version < "3.13"
-pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
-regex==2023.8.8 ; python_version >= "3.9" and python_version < "3.13"
-requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
-safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
-scipy==1.11.2 ; python_version >= "3.9" and python_version < "3.13"
-sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
-setuptools==68.1.2 ; python_version >= "3.9" and python_version < "3.13"
-six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
-sympy==1.12 ; python_version >= "3.9" and python_version < "3.13"
-texttable==1.6.7 ; python_version >= "3.9" and python_version < "3.13"
-tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
-torch==2.0.1 ; python_version >= "3.9" and python_version < "3.13"
-tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.32.1 ; python_version >= "3.9" and python_version < "3.13"
-typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "3.13"
-tzdata==2023.3 ; python_version >= "3.9" and python_version < "3.13"
-urllib3==2.0.4 ; python_version >= "3.9" and python_version < "3.13"
-win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
-wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
-xxhash==3.3.0 ; python_version >= "3.9" and python_version < "3.13"
-yarl==1.9.2 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt
new file mode 100644
index 00000000..88fcc4f3
--- /dev/null
+++ b/server/requirements_cuda.txt
@@ -0,0 +1,48 @@
+backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
+certifi==2024.2.2 ; python_version >= "3.9" and python_version < "3.13"
+charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13"
+click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
+colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
+deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
+einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13"
+grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13"
+hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.7 ; python_version >= "3.9" and python_version < "3.13"
+loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation-grpc==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+packaging==24.0 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.3.0 ; python_version >= "3.9" and python_version < "3.13"
+prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13"
+py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
+pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13"
+safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
+sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13"
+typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
+win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
+wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/server/requirements_intel.txt b/server/requirements_intel.txt
new file mode 100644
index 00000000..5751bf81
--- /dev/null
+++ b/server/requirements_intel.txt
@@ -0,0 +1,48 @@
+backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
+certifi==2024.2.2 ; python_version >= "3.9" and python_version < "3.13"
+charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13"
+click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
+colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
+deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
+einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13"
+grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13"
+hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.7 ; python_version >= "3.9" and python_version < "3.13"
+loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation-grpc==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+packaging==24.0 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.3.0 ; python_version >= "3.9" and python_version < "3.13"
+prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13"
+py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
+pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13"
+safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
+sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==69.5.1 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13"
+typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
+win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
+wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt
new file mode 100644
index 00000000..88fcc4f3
--- /dev/null
+++ b/server/requirements_rocm.txt
@@ -0,0 +1,48 @@
+backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
+certifi==2024.2.2 ; python_version >= "3.9" and python_version < "3.13"
+charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13"
+click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
+colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
+deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
+einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
+filelock==3.14.0 ; python_version >= "3.9" and python_version < "3.13"
+fsspec==2024.5.0 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.63.0 ; python_version >= "3.9" and python_version < "3.13"
+grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.62.2 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.64.0 ; python_version >= "3.9" and python_version < "3.13"
+hf-transfer==0.1.6 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.23.1 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.7 ; python_version >= "3.9" and python_version < "3.13"
+loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
+numpy==1.26.4 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-exporter-otlp==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation-grpc==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
+opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
+packaging==24.0 ; python_version >= "3.9" and python_version < "3.13"
+pillow==10.3.0 ; python_version >= "3.9" and python_version < "3.13"
+prometheus-client==0.20.0 ; python_version >= "3.9" and python_version < "3.13"
+protobuf==4.25.3 ; python_version >= "3.9" and python_version < "3.13"
+py-cpuinfo==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
+pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
+regex==2024.5.15 ; python_version >= "3.9" and python_version < "3.13"
+requests==2.32.2 ; python_version >= "3.9" and python_version < "3.13"
+safetensors==0.4.3 ; python_version >= "3.9" and python_version < "3.13"
+scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
+sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
+setuptools==70.0.0 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.19.1 ; python_version >= "3.9" and python_version < "3.13"
+tqdm==4.66.4 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.41.1 ; python_version >= "3.9" and python_version < "3.13"
+typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.12.0 ; python_version >= "3.9" and python_version < "3.13"
+urllib3==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
+win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
+wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
diff --git a/server/tests/models/test_bloom.py b/server/tests/models/test_bloom.py
index 71013cb6..08292920 100644
--- a/server/tests/models/test_bloom.py
+++ b/server/tests/models/test_bloom.py
@@ -8,6 +8,9 @@ from text_generation_server.pb import generate_pb2
 from text_generation_server.models.causal_lm import CausalLMBatch
 from text_generation_server.utils import weight_hub_files, download_weights
 from text_generation_server.models.bloom import BloomCausalLMBatch, BLOOMSharded
+from text_generation_server.models.custom_modeling.bloom_modeling import (
+    BloomForCausalLM,
+)
 
 
 @pytest.fixture(scope="session")
@@ -16,7 +19,10 @@ def default_bloom():
     revision = "main"
     filenames = weight_hub_files(model_id, revision, ".safetensors")
     download_weights(filenames, model_id, revision)
-    return BLOOMSharded(model_id)
+    return BLOOMSharded(
+        model_id,
+        model_class=BloomForCausalLM,
+    )
 
 
 @pytest.fixture(scope="session")
@@ -29,6 +35,7 @@ def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
     return generate_pb2.Request(
         id=0,
         inputs="Test",
+        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
         prefill_logprobs=True,
         truncate=100,
         parameters=default_pb_parameters,
@@ -103,7 +110,7 @@ def test_causal_lm_batch_type(default_bloom):
 
 def test_causal_lm_generate_token(default_bloom, default_bloom_batch):
     sequence_length = len(default_bloom_batch.all_input_ids[0])
-    generations, next_batch = default_bloom.generate_token(default_bloom_batch)
+    generations, next_batch, _ = default_bloom.generate_token(default_bloom_batch)
 
     assert len(generations) == len(default_bloom_batch)
     assert isinstance(next_batch, CausalLMBatch)
@@ -133,18 +140,30 @@ def test_causal_lm_generate_token(default_bloom, default_bloom_batch):
     )
     assert all([generation.generated_text is None for generation in generations])
     assert all([len(generation.prefill_tokens) == 1 for generation in generations])
-    assert all([generation.token_id.item() == 10264 for generation in generations])
-    assert all([generation.token_text == "Test" for generation in generations])
+    assert all(
+        [
+            token_id.item() == 10264
+            for generation in generations
+            for token_id in generation.tokens.token_ids
+        ]
+    )
+    assert all(
+        [
+            token_text == "Test"
+            for generation in generations
+            for token_text in generation.tokens.texts
+        ]
+    )
     assert generations[0].request_id == 0
 
 
 def test_causal_lm_generate_token_completion(default_bloom, default_bloom_batch):
     next_batch = default_bloom_batch
     for _ in range(default_bloom_batch.stopping_criterias[0].max_new_tokens - 1):
-        generations, next_batch = default_bloom.generate_token(next_batch)
+        generations, next_batch, _ = default_bloom.generate_token(next_batch)
         assert len(generations) == len(default_bloom_batch)
 
-    generations, next_batch = default_bloom.generate_token(next_batch)
+    generations, next_batch, _ = default_bloom.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
@@ -166,10 +185,10 @@ def test_causal_lm_generate_token_completion_multi(
     for i in range(
         default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 1
     ):
-        generations, next_batch = default_bloom.generate_token(next_batch)
+        generations, next_batch, _ = default_bloom.generate_token(next_batch)
         assert len(generations) == len(default_multi_requests_bloom_batch)
 
-    generations, next_batch = default_bloom.generate_token(next_batch)
+    generations, next_batch, _ = default_bloom.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 2
@@ -189,10 +208,10 @@ def test_causal_lm_generate_token_completion_multi(
     for _ in range(
         stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1
     ):
-        generations, next_batch = default_bloom.generate_token(next_batch)
+        generations, next_batch, _ = default_bloom.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_bloom.generate_token(next_batch)
+    generations, next_batch, _ = default_bloom.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
@@ -212,11 +231,11 @@ def test_batch_concatenate(
     default_bloom, default_bloom_batch, default_multi_requests_bloom_batch
 ):
     next_batch_0 = default_bloom_batch
-    _, next_batch_0 = default_bloom.generate_token(next_batch_0)
-    _, next_batch_0 = default_bloom.generate_token(next_batch_0)
+    _, next_batch_0, _ = default_bloom.generate_token(next_batch_0)
+    _, next_batch_0, _ = default_bloom.generate_token(next_batch_0)
 
     next_batch_1 = default_multi_requests_bloom_batch
-    _, next_batch_1 = default_bloom.generate_token(next_batch_1)
+    _, next_batch_1, _ = default_bloom.generate_token(next_batch_1)
 
     # Clone past_key_values before concatenating to compare after,
     # because they are removed from the concatenated batches
@@ -276,10 +295,10 @@ def test_batch_concatenate(
     for _ in range(
         default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens - 2
     ):
-        generations, next_batch = default_bloom.generate_token(next_batch)
+        generations, next_batch, _ = default_bloom.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_bloom.generate_token(next_batch)
+    generations, next_batch, _ = default_bloom.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 3
@@ -301,10 +320,10 @@ def test_batch_concatenate(
         - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
         - 2
     ):
-        generations, next_batch = default_bloom.generate_token(next_batch)
+        generations, next_batch, _ = default_bloom.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_bloom.generate_token(next_batch)
+    generations, next_batch, _ = default_bloom.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 2
@@ -325,10 +344,10 @@ def test_batch_concatenate(
         - default_multi_requests_bloom_batch.stopping_criterias[1].max_new_tokens
         - 4
     ):
-        generations, next_batch = default_bloom.generate_token(next_batch)
+        generations, next_batch, _ = default_bloom.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_bloom.generate_token(next_batch)
+    generations, next_batch, _ = default_bloom.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
diff --git a/server/tests/models/test_causal_lm.py b/server/tests/models/test_causal_lm.py
index 0f9dab2c..c000ef26 100644
--- a/server/tests/models/test_causal_lm.py
+++ b/server/tests/models/test_causal_lm.py
@@ -10,7 +10,7 @@ from text_generation_server.models.causal_lm import CausalLM, CausalLMBatch
 
 @pytest.fixture(scope="session")
 def default_causal_lm():
-    return CausalLM("gpt2")
+    return CausalLM.fallback("gpt2")
 
 
 @pytest.fixture(scope="session")
@@ -25,6 +25,7 @@ def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
     return generate_pb2.Request(
         id=0,
         inputs="Test",
+        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
         prefill_logprobs=True,
         truncate=100,
         parameters=default_pb_parameters,
@@ -99,7 +100,9 @@ def test_causal_lm_batch_type(default_causal_lm):
 
 def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_batch):
     sequence_length = len(default_causal_lm_batch.all_input_ids[0])
-    generations, next_batch = default_causal_lm.generate_token(default_causal_lm_batch)
+    generations, next_batch, _ = default_causal_lm.generate_token(
+        default_causal_lm_batch
+    )
 
     assert len(generations) == len(next_batch)
     assert isinstance(next_batch, CausalLMBatch)
@@ -129,8 +132,20 @@ def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_batch):
     )
     assert all([generation.generated_text is None for generation in generations])
     assert all([len(generation.prefill_tokens) == 1 for generation in generations])
-    assert all([generation.token_id.item() == 13 for generation in generations])
-    assert all([generation.token_text == "." for generation in generations])
+    assert all(
+        [
+            token_id.item() == 13
+            for generation in generations
+            for token_id in generation.tokens.token_ids
+        ]
+    )
+    assert all(
+        [
+            token_text == "."
+            for generation in generations
+            for token_text in generation.tokens.texts
+        ]
+    )
     assert generations[0].request_id == 0
 
 
@@ -139,10 +154,10 @@ def test_causal_lm_generate_token_completion(
 ):
     next_batch = default_causal_lm_batch
     for _ in range(default_causal_lm_batch.stopping_criterias[0].max_new_tokens - 1):
-        generations, next_batch = default_causal_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_causal_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
@@ -162,10 +177,10 @@ def test_causal_lm_generate_token_completion_multi(
     for i in range(
         default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 1
     ):
-        generations, next_batch = default_causal_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_causal_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 2
@@ -188,10 +203,10 @@ def test_causal_lm_generate_token_completion_multi(
     for _ in range(
         stopping_criterias[0].max_new_tokens - stopping_criterias[1].max_new_tokens - 1
     ):
-        generations, next_batch = default_causal_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_causal_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
@@ -210,11 +225,11 @@ def test_batch_concatenate(
     default_causal_lm, default_causal_lm_batch, default_multi_requests_causal_lm_batch
 ):
     next_batch_0 = default_causal_lm_batch
-    _, next_batch_0 = default_causal_lm.generate_token(next_batch_0)
-    _, next_batch_0 = default_causal_lm.generate_token(next_batch_0)
+    _, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0)
+    _, next_batch_0, _ = default_causal_lm.generate_token(next_batch_0)
 
     next_batch_1 = default_multi_requests_causal_lm_batch
-    _, next_batch_1 = default_causal_lm.generate_token(next_batch_1)
+    _, next_batch_1, _ = default_causal_lm.generate_token(next_batch_1)
 
     # Clone past_key_values before concatenating to compare after,
     # because they are removed from the concatenated batches
@@ -273,10 +288,10 @@ def test_batch_concatenate(
     for _ in range(
         default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens - 2
     ):
-        generations, next_batch = default_causal_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_causal_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 3
@@ -299,10 +314,10 @@ def test_batch_concatenate(
         - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
         - 2
     ):
-        generations, next_batch = default_causal_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_causal_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 2
@@ -321,10 +336,10 @@ def test_batch_concatenate(
         - default_multi_requests_causal_lm_batch.stopping_criterias[1].max_new_tokens
         - 4
     ):
-        generations, next_batch = default_causal_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_causal_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_causal_lm.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
diff --git a/server/tests/models/test_model.py b/server/tests/models/test_model.py
index 32bcd45f..8441e8c6 100644
--- a/server/tests/models/test_model.py
+++ b/server/tests/models/test_model.py
@@ -17,7 +17,12 @@ def get_test_model():
     tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b")
 
     model = TestModel(
-        torch.nn.Linear(1, 1), tokenizer, False, torch.float32, torch.device("cpu")
+        "test_model_id",
+        torch.nn.Linear(1, 1),
+        tokenizer,
+        False,
+        torch.float32,
+        torch.device("cpu"),
     )
     return model
 
diff --git a/server/tests/models/test_santacoder.py b/server/tests/models/test_santacoder.py
index fceec560..d5c91bff 100644
--- a/server/tests/models/test_santacoder.py
+++ b/server/tests/models/test_santacoder.py
@@ -1,13 +1,12 @@
 import pytest
 
 from text_generation_server.pb import generate_pb2
-from text_generation_server.models.causal_lm import CausalLMBatch
-from text_generation_server.models.santacoder import SantaCoder
+from text_generation_server.models.causal_lm import CausalLMBatch, CausalLM
 
 
 @pytest.fixture(scope="session")
 def default_santacoder():
-    return SantaCoder("bigcode/santacoder")
+    return CausalLM.fallback(model_id="bigcode/santacoder")
 
 
 @pytest.fixture
@@ -15,6 +14,7 @@ def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
     return generate_pb2.Request(
         id=0,
         inputs="def",
+        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="def")]),
         prefill_logprobs=True,
         truncate=100,
         parameters=default_pb_parameters,
@@ -32,6 +32,13 @@ def default_fim_pb_request(default_pb_parameters, default_pb_stop_parameters):
     return generate_pb2.Request(
         id=0,
         inputs="<fim-prefix>def<fim-suffix>world<fim-middle>",
+        input_chunks=generate_pb2.Input(
+            chunks=[
+                generate_pb2.InputChunk(
+                    text="<fim-prefix>def<fim-suffix>world<fim-middle>"
+                )
+            ]
+        ),
         prefill_logprobs=True,
         truncate=100,
         parameters=default_pb_parameters,
@@ -55,10 +62,10 @@ def test_santacoder_generate_token_completion(default_santacoder, default_pb_bat
     next_batch = batch
 
     for _ in range(batch.stopping_criterias[0].max_new_tokens - 1):
-        generations, next_batch = default_santacoder.generate_token(next_batch)
+        generations, next_batch, _ = default_santacoder.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_santacoder.generate_token(next_batch)
+    generations, next_batch, _ = default_santacoder.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
@@ -83,10 +90,10 @@ def test_fim_santacoder_generate_token_completion(
     next_batch = batch
 
     for _ in range(batch.stopping_criterias[0].max_new_tokens - 1):
-        generations, next_batch = default_santacoder.generate_token(next_batch)
+        generations, next_batch, _ = default_santacoder.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_santacoder.generate_token(next_batch)
+    generations, next_batch, _ = default_santacoder.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
diff --git a/server/tests/models/test_seq2seq_lm.py b/server/tests/models/test_seq2seq_lm.py
index 299340f8..02666042 100644
--- a/server/tests/models/test_seq2seq_lm.py
+++ b/server/tests/models/test_seq2seq_lm.py
@@ -20,7 +20,7 @@ def mt0_small_tokenizer():
 
 @pytest.fixture(scope="session")
 def default_seq2seq_lm():
-    return Seq2SeqLM("bigscience/mt0-small")
+    return Seq2SeqLM.fallback("bigscience/mt0-small")
 
 
 @pytest.fixture
@@ -28,6 +28,7 @@ def default_pb_request(default_pb_parameters, default_pb_stop_parameters):
     return generate_pb2.Request(
         id=0,
         inputs="Test",
+        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
         prefill_logprobs=True,
         truncate=100,
         parameters=default_pb_parameters,
@@ -103,7 +104,7 @@ def test_seq2seq_lm_batch_type(default_seq2seq_lm):
 
 def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch):
     sequence_length = len(default_seq2seq_lm_batch.input_ids[0])
-    generations, next_batch = default_seq2seq_lm.generate_token(
+    generations, next_batch, _ = default_seq2seq_lm.generate_token(
         default_seq2seq_lm_batch
     )
 
@@ -151,8 +152,20 @@ def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch)
     )
     assert all([generation.generated_text is None for generation in generations])
     assert all([len(generation.prefill_tokens) == 1 for generation in generations])
-    assert all([generation.token_id.item() == 259 for generation in generations])
-    assert all([generation.token_text == " " for generation in generations])
+    assert all(
+        [
+            token_id.item() == 259
+            for generation in generations
+            for token_id in generation.tokens.token_ids
+        ]
+    )
+    assert all(
+        [
+            token_text == " "
+            for generation in generations
+            for token_text in generation.tokens.texts
+        ]
+    )
     assert generations[0].request_id == 0
 
 
@@ -161,10 +174,10 @@ def test_seq2seq_lm_generate_token_completion(
 ):
     next_batch = default_seq2seq_lm_batch
     for _ in range(6):
-        generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
@@ -179,10 +192,10 @@ def test_seq2seq_lm_generate_token_completion_multi(
     next_batch = default_multi_requests_seq2seq_lm_batch
 
     for i in range(4):
-        generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 2
@@ -195,10 +208,10 @@ def test_seq2seq_lm_generate_token_completion_multi(
 
     next_batch = next_batch.filter([next_batch.requests[0].id])
 
-    generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
     assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
@@ -216,11 +229,11 @@ def test_batch_concatenate(
     default_multi_requests_seq2seq_lm_batch,
 ):
     next_batch_0 = default_seq2seq_lm_batch
-    _, next_batch_0 = default_seq2seq_lm.generate_token(next_batch_0)
-    _, next_batch_0 = default_seq2seq_lm.generate_token(next_batch_0)
+    _, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0)
+    _, next_batch_0, _ = default_seq2seq_lm.generate_token(next_batch_0)
 
     next_batch_1 = default_multi_requests_seq2seq_lm_batch
-    _, next_batch_1 = default_seq2seq_lm.generate_token(next_batch_1)
+    _, next_batch_1, _ = default_seq2seq_lm.generate_token(next_batch_1)
 
     # Copy hidden state because it is removed from the concatenated branches
     next_batch_0_encoder_last_hidden_state = next_batch_0.encoder_last_hidden_state
@@ -312,10 +325,10 @@ def test_batch_concatenate(
         )
 
     for _ in range(3):
-        generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+        generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
         assert len(generations) == len(next_batch)
 
-    generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 3
@@ -330,7 +343,7 @@ def test_batch_concatenate(
         [next_batch.requests[0].id, next_batch.requests[1].id]
     )
 
-    generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
     assert next_batch is not None
 
     assert len(generations) == 2
@@ -340,7 +353,7 @@ def test_batch_concatenate(
 
     next_batch = next_batch.filter([next_batch.requests[1].id])
 
-    generations, next_batch = default_seq2seq_lm.generate_token(next_batch)
+    generations, next_batch, _ = default_seq2seq_lm.generate_token(next_batch)
     assert next_batch is None
 
     assert len(generations) == 1
diff --git a/server/tests/utils/test_hub.py b/server/tests/utils/test_hub.py
index fac9a64d..721820f5 100644
--- a/server/tests/utils/test_hub.py
+++ b/server/tests/utils/test_hub.py
@@ -1,5 +1,13 @@
+import os
+import requests
+import tempfile
+
 import pytest
 
+import huggingface_hub.constants
+from huggingface_hub import hf_api
+
+import text_generation_server.utils.hub
 from text_generation_server.utils.hub import (
     weight_hub_files,
     download_weights,
@@ -10,6 +18,60 @@ from text_generation_server.utils.hub import (
 )
 
 
+@pytest.fixture()
+def offline():
+    hub = text_generation_server.utils.hub  # module whose offline flag is toggled
+    previous, hub.HF_HUB_OFFLINE = hub.HF_HUB_OFFLINE, True  # save, then force on
+    yield "offline"
+    hub.HF_HUB_OFFLINE = previous  # put the flag back once the test is done
+
+
+@pytest.fixture()
+def fresh_cache(monkeypatch):
+    # Redirect every hub cache setting to an empty temp dir. monkeypatch undoes each
+    # override on teardown, so a previously unset env var is removed, not overwritten.
+    with tempfile.TemporaryDirectory() as d:
+        monkeypatch.setattr(huggingface_hub.constants, "HUGGINGFACE_HUB_CACHE", d)
+        monkeypatch.setattr(
+            text_generation_server.utils.hub, "HUGGINGFACE_HUB_CACHE", d, raising=False
+        )
+        monkeypatch.setenv("HUGGINGFACE_HUB_CACHE", d)
+        yield
+
+
+@pytest.fixture()
+def prefetched():
+    repo = "bert-base-uncased"  # its safetensors weights get cached before the test
+    huggingface_hub.snapshot_download(
+        repo_id=repo,
+        repo_type="model",
+        revision="main",
+        allow_patterns=["*.safetensors"],
+        local_files_only=False,
+    )
+    yield repo
+
+
+def test_weight_hub_files_offline_error(offline, fresh_cache):
+    # Offline + empty cache: "gpt2" was never prefetched, so the lookup must raise.
+    with pytest.raises(EntryNotFoundError):
+        weight_hub_files("gpt2")
+
+
+def test_weight_hub_files_offline_ok(prefetched, offline):
+    # A prefetched model must resolve from the local cache while offline:
+    # exactly one weight file, named model.safetensors, all under one directory.
+    filenames = weight_hub_files(prefetched)
+    assert len(filenames) == 1
+    shared_dir = None
+    for path in filenames:
+        parent, basename = os.path.split(path)
+        if shared_dir is None:
+            shared_dir = parent
+        assert shared_dir == parent
+        assert basename == "model.safetensors"
+
+
 def test_weight_hub_files():
     filenames = weight_hub_files("bigscience/bloom-560m")
     assert filenames == ["model.safetensors"]
@@ -33,8 +95,11 @@ def test_download_weights():
     assert files == local_files
 
 
-def test_weight_files_error():
+def test_weight_files_revision_error():
     with pytest.raises(RevisionNotFoundError):
         weight_files("bigscience/bloom-560m", revision="error")
+
+
+def test_weight_files_not_cached_error(fresh_cache):
     with pytest.raises(LocalEntryNotFoundError):
         weight_files("bert-base-uncased")
diff --git a/server/tests/utils/test_layers.py b/server/tests/utils/test_layers.py
new file mode 100644
index 00000000..9a8da0d6
--- /dev/null
+++ b/server/tests/utils/test_layers.py
@@ -0,0 +1,77 @@
+import torch
+from text_generation_server.layers import (
+    TensorParallelEmbedding,
+)
+
+
+class ProcessGroup:
+    # Test-local process group exposing only rank() and size().
+    def __init__(self, rank: int, world_size: int):
+        self._rank, self.world_size = rank, world_size
+
+    def rank(self) -> int:
+        return self._rank
+
+    def size(self) -> int:
+        return self.world_size
+
+
+class Weights:
+    # Test-local weight source: one (vocab_size, hidden_dim) matrix filled with 0..N-1.
+    def __init__(self, rank: int, world_size: int, vocab_size: int, hidden_dim: int):
+        total = vocab_size * hidden_dim
+        self.weight = torch.arange(total).float().view(vocab_size, hidden_dim)
+        self.process_group = ProcessGroup(rank, world_size)
+
+    def get_shape(self, name: str):
+        # `name` is ignored; there is only one tensor.
+        return self.weight.shape
+
+    def get_partial_sharded(self, name: str, dim: int):
+        # Return this rank's contiguous row block; only row sharding is supported.
+        assert dim == 0
+        group = self.process_group
+        n_rows = self.weight.shape[dim]
+        # Ceil division: every rank gets the same block size, the last may run short.
+        rows_per_rank = (n_rows + group.size() - 1) // group.size()
+        first = group.rank() * rows_per_rank
+        last = first + rows_per_rank
+        return self.weight[first:last]
+
+
+def test_weight_hub_files_offline_error():
+    # NOTE(review): despite its name, this exercises TensorParallelEmbedding sharding.
+    vocab_size = 17
+    weights = Weights(rank=0, world_size=1, vocab_size=vocab_size, hidden_dim=256)
+    embeddings = TensorParallelEmbedding("", weights)
+    # Unsharded reference: with a single rank the lookup must return the table itself.
+    input_ids = torch.arange(vocab_size)
+    output = embeddings.forward(input_ids)
+    assert embeddings.min_id == 0
+    assert embeddings.max_id == 17
+    torch.testing.assert_close(output, torch.arange(256 * 17).float().view(17, 256))
+    # Two ranks: ceil(17 / 2) = 9 rows for rank 0, the remaining 8 for rank 1.
+    weights_0_2 = Weights(rank=0, world_size=2, vocab_size=vocab_size, hidden_dim=256)
+    weights_1_2 = Weights(rank=1, world_size=2, vocab_size=vocab_size, hidden_dim=256)
+    embeddings_0_2 = TensorParallelEmbedding("", weights_0_2, reduce=False)
+    assert embeddings_0_2.min_id == 0
+    assert embeddings_0_2.max_id == 9
+    torch.testing.assert_close(
+        embeddings_0_2.weight,  # expected: 9 owned rows plus one trailing all-zero row
+        torch.cat([torch.arange(9 * 256), torch.zeros(256)], dim=0)
+        .view(10, 256)
+        .float(),
+    )
+    embeddings_1_2 = TensorParallelEmbedding("", weights_1_2, reduce=False)
+    assert embeddings_1_2.min_id == 9
+    assert embeddings_1_2.max_id == 17
+    torch.testing.assert_close(
+        embeddings_1_2.weight,  # expected: 8 owned rows plus one trailing all-zero row
+        torch.cat([torch.arange(8 * 256) + 9 * 256, torch.zeros(256)], dim=0)
+        .view(9, 256)
+        .float(),
+    )
+    output_tp_0 = embeddings_0_2.forward(input_ids)
+    output_tp_1 = embeddings_1_2.forward(input_ids)
+    # The two per-rank outputs (built with reduce=False) must sum to the unsharded one.
+    torch.testing.assert_close(output, output_tp_0 + output_tp_1)
diff --git a/server/tests/utils/test_tokens.py b/server/tests/utils/test_tokens.py
index 4187ff25..5db32776 100644
--- a/server/tests/utils/test_tokens.py
+++ b/server/tests/utils/test_tokens.py
@@ -45,21 +45,44 @@ def test_stopping_criteria_max():
     assert criteria(1, "") == (False, None)
     assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH)
 
+
 def test_batch_top_tokens():
     top_n_tokens = [0, 2, 3, 4, 5]
     top_n_tokens_tensor = torch.tensor(top_n_tokens)
-    inp_logprobs = torch.tensor([[-1., -3., -4., -2., -3.]] * 5)
+    inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5)
+    accepted_ids = torch.ones_like(top_n_tokens_tensor)
 
-    topn_tok_ids, topn_tok_logprobs = batch_top_tokens(top_n_tokens, top_n_tokens_tensor, inp_logprobs)
+    topn_tok_ids, topn_tok_logprobs = batch_top_tokens(
+        top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids
+    )
 
-    assert topn_tok_ids[0] == []
-    assert topn_tok_ids[1] == [0, 3]
-    assert topn_tok_ids[2] == [0, 3, 1, 4]
-    assert topn_tok_ids[3] == [0, 3, 1, 4]
-    assert topn_tok_ids[4] == [0, 3, 1, 4, 2]
+    assert topn_tok_ids[0] == [[]]
+    assert topn_tok_ids[1] == [[0, 3]]
+    assert topn_tok_ids[2] == [[0, 3, 1, 4]]
+    assert topn_tok_ids[3] == [[0, 3, 1, 4]]
+    assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]]
 
-    assert topn_tok_logprobs[0] == []
-    assert topn_tok_logprobs[1] == [-1, -2]
-    assert topn_tok_logprobs[2] == [-1, -2, -3, -3]
-    assert topn_tok_logprobs[3] == [-1, -2, -3, -3]
-    assert topn_tok_logprobs[4] == [-1, -2, -3, -3, -4]
+    assert topn_tok_logprobs[0] == [[]]
+    assert topn_tok_logprobs[1] == [[-1, -2]]
+    assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]]
+    assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]]
+    assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]]
+
+    # Now make the second member of the batch speculated (two accepted ids).
+    inp_logprobs = torch.tensor([[-1.0, -3.0, -4.0, -2.0, -3.0]] * 5 * 2)
+    accepted_ids[1] = 2
+    topn_tok_ids, topn_tok_logprobs = batch_top_tokens(
+        top_n_tokens, top_n_tokens_tensor, inp_logprobs, accepted_ids
+    )
+
+    assert topn_tok_ids[0] == [[]]
+    assert topn_tok_ids[1] == [[0, 3], [0, 3]]
+    assert topn_tok_ids[2] == [[0, 3, 1, 4]]
+    assert topn_tok_ids[3] == [[0, 3, 1, 4]]
+    assert topn_tok_ids[4] == [[0, 3, 1, 4, 2]]
+
+    assert topn_tok_logprobs[0] == [[]]
+    assert topn_tok_logprobs[1] == [[-1, -2], [-1, -2]]
+    assert topn_tok_logprobs[2] == [[-1, -2, -3, -3]]
+    assert topn_tok_logprobs[3] == [[-1, -2, -3, -3]]
+    assert topn_tok_logprobs[4] == [[-1, -2, -3, -3, -4]]
diff --git a/server/tests/utils/test_weights.py b/server/tests/utils/test_weights.py
new file mode 100644
index 00000000..8f88b1f8
--- /dev/null
+++ b/server/tests/utils/test_weights.py
@@ -0,0 +1,1152 @@
+import pytest
+import torch
+from text_generation_server.utils.weights import Weights
+from text_generation_server.layers.gptq import GPTQWeight
+from text_generation_server.layers.exl2 import Exl2Weight
+from text_generation_server.layers.marlin import MarlinWeight
+from types import SimpleNamespace
+from typing import List, Optional, Dict, Union
+from pathlib import Path
+
+dummy_file_system = {
+    "test_weights": {
+        "layer.0.weight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+            ],
+            dtype=torch.float32,
+        ),
+    },
+    "test_weights_2": {
+        "layer.1337.weight": torch.tensor(
+            [
+                [1, 2, 3, 4],
+                [5, 6, 7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    },
+    "test_get_weights_col_packed": {
+        "weight.weight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    },
+    "test_get_multi_weights_col": {
+        "weight.weight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+        "weight.weight": torch.tensor(  # NOTE(review): duplicate key - this entry silently replaces the identical one above
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    },
+    "test_get_multi_weights_row": {
+        "weight.weight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    },
+    "test_get_weights_col_gptq": {
+        "weight.qweight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
+        "weight.qzeros": torch.tensor(
+            [
+                [0, 1],
+                [1, 0],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.scales": torch.tensor(
+            [
+                [100.0, 100.0],
+                [100.0, 100.0],
+            ],
+            dtype=torch.float16,
+        ),
+        "gptq_bits": torch.tensor([8], dtype=torch.float32),
+        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
+    },
+    "test_get_weights_col_marlin": {
+        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
+        "weight.s": torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
+    },
+    "test_get_multi_weights_row_gptq": {
+        "weight.qweight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
+        "weight.qzeros": torch.tensor(
+            [
+                [0, 1],
+                [1, 0],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.scales": torch.tensor(
+            [
+                [100.0, 100.0],
+                [100.0, 100.0],
+            ],
+            dtype=torch.float16,
+        ),
+        "gptq_bits": torch.tensor([8], dtype=torch.float32),
+        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
+    },
+    "test_get_multi_weights_col_gptq": {
+        "weight.qweight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
+        "weight.qzeros": torch.tensor(
+            [
+                [0, 1],
+                [1, 0],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.scales": torch.tensor(
+            [
+                [100.0, 100.0],
+                [100.0, 100.0],
+            ],
+            dtype=torch.float16,
+        ),
+        "gptq_bits": torch.tensor([8], dtype=torch.float32),
+        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
+    },
+    "test_get_weights_col_packed_gptq": {
+        "weight.qweight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.g_idx": torch.tensor([0, 1, 0, 1], dtype=torch.int32),
+        "weight.qzeros": torch.tensor(
+            [
+                [0, 1],
+                [1, 0],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.scales": torch.tensor(
+            [
+                [100.0, 100.0],
+                [100.0, 100.0],
+            ],
+            dtype=torch.float16,
+        ),
+        "gptq_bits": torch.tensor([8], dtype=torch.float32),
+        "gptq_groupsize": torch.tensor([2], dtype=torch.float32),
+    },
+    "test_get_weights_col_packed_exl2": {
+        "weight.q_weight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
+        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
+        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
+        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
+    },
+    "test_get_multi_weights_row_exl2": {
+        "weight.q_weight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
+        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
+        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
+        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
+    },
+    "test_get_multi_weights_col_exl2": {
+        "weight.q_weight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
+        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
+        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
+        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
+    },
+    "test_get_weights_col_exl2": {
+        "weight.q_weight": torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.int32,
+        ),
+        "weight.q_scale": torch.tensor([8], dtype=torch.int32),
+        "weight.q_invperm": torch.tensor([1, 0, 3, 2], dtype=torch.int32),
+        "weight.q_scale_max": torch.tensor([100], dtype=torch.float16),
+        "weight.q_groups": torch.tensor([4], dtype=torch.int16),
+    },
+    "test_get_multi_weights_row_marlin": {
+        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
+        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
+    },
+    "test_get_multi_weights_col_marlin": {
+        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
+        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
+    },
+    "test_get_weights_col_packed_marlin": {
+        "weight.B": torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
+        "weight.s": torch.tensor([[0.5], [0.25]], dtype=torch.float16),
+    },
+}
+
+
+class MockSlice:  # stand-in for a tensor slice: exposes get_shape() and indexing
+    def __init__(self, tensor):
+        self.tensor = tensor
+
+    def get_shape(self):
+        return self.tensor.shape
+
+    def __getitem__(self, idx):
+        return self.tensor[idx]  # indexing is forwarded to the wrapped tensor
+
+
+def mock_get_slice(tensor_name, filename):
+    tensor = dummy_file_system[filename][tensor_name]  # always reads the module-level dict
+    return MockSlice(tensor)
+
+
+def mock_handle(filename, device, dtype):  # device and dtype are accepted but unused
+    return SimpleNamespace(
+        get_slice=lambda tensor_name: mock_get_slice(tensor_name, filename)
+    )
+
+
+class MockSafeOpen:  # context manager listing the tensor names of one mock file
+    def __init__(self, filename, framework, dummy_fs):
+        self.filename = filename
+        self.framework = framework  # stored only; never read in this class
+        self.dummy_fs = dummy_fs
+
+    def keys(self):
+        return list(self.dummy_fs[self.filename].keys())
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass  # nothing to release; exceptions propagate
+
+
+class MockWeights(Weights):
+    def __init__(
+        self,
+        filenames: List[Union[Path, str]],
+        device,
+        dtype,
+        process_group,
+        dummy_fs,
+        aliases: Optional[Dict[str, List[str]]] = None,
+        prefix: Optional[str] = None,
+    ):
+        routing = {}
+        self.dummy_fs = dummy_fs
+        for filename in filenames:
+            with MockSafeOpen(filename, framework="pytorch", dummy_fs=dummy_fs) as f:
+                for k in f.keys():
+                    if k in routing:
+                        raise RuntimeError(
+                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
+                        )
+                    routing[k] = filename
+        if aliases is None:
+            aliases = {}
+        self.aliases = aliases
+        self.routing = routing
+        self.device = device
+        self.dtype = dtype
+        self.process_group = process_group
+        self.prefix = prefix
+        self._handles = {}
+
+    def _get_handle(self, filename: Union[Path, str]):
+        if filename in self._handles:
+            return self._handles[filename]
+        else:
+            handle = mock_handle(filename, self.device, self.dtype)
+            self._handles[filename] = handle
+            return handle
+
+    def get_shape(self, tensor_name: str):
+        filename, _ = self.get_filename(tensor_name)
+        handle = self._get_handle(filename)
+        return handle.get_slice(tensor_name).get_shape()
+
+    def get_tensor(self, tensor_name: str):
+        filename, _ = self.get_filename(tensor_name)
+        handle = self._get_handle(filename)
+        return handle.get_slice(tensor_name).tensor
+
+
+dummy_process_group = SimpleNamespace(rank=lambda: 0, size=lambda: 1)  # rank 0 in a group of size 1
+
+
+def test_weights():
+    weights = MockWeights(
+        [
+            "test_weights",
+            "test_weights_2",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+    assert weights.get_shape("layer.0.weight") == (2, 2)
+    assert weights.get_tensor("layer.1337.weight").shape == (2, 4)
+
+
+def test_get_tensor():
+    weights = MockWeights(
+        [
+            "test_weights",
+            "test_weights_2",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+    assert torch.allclose(
+        weights.get_tensor("layer.0.weight"),
+        torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+            ],
+            dtype=torch.float32,
+        ),
+    )
+    assert torch.allclose(
+        weights.get_tensor("layer.1337.weight"),
+        torch.tensor(
+            [
+                [1, 2, 3, 4],
+                [5, 6, 7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    )
+
+
+def test_get_weights_col_packed():
+
+    weights = MockWeights(
+        [
+            "test_get_weights_col_packed",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = None
+    block_sizes = 1
+
+    w = weights.get_weights_col_packed(
+        prefix=prefix,
+        quantize=quantize,
+        block_sizes=block_sizes,
+    )
+
+    assert torch.allclose(
+        w,
+        torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    )
+
+
+def test_get_weights_col_packed_block_size():
+
+    weights = MockWeights(
+        [
+            "test_get_weights_col_packed",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = None
+    block_sizes = 2
+
+    w = weights.get_weights_col_packed(
+        prefix=prefix,
+        quantize=quantize,
+        block_sizes=block_sizes,
+    )
+
+    assert torch.allclose(
+        w,
+        torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    )
+
+
+def test_get_weights_col_packed_block_size_arr():
+
+    weights = MockWeights(
+        [
+            "test_get_weights_col_packed",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = None
+    block_sizes = [1, 1]
+
+    w = weights.get_weights_col_packed(
+        prefix=prefix,
+        quantize=quantize,
+        block_sizes=block_sizes,
+    )
+
+    assert torch.allclose(
+        w,
+        torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    )
+
+
+def test_get_multi_weights_col():
+    weights = MockWeights(
+        [
+            "test_get_multi_weights_col",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefixes = ["weight", "weight"]
+    quantize = None
+
+    w = weights.get_multi_weights_col(
+        prefixes=prefixes,
+        quantize=quantize,
+        dim=0,
+    )
+
+    assert torch.allclose(
+        w,
+        torch.tensor(
+            [
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+                [1, 2],
+                [3, 4],
+                [5, 6],
+                [7, 8],
+            ],
+            dtype=torch.float32,
+        ),
+    )
+
+
+def test_get_multi_weights_row():
+    weights = MockWeights(
+        [
+            "test_get_multi_weights_row",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = None
+
+    w = weights.get_multi_weights_row(
+        prefix=prefix,
+        quantize=quantize,
+    )
+
+    assert torch.allclose(
+        w,
+        torch.tensor(
+            [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]],
+            dtype=torch.float32,
+        ),
+    )
+
+
+# test_get_weights_col
+
+
+def test_get_weights_col_awq():
+    weights = MockWeights(
+        [
+            "test_get_weights_col_gptq",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "awq"
+
+    w = weights.get_weights_col(
+        prefix=prefix,
+        quantize=quantize,
+    )
+
+    expected_weight = GPTQWeight(
+        qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]),
+        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
+        scales=torch.tensor(
+            [[100.0, 100.0], [100.0, 100.0]],
+            dtype=torch.float16,
+        ),
+        g_idx=None,
+        bits=8.0,
+        groupsize=2.0,
+        use_exllama=False,
+    )
+
+    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
+    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
+    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
+    assert w.g_idx == expected_weight.g_idx, "g_idx mismatch"
+    assert w.bits == expected_weight.bits, "bits mismatch"
+    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
+    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"
+
+
+def test_get_weights_col_gtpq():  # NOTE(review): "gtpq" is presumably a typo for "gptq"
+    weights = MockWeights(
+        [
+            "test_get_weights_col_gptq",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "gptq"
+
+    w = weights.get_weights_col(
+        prefix=prefix,
+        quantize=quantize,
+    )
+
+    expected_weight = GPTQWeight(
+        qweight=torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]),
+        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
+        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
+        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
+        bits=8.0,
+        groupsize=2.0,
+        use_exllama=False,
+    )
+
+    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
+    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
+    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
+    assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch"
+    assert w.bits == expected_weight.bits, "bits mismatch"
+    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
+    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"
+
+
+def test_get_weights_col_exl2():
+    weights = MockWeights(
+        [
+            "test_get_weights_col_exl2",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "exl2"
+
+    w = weights.get_weights_col(
+        prefix=prefix,
+        quantize=quantize,
+    )
+
+    scaled_scale_max = 0.3906 * 256
+    expected_weight = Exl2Weight(
+        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        q_scale=torch.tensor([8], dtype=torch.int32),
+        q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16),
+        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
+        q_groups=torch.tensor([4], dtype=torch.int16),
+    )
+
+    assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch"
+    assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch"
+    assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch"
+    assert torch.allclose(
+        w.q_scale_max, expected_weight.q_scale_max
+    ), "q_scale_max mismatch"
+    assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch"
+
+
+def test_get_weights_col_marlin():
+    weights = MockWeights(
+        [
+            "test_get_weights_col_marlin",
+        ],
+        device="cpu",
+        dtype=torch.float16,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "marlin"
+
+    w = weights.get_weights_col(
+        prefix=prefix,
+        quantize=quantize,
+    )
+
+    expected_weight = MarlinWeight(
+        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
+        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
+    )
+
+    assert torch.allclose(w.B, expected_weight.B), "B mismatch"
+    assert torch.allclose(w.s, expected_weight.s), "s mismatch"
+
+
+# test_get_weights_col_packed
+
+
+def test_get_weights_col_packed_awq():
+    weights = MockWeights(
+        [
+            "test_get_weights_col_packed_gptq",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "awq"
+    block_sizes = 1
+
+    w = weights.get_weights_col_packed(
+        prefix=prefix,
+        quantize=quantize,
+        block_sizes=block_sizes,
+    )
+
+    expected_weight = GPTQWeight(
+        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
+        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
+        g_idx=None,
+        bits=8.0,
+        groupsize=2.0,
+        use_exllama=False,
+    )
+
+    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
+    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
+    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
+    assert w.g_idx == expected_weight.g_idx, "g_idx mismatch"
+    assert w.bits == expected_weight.bits, "bits mismatch"
+    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
+    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"
+
+
+@pytest.mark.skip(reason="Review expected functionality")
+def test_get_weights_col_packed_exl2():
+    weights = MockWeights(
+        [
+            "test_get_weights_col_packed_exl2",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "exl2"
+    block_sizes = 1
+
+    w = weights.get_weights_col_packed(
+        prefix=prefix,
+        quantize=quantize,
+        block_sizes=block_sizes,
+    )
+
+    scaled_scale_max = 0.3906 * 256
+    expected_weight = Exl2Weight(
+        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        q_scale=torch.tensor([8], dtype=torch.int32),
+        q_invperm=torch.tensor([1], dtype=torch.int16),
+        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
+        q_groups=torch.tensor([4], dtype=torch.int16),
+    )
+
+    assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch"
+    assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch"
+    assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch"
+    assert torch.allclose(
+        w.q_scale_max, expected_weight.q_scale_max
+    ), "q_scale_max mismatch"
+    assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch"
+
+
+def test_get_weights_col_packed_gptq():
+    weights = MockWeights(
+        [
+            "test_get_weights_col_packed_gptq",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefixes = ["weight"]
+    quantize = "gptq"
+
+    w = weights.get_multi_weights_col(  # NOTE(review): test name says col_packed, but get_multi_weights_col is called - confirm intent
+        prefixes=prefixes,
+        quantize=quantize,
+        dim=0,
+    )
+
+    expected_weight = GPTQWeight(
+        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
+        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
+        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
+        bits=8.0,
+        groupsize=2.0,
+        use_exllama=False,
+    )
+
+    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
+    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
+    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
+    assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch"
+    assert w.bits == expected_weight.bits, "bits mismatch"
+    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
+    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"
+
+
+def test_get_weights_col_packed_marlin():
+    weights = MockWeights(
+        [
+            "test_get_weights_col_packed_marlin",
+        ],
+        device="cpu",
+        dtype=torch.float16,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "marlin"
+
+    w = weights.get_multi_weights_col(
+        prefixes=[prefix],
+        quantize=quantize,
+        dim=0,
+    )
+
+    expected_weight = MarlinWeight(
+        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
+        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
+    )
+
+    print(expected_weight)
+
+    assert torch.allclose(w.B, expected_weight.B), "B mismatch"
+    assert torch.allclose(w.s, expected_weight.s), "s mismatch"
+
+
+# test_get_multi_weights_col
+
+
+def test_get_multi_weights_col_awq():
+    weights = MockWeights(
+        [
+            "test_get_multi_weights_col_gptq",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefixes = ["weight"]
+    quantize = "awq"
+
+    w = weights.get_multi_weights_col(
+        prefixes=prefixes,
+        quantize=quantize,
+        dim=0,
+    )
+
+    expected_weight = GPTQWeight(
+        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
+        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
+        g_idx=None,
+        bits=8.0,
+        groupsize=2.0,
+        use_exllama=False,
+    )
+
+    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
+    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
+    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
+    assert w.g_idx == expected_weight.g_idx, "g_idx mismatch"
+    assert w.bits == expected_weight.bits, "bits mismatch"
+    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
+    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"
+
+
+def test_get_multi_weights_col_exl2():
+    weights = MockWeights(
+        [
+            "test_get_multi_weights_col_exl2",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "exl2"
+
+    try:
+        w = weights.get_multi_weights_col(
+            prefixes=[prefix],
+            quantize=quantize,
+            dim=0,
+        )
+    except ValueError as e:
+        assert e.args[0] == "get_multi_weights_col is not supported for exl2"
+
+
+def test_get_multi_weights_col_gptq():
+    weights = MockWeights(
+        [
+            "test_get_multi_weights_col_gptq",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefixes = ["weight"]
+    quantize = "gptq"
+
+    w = weights.get_multi_weights_col(
+        prefixes=prefixes,
+        quantize=quantize,
+        dim=0,
+    )
+
+    expected_weight = GPTQWeight(
+        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
+        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
+        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
+        bits=8.0,
+        groupsize=2.0,
+        use_exllama=False,
+    )
+
+    assert torch.allclose(w.qweight, expected_weight.qweight), "qweight mismatch"
+    assert torch.allclose(w.qzeros, expected_weight.qzeros), "qzeros mismatch"
+    assert torch.allclose(w.scales, expected_weight.scales), "scales mismatch"
+    assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch"
+    assert w.bits == expected_weight.bits, "bits mismatch"
+    assert w.groupsize == expected_weight.groupsize, "groupsize mismatch"
+    assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch"
+
+
+def test_get_multi_weights_col_marlin():
+    """Check ``get_multi_weights_col`` with ``quantize="marlin"`` against fixed expected tensors."""
+    mock_weights = MockWeights(
+        ["test_get_multi_weights_col_marlin"],
+        device="cpu",
+        dtype=torch.float16,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    loaded = mock_weights.get_multi_weights_col(
+        prefixes=["weight"],
+        quantize="marlin",
+        dim=0,
+    )
+
+    expected = MarlinWeight(
+        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
+        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
+    )
+
+    for field in ("B", "s"):
+        assert torch.allclose(
+            getattr(loaded, field), getattr(expected, field)
+        ), f"{field} mismatch"
+
+
+# test_get_multi_weights_row
+
+
+def test_get_multi_weights_row_awq():
+    """Check ``get_multi_weights_row`` with ``quantize="awq"``; the expected ``g_idx`` is ``None``."""
+    # NOTE(review): this test loads the "test_get_multi_weights_row_gptq" entry,
+    # not an awq-specific one — presumably the fixture is shared on purpose; confirm.
+    mock_weights = MockWeights(
+        ["test_get_multi_weights_row_gptq"],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    loaded = mock_weights.get_multi_weights_row(
+        prefix="weight",
+        quantize="awq",
+    )
+
+    expected = GPTQWeight(
+        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
+        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
+        g_idx=None,
+        bits=8.0,
+        groupsize=2.0,
+        use_exllama=False,
+    )
+
+    # Tensor fields are compared numerically; g_idx and the settings by equality.
+    for field in ("qweight", "qzeros", "scales"):
+        assert torch.allclose(
+            getattr(loaded, field), getattr(expected, field)
+        ), f"{field} mismatch"
+    for field in ("g_idx", "bits", "groupsize", "use_exllama"):
+        assert getattr(loaded, field) == getattr(expected, field), f"{field} mismatch"
+
+
+def test_get_multi_weights_row_exl2():
+    """Check ``get_multi_weights_row`` with ``quantize="exl2"`` against fixed expected tensors."""
+    weights = MockWeights(
+        [
+            "test_get_multi_weights_row_exl2",
+        ],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    prefix = "weight"
+    quantize = "exl2"
+
+    w = weights.get_multi_weights_row(
+        prefix=prefix,
+        quantize=quantize,
+    )
+
+    # The expected q_scale_max is the literal 0.3906 multiplied by 256.
+    scaled_scale_max = 0.3906 * 256
+    expected_weight = Exl2Weight(
+        q_weight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        q_scale=torch.tensor([8], dtype=torch.int32),
+        q_invperm=torch.tensor([1, 0, 3, 2], dtype=torch.int16),
+        q_scale_max=torch.tensor([scaled_scale_max], dtype=torch.float16),
+        q_groups=torch.tensor([4], dtype=torch.int16),
+    )
+
+    assert torch.allclose(w.q_weight, expected_weight.q_weight), "q_weight mismatch"
+    assert torch.allclose(w.q_scale, expected_weight.q_scale), "q_scale mismatch"
+    assert torch.allclose(w.q_invperm, expected_weight.q_invperm), "q_invperm mismatch"
+    assert torch.allclose(
+        w.q_scale_max, expected_weight.q_scale_max
+    ), "q_scale_max mismatch"
+    assert torch.allclose(w.q_groups, expected_weight.q_groups), "q_groups mismatch"
+
+
+def test_get_multi_weights_row_gptq():
+    """Check ``get_multi_weights_row`` with ``quantize="gptq"`` against fixed expected tensors."""
+    mock_weights = MockWeights(
+        ["test_get_multi_weights_row_gptq"],
+        device="cpu",
+        dtype=torch.float32,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    loaded = mock_weights.get_multi_weights_row(
+        prefix="weight",
+        quantize="gptq",
+    )
+
+    expected = GPTQWeight(
+        qweight=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8]], dtype=torch.int32),
+        qzeros=torch.tensor([[0, 1], [1, 0]], dtype=torch.int32),
+        scales=torch.tensor([[100.0, 100.0], [100.0, 100.0]], dtype=torch.float16),
+        g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32),
+        bits=8.0,
+        groupsize=2.0,
+        use_exllama=False,
+    )
+
+    # Tensor fields are compared numerically, plain settings by equality.
+    for field in ("qweight", "qzeros", "scales", "g_idx"):
+        assert torch.allclose(
+            getattr(loaded, field), getattr(expected, field)
+        ), f"{field} mismatch"
+    for field in ("bits", "groupsize", "use_exllama"):
+        assert getattr(loaded, field) == getattr(expected, field), f"{field} mismatch"
+
+
+def test_get_multi_weights_row_marlin():
+    """Check ``get_multi_weights_row`` with ``quantize="marlin"`` against fixed expected tensors."""
+    mock_weights = MockWeights(
+        ["test_get_multi_weights_row_marlin"],
+        device="cpu",
+        dtype=torch.float16,
+        process_group=dummy_process_group,
+        dummy_fs=dummy_file_system,
+    )
+
+    loaded = mock_weights.get_multi_weights_row(
+        prefix="weight",
+        quantize="marlin",
+    )
+
+    expected = MarlinWeight(
+        B=torch.tensor([[1, 2], [3, 4]], dtype=torch.int32),
+        s=torch.tensor([[0.5000], [0.2500]], dtype=torch.float16),
+    )
+
+    for field in ("B", "s"):
+        assert torch.allclose(
+            getattr(loaded, field), getattr(expected, field)
+        ), f"{field} mismatch"
diff --git a/server/text_generation_server/adapters/__init__.py b/server/text_generation_server/adapters/__init__.py
new file mode 100644
index 00000000..8697cb9e
--- /dev/null
+++ b/server/text_generation_server/adapters/__init__.py
@@ -0,0 +1,13 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/adapters/__init__.py
+# License:  Apache License Version 2.0, January 2004
+
+from text_generation_server.adapters.weights import (
+    AdapterBatchData,
+    AdapterBatchMetadata,
+)
+
+__all__ = [
+    "AdapterBatchData",
+    "AdapterBatchMetadata",
+]
diff --git a/server/text_generation_server/adapters/config.py b/server/text_generation_server/adapters/config.py
new file mode 100644
index 00000000..5261d4b5
--- /dev/null
+++ b/server/text_generation_server/adapters/config.py
@@ -0,0 +1,44 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/adapters/config.py
+# License:  Apache License Version 2.0, January 2004
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, Optional, Set, Tuple
+
+import torch
+
+from text_generation_server.adapters.weights import AdapterWeights
+
+if TYPE_CHECKING:
+    from text_generation_server.models.model import Model
+
+
+@dataclass
+class ModuleMap:
+    """Associates a module name with the adapter tensors that belong to it."""
+
+    # Name of the module the weights are attached to.
+    module_name: str
+    # Maps a string key to a ``(tensor, name)`` pair.
+    # NOTE(review): presumably the key is the adapter weight kind and the second
+    # tuple element is the weight's name in the adapter checkpoint — confirm.
+    module_weights: Dict[str, Tuple[torch.Tensor, str]]
+
+
+@dataclass
+class AdapterConfig(ABC):
+    """Abstract base for adapter configurations.
+
+    Concrete subclasses must implement both abstract methods below.
+    """
+
+    # Identifier of the base model the adapter targets.
+    base_model_name_or_path: str
+
+    @abstractmethod
+    def map_weights_for_model(
+        self,
+        adapter_weights: Dict[int, AdapterWeights],
+        weight_names: Tuple[str],
+    ) -> Tuple[ModuleMap, Set[str]]:
+        """Select the adapter weights that apply to the given model weight names.
+
+        Args:
+            adapter_weights: The weights shipped with the adapter.
+            weight_names: Names of the model weights to look up.
+
+        Returns:
+            A pair ``(module_map, adapter_weight_names)``.
+            NOTE(review): presumably the set holds the adapter weight names that
+            were matched — confirm against the implementations.
+        """
+        pass
+
+    @abstractmethod
+    def load_batched_adapter_weights(
+        self,
+        model: "Model",
+        module_map: ModuleMap,
+        layer_type: str,
+        unused_weight_names: Set[str],
+        dynamic: bool,
+    ) -> Optional[AdapterWeights]:
+        """Build the adapter weights for one layer type of ``model``.
+
+        Returns:
+            The loaded weights, or ``None`` (the return type is optional).
+        """
+        pass
diff --git a/server/text_generation_server/adapters/lora.py b/server/text_generation_server/adapters/lora.py
new file mode 100644
index 00000000..87543be2
--- /dev/null
+++ b/server/text_generation_server/adapters/lora.py
@@ -0,0 +1,482 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/adapters/lora.py
+# License:  Apache License Version 2.0, January 2004
+
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Type, Union
+
+import torch
+from peft import LoraConfig as _LoraConfig
+from torch.distributed import ProcessGroup
+
+from text_generation_server.adapters.config import AdapterConfig, ModuleMap
+
+from text_generation_server.adapters.weights import (
+    AdapterBatchMetadata,
+    AdapterWeights,
+    BatchAdapterWeights,
+)
+from text_generation_server.utils.sgmv import (
+    BGMV_MAX_RANK,
+    MAX_RANK_CUSTOM,
+    get_tmp_tensors,
+    orient_for_rank,
+    pad_rank,
+    use_cutlass_shrink,
+)
+
+if TYPE_CHECKING:
+    from text_generation_server.models.model import Model
+
+
+def get_start_stop_idxs_for_rank(offset, size, rank, world_size):
+    """Return the ``(start, stop)`` index pair of the block assigned to ``rank``.
+
+    ``size`` is split into ``world_size`` blocks of ``size // world_size``
+    elements each (any remainder is not assigned to a rank), and the block of
+    ``rank`` is shifted by ``offset``.
+    """
+    per_rank = size // world_size
+    bounds = tuple(offset + step * per_rank for step in (rank, rank + 1))
+    return bounds
+
+
+def shard_on_dim(
+    t: torch.Tensor, dim: int, process_group: torch.distributed.ProcessGroup
+):
+    """Return the slice of ``t`` along ``dim`` that belongs to this process.
+
+    The slice bounds come from ``get_start_stop_idxs_for_rank`` using the
+    rank and size reported by ``process_group``. Only ``dim`` 0 and 1 are
+    supported.
+
+    Raises:
+        NotImplementedError: If ``dim`` is neither 0 nor 1.
+    """
+    start, stop = get_start_stop_idxs_for_rank(
+        0, t.shape[dim], process_group.rank(), process_group.size()
+    )
+
+    if dim == 0:
+        return t[start:stop]
+    if dim == 1:
+        return t[:, start:stop]
+    raise NotImplementedError("Let's make that generic when needed")
+
+
+def shard_lora_weights(
+    weights_a: List[torch.Tensor],
+    weights_b: List[torch.Tensor],
+    split_dim: int,
+    process_group: ProcessGroup,
+) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+    """Shard every LoRA A and B matrix for the current process.
+
+    A matrices are sharded along ``split_dim``; B matrices are always sharded
+    along dimension 1.
+
+    Returns:
+        The pair ``(sharded_a, sharded_b)``, in the input order.
+    """
+    # [hidden_size, r]
+    sharded_a = []
+    for matrix in weights_a:
+        sharded_a.append(
+            shard_on_dim(matrix, dim=split_dim, process_group=process_group)
+        )
+
+    # [r, hidden_size]
+    sharded_b = []
+    for matrix in weights_b:
+        sharded_b.append(shard_on_dim(matrix, dim=1, process_group=process_group))
+
+    return sharded_a, sharded_b
+
+
+@dataclass
+class LoraConfig(AdapterConfig):
+    """Adapter configuration for LoRA, populated from a PEFT ``LoraConfig``."""
+
+    r: int
+    target_modules: Optional[Union[List[str], str]]
+    fan_in_fan_out: bool
+    lora_alpha: int
+    use_rslora: bool
+
+    def map_weights_for_model(
+        self,
+        adapter_weights: Dict[int, AdapterWeights],
+        weight_names: Tuple[str],
+    ) -> Tuple[ModuleMap, Set[str]]:
+        """Pick the ``lora_A``/``lora_B`` pairs available for ``weight_names``.
+
+        A weight name is kept only if both its
+        ``base_model.model.<name>.lora_A.weight`` and ``...lora_B.weight``
+        entries exist in ``adapter_weights``.
+
+        Returns:
+            ``(module_map, adapter_weight_names)``: a dict from weight name to
+            ``{"lora_A": (weight, name), "lora_B": (weight, name)}`` and the
+            set of adapter weight names that were used.
+        """
+        matched_names = set()
+        per_module = {}
+        for weight_name in weight_names:
+            adapter_names = {
+                kind: f"base_model.model.{weight_name}.{kind}.weight"
+                for kind in ("lora_A", "lora_B")
+            }
+            if any(name not in adapter_weights for name in adapter_names.values()):
+                continue
+
+            per_module[weight_name] = {
+                kind: (adapter_weights[name], name)
+                for kind, name in adapter_names.items()
+            }
+            matched_names.update(adapter_names.values())
+        return per_module, matched_names
+
+    def load_batched_adapter_weights(
+        self,
+        model: "Model",
+        module_map: Dict[str, Dict],
+        layer_type: str,
+        unused_weight_names: Set[str],
+        dynamic: bool,
+    ) -> Optional[AdapterWeights]:
+        """Delegate to ``LoraWeights.load``; ``dynamic`` is not forwarded."""
+        return LoraWeights.load(
+            config=self,
+            model=model,
+            module_map=module_map,
+            layer_type=layer_type,
+            unused_weight_names=unused_weight_names,
+        )
+
+    @classmethod
+    def load(cls, adapter_id: str, api_token: str) -> "LoraConfig":
+        """Build a config from the PEFT configuration of ``adapter_id``.
+
+        ``use_rslora`` falls back to ``False`` when the PEFT config has no
+        such attribute.
+        """
+        peft_config = _LoraConfig.from_pretrained(adapter_id, token=api_token)
+        rslora_enabled = getattr(peft_config, "use_rslora", False)
+        return cls(
+            base_model_name_or_path=peft_config.base_model_name_or_path,
+            r=peft_config.r,
+            target_modules=peft_config.target_modules,
+            fan_in_fan_out=peft_config.fan_in_fan_out,
+            lora_alpha=peft_config.lora_alpha,
+            use_rslora=rslora_enabled,
+        )
+
+
+class LoraWeights(AdapterWeights):
+    """LoRA weights for a single adapter merged across all layers.
+
+    The per-layer A and B matrices are stacked into two tensors. Both can be
+    exposed in two orientations; ``_is_transposed`` records which one is
+    currently stored, and the ``weights_*`` / ``weights_*_t`` properties flip
+    the stored tensors on demand.
+    """
+
+    def __init__(
+        self,
+        weights_a: List[torch.Tensor],
+        weights_b: List[torch.Tensor],
+        adapter_config: LoraConfig,
+    ):
+        """Stack the per-layer matrices and record the LoRA ranks.
+
+        Args:
+            weights_a: One A matrix per layer; the rank is read from dim 1.
+            weights_b: One B matrix per layer; the rank is read from dim 0.
+            adapter_config: Configuration of the adapter these weights belong to.
+        """
+        # Each rank is derived from its own list; an empty list falls back to 1.
+        self.lora_a_r = weights_a[0].size(1) if len(weights_a) > 0 else 1
+        self.lora_b_r = weights_b[0].size(0) if len(weights_b) > 0 else 1
+
+        self._use_cutlass_shrink = use_cutlass_shrink(self.lora_a_r)
+        self._is_transposed = False
+
+        # [num_layers, hidden_size, r]
+        weights_a = [orient_for_rank(w, w.size(1)).contiguous() for w in weights_a]
+        self._weights_a = torch.stack(weights_a)
+
+        # [num_layers, r, hidden_size]
+        self._weights_b = torch.stack(weights_b)
+
+        self.adapter_config = adapter_config
+
+    @property
+    def weights_a(self) -> torch.Tensor:
+        """Stacked A weights in the non-transposed orientation."""
+        if self._is_transposed:
+            self._transpose_weights()
+        return self._weights_a
+
+    @property
+    def weights_b(self) -> torch.Tensor:
+        """Stacked B weights in the non-transposed orientation."""
+        if self._is_transposed:
+            self._transpose_weights()
+        return self._weights_b
+
+    @property
+    def weights_a_t(self) -> torch.Tensor:
+        """Stacked A weights in the transposed orientation."""
+        if not self._is_transposed:
+            self._transpose_weights()
+        return self._weights_a
+
+    @property
+    def weights_b_t(self) -> torch.Tensor:
+        """Stacked B weights in the transposed orientation."""
+        if not self._is_transposed:
+            self._transpose_weights()
+        return self._weights_b
+
+    def _transpose_weights(self):
+        """Swap dims 1 and 2 of the stored tensors and toggle the orientation flag.
+
+        Both tensors are replaced by new contiguous copies, so tensors handed
+        out before the flip are not modified in place.
+        """
+        if self._use_cutlass_shrink:
+            # If we're not using the cutlass shrink, then both SGMV and BGMV use the same orientation
+            self._weights_a = self._weights_a.transpose(1, 2).contiguous()
+        self._weights_b = self._weights_b.transpose(1, 2).contiguous()
+        self._is_transposed = not self._is_transposed
+
+    @classmethod
+    def get_batch_types(cls) -> List[Type[BatchAdapterWeights]]:
+        """Return the batch classes able to consume these weights."""
+        return [BatchLoraWeights]
+
+    @classmethod
+    def load(
+        cls,
+        config: LoraConfig,
+        model: "Model",
+        module_map: Dict[str, Dict],
+        layer_type: str,
+        unused_weight_names: Set[str],
+    ) -> Optional[AdapterWeights]:
+        """Assemble the LoRA weights of one layer type from ``module_map``.
+
+        For every layer of ``layer_type`` the A and B matrices are moved to the
+        device of the base layer and to ``model.dtype``, transposed, and the
+        scaling factor is folded into B. The lists are then rank-padded and
+        sharded across ``model.process_group``.
+
+        Side effects:
+            Names of the weights that were consumed are discarded from
+            ``unused_weight_names`` (also for layers processed before an early
+            ``None`` return), and ``config.r`` is overwritten with the padded
+            rank.
+
+        Returns:
+            The loaded weights, or ``None`` as soon as a layer has no entry in
+            ``module_map``.
+        """
+        nlayers = model.get_num_layers_for_type(layer_type)
+        lora_a_list = [None] * nlayers
+        lora_b_list = [None] * nlayers
+
+        for layer_id in range(nlayers):
+            key = (layer_id, layer_type)
+            weight_name, layer = model.target_to_layer[key]
+            base_weight = layer.base_layer.linear.weight
+            base_device = base_weight.device
+
+            if weight_name not in module_map:
+                # There is no LoRA weight for this layer type in the adapter
+                return None
+
+            lora_a, lora_a_name = module_map[weight_name]["lora_A"]
+            lora_a = lora_a.to(base_device, model.dtype)
+
+            lora_b, lora_b_name = module_map[weight_name]["lora_B"]
+            lora_b = lora_b.to(base_device, model.dtype)
+
+            scale = get_scaling_factor(
+                config.lora_alpha,
+                config.r,
+                uses_rslora=config.use_rslora,
+            )
+
+            unused_weight_names.discard(lora_a_name)
+            unused_weight_names.discard(lora_b_name)
+
+            # Merge scaling factor into lora_b due to associativity of matrix multiplication:
+            # (A * B) * C = A * (B * C)
+            lora_a_list[layer_id] = lora_a.transpose(0, 1)
+            lora_b_list[layer_id] = lora_b.transpose(0, 1) * scale
+
+        # pad lora ranks to be compatible with sgmv
+        lora_a_list = [
+            pad_rank(w, dim=1, world_size=model.world_size) for w in lora_a_list
+        ]
+        lora_b_list = [
+            pad_rank(w, dim=0, world_size=model.world_size) for w in lora_b_list
+        ]
+
+        if lora_a_list:
+            # update rank if it was padded
+            # NOTE(review): the scaling factor above is computed from config.r,
+            # so a later load() call that reuses this config object would see
+            # the padded rank — confirm this is intended.
+            padded_rank = lora_a_list[0].size(1)
+            config.r = padded_rank
+
+        return LoraWeights(
+            *shard_lora_weights(
+                weights_a=lora_a_list,
+                weights_b=lora_b_list,
+                split_dim=0 if model.is_row_parallel(layer_type) else 1,
+                process_group=model.process_group,
+            ),
+            config,
+        )
+
+
+@dataclass
+class RankSegments:
+    """Per-rank kernel inputs for the batch segments whose adapters share one LoRA rank."""
+
+    # LoRA rank shared by all segments described here.
+    rank: int
+
+    # int64 tensors of weight data pointers, one entry per segment of this rank.
+    lora_a_ptr: torch.Tensor
+    lora_b_ptr: torch.Tensor
+
+    # prefill (sgmv)
+    # These four are None when the batch does not use SGMV.
+    tmp_shrink: Optional[torch.Tensor]
+    tmp_expand: Optional[torch.Tensor]
+    segment_starts: Optional[torch.Tensor]
+    segment_ends: Optional[torch.Tensor]
+
+    # decode (bgmv)
+    # None when the batch uses SGMV.
+    indices: Optional[torch.Tensor]
+
+
+@dataclass
+class BatchLoraWeights(BatchAdapterWeights):
+    """LoRA weights and kernel metadata for the adapters referenced by one batch."""
+
+    # adapter index -> stacked A / B weights of that adapter
+    lora_a: Dict[int, torch.Tensor]
+    lora_b: Dict[int, torch.Tensor]
+    # adapter index -> configuration of that adapter
+    adapter_index_configs: Dict[int, LoraConfig]
+    # LoRA rank -> kernel inputs for the segments using that rank
+    rank_data: Dict[int, RankSegments]
+    # True when the SGMV path was selected, False for BGMV
+    use_sgmv: bool
+
+    def has_adapter(self, adapter_index: int) -> bool:
+        """Return whether this batch holds LoRA weights for ``adapter_index``."""
+        return adapter_index in self.adapter_index_configs
+
+    def can_vectorize(self, pg: ProcessGroup) -> bool:
+        """Return whether every rank, divided by the group size, is at most ``MAX_RANK_CUSTOM``."""
+        return all(
+            rank_data.rank // pg.size() <= MAX_RANK_CUSTOM
+            for rank_data in self.rank_data.values()
+        )
+
+    @classmethod
+    def key(cls) -> str:
+        """Return the key under which LoRA batch data is stored."""
+        return "lora"
+
+    @classmethod
+    def load(
+        cls,
+        adapter_weights: Dict[int, AdapterWeights],
+        meta: AdapterBatchMetadata,
+        prefill: bool,
+        prefill_head_indices: Optional[torch.Tensor],
+    ) -> Optional["BatchLoraWeights"]:
+        """Build the batch-level LoRA data for the adapters listed in ``meta``.
+
+        Args:
+            adapter_weights: Adapter index -> weights. Entries that are not
+                ``LoraWeights`` (after unwrapping via ``_convert_lora``) are
+                ignored.
+            meta: Batch metadata providing ``segment_indices``,
+                ``adapter_segments`` and ``adapter_indices``.
+            prefill: Forces the SGMV path when true.
+            prefill_head_indices: When given, segment bounds are recomputed
+                from these indices instead of taken from ``meta``.
+
+        Returns:
+            The batch data, or ``None`` if no LoRA weights are present.
+        """
+        adapter_weights = {k: _convert_lora(v) for k, v in adapter_weights.items()}
+        adapter_weights = {
+            k: v for k, v in adapter_weights.items() if isinstance(v, LoraWeights)
+        }
+        if not adapter_weights:
+            return None
+
+        first_weights = next(iter(adapter_weights.values()))
+        device = first_weights.weights_a.device
+        segment_indices = meta.segment_indices
+
+        lora_a = {
+            idx: adapter_weights[idx].weights_a
+            for idx in segment_indices
+            if idx in adapter_weights
+        }
+        lora_b = {
+            idx: adapter_weights[idx].weights_b
+            for idx in segment_indices
+            if idx in adapter_weights
+        }
+
+        max_rank = max(
+            (
+                adapter_weights[idx].lora_a_r
+                for idx in segment_indices
+                if idx in adapter_weights
+            ),
+            default=0,
+        )
+
+        def _data_ptrs(attr: str) -> torch.Tensor:
+            # One pointer per segment; segments without LoRA weights get 0.
+            return torch.tensor(
+                [
+                    (
+                        getattr(adapter_weights[idx], attr).data_ptr()
+                        if idx in adapter_weights
+                        else 0
+                    )
+                    for idx in segment_indices
+                ],
+                dtype=torch.int64,
+                device=device,
+            )
+
+        # SGMV reads the regular orientation, BGMV the transposed one. Reading
+        # the *_t properties flips the tensors stored on the adapter weights.
+        use_sgmv = bool(prefill or max_rank > BGMV_MAX_RANK)
+        if use_sgmv:
+            lora_a_ptr = _data_ptrs("weights_a")
+            lora_b_ptr = _data_ptrs("weights_b")
+        else:
+            lora_a_ptr = _data_ptrs("weights_a_t")
+            lora_b_ptr = _data_ptrs("weights_b_t")
+
+        adapter_index_configs = {
+            idx: adapter_weights[idx].adapter_config
+            for idx in segment_indices
+            if idx in adapter_weights
+        }
+
+        adapter_to_segment = {v: k for k, v in enumerate(segment_indices)}
+
+        # LoRA rank -> positions (segment indexes) of the segments using it
+        rank_indices = defaultdict(list)
+        for segment_idx, adapter_idx in enumerate(segment_indices):
+            if adapter_idx not in adapter_weights:
+                continue
+            rank_indices[adapter_weights[adapter_idx].lora_a_r].append(segment_idx)
+
+        if prefill_head_indices is not None:
+            j, prefill_head_segment_starts, prefill_head_segment_ends = 1, [0], [0]
+            for head_index in prefill_head_indices:
+                # j cannot go out of bounds as that would mean there are tokens without corresponding adapters
+                if head_index < meta.adapter_segments[j]:
+                    prefill_head_segment_ends[-1] += 1
+                else:
+                    prefill_head_segment_starts.append(prefill_head_segment_ends[-1])
+                    prefill_head_segment_ends.append(prefill_head_segment_ends[-1] + 1)
+                    j += 1
+
+        rank_data = {}
+        for rank, indices in rank_indices.items():
+            tmp_shrink = None
+            tmp_expand = None
+            segment_starts = None
+            segment_ends = None
+            batch_indices = None
+
+            if use_sgmv:
+                lora_a_ptr_indices = lora_a_ptr[indices]
+                tmp_shrink, tmp_expand = get_tmp_tensors(
+                    lora_a_ptr_indices.size(0), rank, device
+                )
+                segment_starts = meta.adapter_segments[indices]
+                segment_ends = meta.adapter_segments[[i + 1 for i in indices]]
+                if prefill_head_indices is not None:
+                    for i, segment_index in enumerate(indices):
+                        segment_starts[i] = prefill_head_segment_starts[segment_index]
+                        segment_ends[i] = prefill_head_segment_ends[segment_index]
+            else:
+                # Separate name: the loop is iterating over rank_indices.items().
+                segments_of_rank = set(indices)
+                batch_indices = [
+                    adapter_to_segment[idx] for idx in meta.adapter_indices.tolist()
+                ]
+                # Batch entries belonging to another rank are marked with -1.
+                batch_indices = [
+                    idx if idx in segments_of_rank else -1 for idx in batch_indices
+                ]
+                batch_indices = torch.tensor(
+                    batch_indices, dtype=torch.int64, device=device
+                )
+
+            rank_data[rank] = RankSegments(
+                rank=rank,
+                tmp_shrink=tmp_shrink,
+                tmp_expand=tmp_expand,
+                lora_a_ptr=lora_a_ptr[indices],
+                lora_b_ptr=lora_b_ptr[indices],
+                segment_starts=segment_starts,
+                segment_ends=segment_ends,
+                indices=batch_indices,
+            )
+
+        return BatchLoraWeights(
+            lora_a=lora_a,
+            lora_b=lora_b,
+            adapter_index_configs=adapter_index_configs,
+            rank_data=rank_data,
+            use_sgmv=use_sgmv,
+        )
+
+
+def get_scaling_factor(
+    lora_alpha: int,
+    r: int,
+    uses_rslora: bool = False,
+) -> float:
+    """Return the LoRA scaling factor.
+
+    The result is ``lora_alpha / sqrt(r)`` when ``uses_rslora`` is true and
+    ``lora_alpha / r`` otherwise.
+    """
+    divisor = r**0.5 if uses_rslora else r
+    return lora_alpha / divisor
+
+
+def _convert_lora(v: AdapterWeights) -> AdapterWeights:
+    """Return ``v.lora_weights`` when ``v`` has that attribute, otherwise ``v`` itself."""
+    return getattr(v, "lora_weights", v)
diff --git a/server/text_generation_server/adapters/weights.py b/server/text_generation_server/adapters/weights.py
new file mode 100644
index 00000000..8f658756
--- /dev/null
+++ b/server/text_generation_server/adapters/weights.py
@@ -0,0 +1,158 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/adapters/weights.py
+# License:  Apache License Version 2.0, January 2004
+
+from abc import ABC, abstractclassmethod, abstractmethod
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Set, Type
+
+import torch
+
+
+@dataclass
+class AdapterBatchMetadata:
+    """Describes which adapter applies to which part of a batch."""
+
+    # [batch_size]
+    adapter_indices: torch.Tensor
+
+    # [num_adapters]
+    adapter_set: Set[int]
+
+    # [num_segments + 1]
+    # NOTE(review): presumably entries s and s + 1 are the start and end of
+    # segment s — confirm against the code that builds this tensor.
+    adapter_segments: torch.Tensor
+
+    # [num_segments]
+    # maps from segment index to adapter index, i.e.:
+    # segment_indices[s] == adapter_indices[i]
+    segment_indices: List[int]
+
+
+class AdapterWeights(ABC):
+    """Abstract base for the weights of a single adapter."""
+
+    # ``abstractclassmethod`` is deprecated; stacking ``classmethod`` on top of
+    # ``abstractmethod`` is the supported way to declare the same thing.
+    @classmethod
+    @abstractmethod
+    def get_batch_types(cls) -> List[Type["BatchAdapterWeights"]]:
+        """Return the batch classes that can consume weights of this type."""
+        pass
+
+    @property
+    def speculative_tokens(self) -> int:
+        """Number of speculative tokens; 0 unless a subclass overrides it."""
+        return 0
+
+
+class BatchAdapterWeights(ABC):
+    """Abstract base for adapter weights prepared for one batch."""
+
+    # Declared as an instance method: it takes ``self``, so it must not be
+    # wrapped as a class method.
+    @abstractmethod
+    def has_adapter(self, adapter_index: int) -> bool:
+        """Return whether this batch holds weights for ``adapter_index``."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def key(cls) -> str:
+        """Return the key under which batch data of this type is stored."""
+        pass
+
+    @classmethod
+    @abstractmethod
+    def load(
+        cls,
+        adapter_weights: Dict[int, AdapterWeights],
+        meta: "AdapterBatchMetadata",
+        prefill: bool,
+        prefill_head_indices: Optional[torch.Tensor],
+    ) -> Optional["BatchAdapterWeights"]:
+        """Build the batch data from per-adapter weights.
+
+        Args:
+            adapter_weights: Adapter index -> weights of that adapter.
+            meta: Metadata of the batch.
+            prefill: Whether the batch is a prefill batch.
+            prefill_head_indices: Optional indices; ``None`` is accepted.
+
+        Returns:
+            The batch data, or ``None`` (the return type is optional).
+        """
+        pass
+
+
+class LayerAdapterWeights:
+    """Adapter weights that apply to a particular layer."""
+
+    def __init__(self):
+        # adapter index -> weights of that adapter for this layer
+        self.adapter_weights: Dict[int, AdapterWeights] = {}
+
+    def add_adapter(self, adapter_idx: int, weights: AdapterWeights):
+        """Register ``weights`` under ``adapter_idx``, replacing any previous entry."""
+        self.adapter_weights[adapter_idx] = weights
+
+    def remove_adapter(self, adapter_idx: int):
+        """Drop the weights of ``adapter_idx``; unknown indices are ignored."""
+        self.adapter_weights.pop(adapter_idx, None)
+
+    @property
+    def max_speculative_tokens(self) -> int:
+        """Largest ``speculative_tokens`` among the registered adapters.
+
+        Returns 0 when no adapter is registered instead of raising
+        ``ValueError`` from ``max()`` on an empty sequence.
+        """
+        return max(
+            (
+                adapter_weights.speculative_tokens
+                for adapter_weights in self.adapter_weights.values()
+            ),
+            default=0,
+        )
+
+    def is_empty(self) -> bool:
+        """Return whether no adapter is registered for this layer."""
+        return len(self.adapter_weights) == 0
+
+    def get_data(
+        self,
+        meta: AdapterBatchMetadata,
+        prefill: bool,
+        prefill_head_indices: Optional[torch.Tensor],
+    ) -> Dict[str, BatchAdapterWeights]:
+        """Build the batch data of every batch type used by the registered adapters.
+
+        Adapters are grouped by the batch types they report through
+        ``get_batch_types``; each batch type then loads its group.
+
+        Returns:
+            Batch type key -> loaded batch data. Batch types whose ``load``
+            returns ``None`` are left out.
+        """
+        # bucket adapters by batch class
+        adapter_batch_types: Dict[
+            Type[BatchAdapterWeights], Dict[int, AdapterWeights]
+        ] = defaultdict(dict)
+        for adapter_index, adapter_weights in self.adapter_weights.items():
+            for batch_type in adapter_weights.get_batch_types():
+                adapter_batch_types[batch_type][adapter_index] = adapter_weights
+
+        batch_data = {}
+        for batch_type, adapter_weights in adapter_batch_types.items():
+            batched_weights = batch_type.load(
+                adapter_weights, meta, prefill, prefill_head_indices
+            )
+            if batched_weights is not None:
+                batch_data[batch_type.key()] = batched_weights
+        return batch_data
+
+
+@dataclass
+class AdapterBatchData:
+    """Batch metadata together with the per-layer adapter batch weights."""
+
+    meta: AdapterBatchMetadata
+
+    # layer type -> adapter type -> batch weight data
+    data: Dict[str, Dict[str, BatchAdapterWeights]]
+
+    prefill: bool
+
+    @staticmethod
+    def from_meta(
+        meta: AdapterBatchMetadata,
+        weights: Dict[str, LayerAdapterWeights],
+        prefill: bool,
+        prefill_head_indices: Optional[torch.Tensor],
+    ) -> "AdapterBatchData":
+        """Collect the batch data of every non-empty layer.
+
+        ``prefill_head_indices`` is only forwarded to the ``"lm_head"`` layer;
+        every other layer receives ``None``.
+        """
+        per_layer = {
+            layer_name: layer_weights.get_data(
+                meta,
+                prefill,
+                prefill_head_indices if layer_name == "lm_head" else None,
+            )
+            for layer_name, layer_weights in weights.items()
+            if not layer_weights.is_empty()
+        }
+        return AdapterBatchData(meta=meta, data=per_layer, prefill=prefill)
+
+    def ranks(self) -> Set[int]:
+        """Return every LoRA rank found in the ``"lora"`` data of any layer."""
+        # TODO(travis): refactor to be less coupled to lora implementation
+        found = set()
+        lora_batches = (layer_data.get("lora") for layer_data in self.data.values())
+        for lora_data in lora_batches:
+            if lora_data is not None:
+                found.update(
+                    rank_data.rank for rank_data in lora_data.rank_data.values()
+                )
+        return found
+
+    def layer_names(self) -> Set[str]:
+        """Return the names of the layers that have batch data."""
+        return set(self.data)
+
+    def adapter_keys(self) -> Set[str]:
+        """Return the adapter type keys present in any layer."""
+        return set().union(*(layer_data.keys() for layer_data in self.data.values()))
+
+    @property
+    def max_rank(self) -> int:
+        """Largest rank reported by ``ranks()``, or 0 when there is none."""
+        return max(self.ranks(), default=0)
diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py
index e3fda07f..68ae95dd 100644
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -17,6 +17,11 @@ class Quantization(str, Enum):
     bitsandbytes_nf4 = "bitsandbytes-nf4"
     bitsandbytes_fp4 = "bitsandbytes-fp4"
     gptq = "gptq"
+    awq = "awq"
+    eetq = "eetq"
+    exl2 = "exl2"
+    fp8 = "fp8"
+    marlin = "marlin"
 
 
 class Dtype(str, Enum):
@@ -30,12 +35,15 @@ def serve(
     revision: Optional[str] = None,
     sharded: bool = False,
     quantize: Optional[Quantization] = None,
+    speculate: Optional[int] = None,
     dtype: Optional[Dtype] = None,
     trust_remote_code: bool = False,
     uds_path: Path = "/tmp/text-generation-server",
     logger_level: str = "INFO",
     json_output: bool = False,
     otlp_endpoint: Optional[str] = None,
+    otlp_service_name: str = "text-generation-inference.server",
+    max_input_tokens: Optional[int] = None,
 ):
     if sharded:
         assert (
@@ -69,17 +77,43 @@ def serve(
 
     # Setup OpenTelemetry distributed tracing
     if otlp_endpoint is not None:
-        setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint)
+        setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint)
+
+    lora_adapter_ids = os.getenv("LORA_ADAPTERS", None)
+
+    # split on comma and strip whitespace
+    lora_adapter_ids = (
+        [x.strip() for x in lora_adapter_ids.split(",")] if lora_adapter_ids else []
+    )
+
+    if len(lora_adapter_ids) > 0:
+        logger.warning(
+            f"LoRA adapters are enabled. This is an experimental feature and may not work as expected."
+        )
 
     # Downgrade enum into str for easier management later on
     quantize = None if quantize is None else quantize.value
     dtype = None if dtype is None else dtype.value
-    if dtype is not None and quantize is not None:
+    if dtype is not None and quantize not in {
+        None,
+        "bitsandbytes",
+        "bitsandbytes-nf4",
+        "bitsandbytes-fp4",
+    }:
         raise RuntimeError(
             "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
         )
     server.serve(
-        model_id, revision, sharded, quantize, dtype, trust_remote_code, uds_path
+        model_id,
+        lora_adapter_ids,
+        revision,
+        sharded,
+        quantize,
+        speculate,
+        dtype,
+        trust_remote_code,
+        uds_path,
+        max_input_tokens,
     )
 
 
@@ -92,6 +126,7 @@ def download_weights(
     logger_level: str = "INFO",
     json_output: bool = False,
     trust_remote_code: bool = False,
+    merge_lora: bool = False,
 ):
     # Remove default handler
     logger.remove()
@@ -114,7 +149,7 @@ def download_weights(
         logger.info("Files are already present on the host. " "Skipping download.")
         return
     # Local files not found
-    except (utils.LocalEntryNotFoundError, FileNotFoundError):
+    except (utils.LocalEntryNotFoundError, FileNotFoundError, utils.EntryNotFoundError):
         pass
 
     is_local_model = (Path(model_id).exists() and Path(model_id).is_dir()) or os.getenv(
@@ -122,12 +157,53 @@ def download_weights(
     ) is not None
 
     if not is_local_model:
+        # TODO: maybe reverse the default value of merge_lora?
+        # currently by default we don't merge the weights with the base model
+        if merge_lora:
+            try:
+                adapter_config_filename = hf_hub_download(
+                    model_id, revision=revision, filename="adapter_config.json"
+                )
+                utils.download_and_unload_peft(
+                    model_id, revision, trust_remote_code=trust_remote_code
+                )
+                is_local_model = True
+                utils.weight_files(model_id, revision, extension)
+                return
+            except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+                pass
+        else:
+            try:
+                utils.peft.download_peft(
+                    model_id, revision, trust_remote_code=trust_remote_code
+                )
+            except Exception:
+                pass
+
         try:
-            adapter_config_filename = hf_hub_download(model_id, revision=revision, filename="adapter_config.json")
-            utils.download_and_unload_peft(model_id, revision, trust_remote_code=trust_remote_code)
-            is_local_model = True
-            utils.weight_files(model_id, revision, extension)
-            return
+            import json
+
+            config = hf_hub_download(
+                model_id, revision=revision, filename="config.json"
+            )
+            with open(config, "r") as f:
+                config = json.load(f)
+
+            base_model_id = config.get("base_model_name_or_path", None)
+            if base_model_id and base_model_id != model_id:
+                try:
+                    logger.info(f"Downloading parent model {base_model_id}")
+                    download_weights(
+                        model_id=base_model_id,
+                        revision="main",
+                        extension=extension,
+                        auto_convert=auto_convert,
+                        logger_level=logger_level,
+                        json_output=json_output,
+                        trust_remote_code=trust_remote_code,
+                    )
+                except Exception:
+                    pass
         except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
             pass
 
@@ -144,13 +220,53 @@ def download_weights(
             if not extension == ".safetensors" or not auto_convert:
                 raise e
 
+    elif (Path(model_id) / "adapter_config.json").exists():
+        # Try to load as a local PEFT model
+        try:
+            utils.download_and_unload_peft(
+                model_id, revision, trust_remote_code=trust_remote_code
+            )
+            utils.weight_files(model_id, revision, extension)
+            return
+        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+            pass
+    elif (Path(model_id) / "config.json").exists():
+        # Try to load as a local Medusa model
+        try:
+            import json
+
+            config = Path(model_id) / "config.json"
+            with open(config, "r") as f:
+                config = json.load(f)
+
+            base_model_id = config.get("base_model_name_or_path", None)
+            if base_model_id:
+                try:
+                    logger.info(f"Downloading parent model {base_model_id}")
+                    download_weights(
+                        model_id=base_model_id,
+                        revision="main",
+                        extension=extension,
+                        auto_convert=auto_convert,
+                        logger_level=logger_level,
+                        json_output=json_output,
+                        trust_remote_code=trust_remote_code,
+                    )
+                except Exception:
+                    pass
+        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+            pass
+
     # Try to see if there are local pytorch weights
     try:
         # Get weights for a local model, a hub cached model and inside the WEIGHTS_CACHE_OVERRIDE
-        local_pt_files = utils.weight_files(model_id, revision, ".bin")
+        try:
+            local_pt_files = utils.weight_files(model_id, revision, ".bin")
+        except Exception:
+            local_pt_files = utils.weight_files(model_id, revision, ".pt")
 
     # No local pytorch weights
-    except utils.LocalEntryNotFoundError:
+    except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
         if extension == ".safetensors":
             logger.warning(
                 f"No safetensors weights found for model {model_id} at revision {revision}. "
@@ -163,6 +279,13 @@ def download_weights(
         local_pt_files = utils.download_weights(pt_filenames, model_id, revision)
 
     if auto_convert:
+        if not trust_remote_code:
+            logger.warning(
+                f"🚨🚨BREAKING CHANGE in 2.0🚨🚨: Safetensors conversion is disabled without `--trust-remote-code` because "
+                f"Pickle files are unsafe and can essentially contain remote code execution! "
+                f"Please check for more information here: https://huggingface.co/docs/text-generation-inference/basic_tutorials/safety",
+            )
+
         logger.warning(
             f"No safetensors weights found for model {model_id} at revision {revision}. "
             f"Converting PyTorch weights to safetensors."
@@ -177,8 +300,12 @@ def download_weights(
             import transformers
             import json
 
-
-            config_filename = hf_hub_download(model_id, revision=revision, filename="config.json")
+            if is_local_model:
+                config_filename = os.path.join(model_id, "config.json")
+            else:
+                config_filename = hf_hub_download(
+                    model_id, revision=revision, filename="config.json"
+                )
             with open(config_filename, "r") as f:
                 config = json.load(f)
             architecture = config["architectures"][0]
@@ -187,7 +314,6 @@ def download_weights(
 
             # Name for this varible depends on transformers version.
             discard_names = getattr(class_, "_tied_weights_keys", [])
-            discard_names.extend(getattr(class_, "_keys_to_ignore_on_load_missing", []))
 
         except Exception as e:
             discard_names = []
@@ -215,7 +341,7 @@ def quantize(
         logger_level=logger_level,
         json_output=json_output,
     )
-    from text_generation_server.utils.gptq.quantize import quantize
+    from text_generation_server.layers.gptq.quantize import quantize
 
     quantize(
         model_id=model_id,
diff --git a/server/text_generation_server/interceptor.py b/server/text_generation_server/interceptor.py
index 725105f3..57df1725 100644
--- a/server/text_generation_server/interceptor.py
+++ b/server/text_generation_server/interceptor.py
@@ -23,6 +23,10 @@ class ExceptionInterceptor(AsyncServerInterceptor):
             method_name = method_name.split("/")[-1]
             logger.exception(f"Method {method_name} encountered an error.")
 
+            # A RuntimeError cannot be recovered from: raise SystemExit directly (`exit()` is a `site` helper meant for interactive use)
+            if isinstance(err, RuntimeError):
+                raise SystemExit(1)
+
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
 
diff --git a/server/text_generation_server/layers/__init__.py b/server/text_generation_server/layers/__init__.py
new file mode 100644
index 00000000..32c8d121
--- /dev/null
+++ b/server/text_generation_server/layers/__init__.py
@@ -0,0 +1,20 @@
+from text_generation_server.layers.tensor_parallel import (
+    TensorParallelColumnLinear,
+    TensorParallelRowLinear,
+    TensorParallelEmbedding,
+)
+from text_generation_server.layers.linear import (
+    get_linear,
+    FastLinear,
+)
+from text_generation_server.layers.speculative import SpeculativeHead
+
+# Just to add the `load` methods.
+from text_generation_server.layers.layernorm import load_layer_norm
+from text_generation_server.layers.conv import load_conv2d
+
+from text_generation_server.layers.lora import (
+    LoraLinear,
+    TensorParallelMultiAdapterLinear,
+    TensorParallelAdapterRowLinear,
+)
diff --git a/server/text_generation_server/layers/attention/__init__.py b/server/text_generation_server/layers/attention/__init__.py
new file mode 100644
index 00000000..c8bccefe
--- /dev/null
+++ b/server/text_generation_server/layers/attention/__init__.py
@@ -0,0 +1,15 @@
+from text_generation_server.utils.import_utils import SYSTEM
+import os
+
+from .common import Seqlen
+
+if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
+    raise ImportError("`USE_FLASH_ATTENTION` is false.")
+if SYSTEM == "cuda":
+    from .cuda import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
+elif SYSTEM == "rocm":
+    from .rocm import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
+elif SYSTEM == "ipex":
+    from .ipex import attention, paged_attention, reshape_and_cache, SUPPORTS_WINDOWING
+else:
+    raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention")
diff --git a/server/text_generation_server/layers/attention/common.py b/server/text_generation_server/layers/attention/common.py
new file mode 100644
index 00000000..bd0717ce
--- /dev/null
+++ b/server/text_generation_server/layers/attention/common.py
@@ -0,0 +1,44 @@
+from dataclasses import dataclass
+from text_generation_server.models.globals import FLASH_DECODING
+import torch
+from typing import Optional
+
+
+if FLASH_DECODING:
+
+    @dataclass
+    class Seqlen:
+        input_lengths: torch.Tensor
+        cu_seqlen_q: Optional[torch.Tensor]
+        cu_seqlen_k: Optional[torch.Tensor]
+
+        def __init__(self, input_lengths):
+            self.input_lengths = input_lengths
+            device = self.input_lengths.device
+            shape = self.input_lengths.shape
+            cu_seqlen_q = torch.arange(
+                shape[0] + 1,
+                device=device,
+                dtype=torch.int32,
+            )
+            cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32)
+            # cuda graphs don't like this and this is necessary to clamp within mistral
+            # Although FA2 might not want the clamping
+            # cu_seqlen_k[0] = 0
+            torch.cumsum(self.input_lengths, -1, out=cu_seqlen_k[1:])
+
+            self.cu_seqlen_q = cu_seqlen_q
+            self.cu_seqlen_k = cu_seqlen_k
+
+        def clamp(self, max):
+            # Flash decoding doesn't need to clamp
+            return self
+
+else:
+
+    @dataclass
+    class Seqlen:
+        input_lengths: torch.Tensor
+
+        def clamp(self, max):
+            return Seqlen(torch.clamp(self.input_lengths, max=max))
diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py
new file mode 100644
index 00000000..94b69899
--- /dev/null
+++ b/server/text_generation_server/layers/attention/cuda.py
@@ -0,0 +1,292 @@
+import torch
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import FLASH_DECODING, BLOCK_SIZE
+from text_generation_server.layers.attention import Seqlen
+
+major, minor = torch.cuda.get_device_capability()
+is_sm75 = major == 7 and minor == 5
+_PARTITION_SIZE = 512
+
+try:
+    from vllm._C import cache_ops
+    from vllm._C import ops
+except Exception as e:
+    raise ImportError(
+        f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
+    )
+
+
+def reshape_and_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slots: torch.Tensor,
+):
+    if FLASH_DECODING:
+        shape = key_cache.shape
+        key_cache.view(-1, shape[-2], shape[-1])[slots] = key
+        value_cache.view(-1, shape[-2], shape[-1])[slots] = value
+    else:
+        cache_ops.reshape_and_cache(
+            key, value, key_cache, value_cache, slots, "auto", 1.0
+        )
+
+
+def paged_attention(
+    out: torch.Tensor,
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    kv_head_mapping: torch.Tensor,
+    softmax_scale: float,
+    block_tables: torch.Tensor,
+    seqlen: Seqlen,
+    max_s: int,
+):
+    # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
+    # Copyright 2023 The vLLM team. All rights
+    # reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    #
+
+    # value_cache => [num_blocks, num_heads, head_size, block_size]
+    # block_size = value_cache.shape[3]
+    block_size = BLOCK_SIZE
+    num_seqs, num_heads, head_size = query.shape
+    max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+
+    # NOTE(woosuk): We use a simple heuristic to decide whether to use
+    # PagedAttention V1 or V2. If the number of partitions is 1, we use
+    # V1 to avoid the overhead of reduction. Also, if the number of
+    # sequences or heads is large, we use V1 since there is enough work
+    # to parallelize.
+    if FLASH_DECODING:
+        max_q = 1
+        max_k = max_s
+        import flash_attn_2_cuda
+
+        # TODO fixme when flash contains the fix.
+        # Number of splits is not correctly handled
+        # by the current path
+        # https://github.com/Dao-AILab/flash-attention/blob/320fb59487658f033f56711efd3d61b7c7a6f8f3/csrc/flash_attn/flash_api.cpp#L577
+        # This fails because we're using causal, therefore window_right is set to 0 and the split logic is never applied.
+        out2 = flash_attn_2_cuda.varlen_fwd(
+            query,
+            key_cache,
+            value_cache,
+            None,
+            seqlen.cu_seqlen_q,
+            seqlen.cu_seqlen_k,
+            None,
+            block_tables,
+            None,
+            max_q,
+            max_k,
+            0.0,  # dropout
+            softmax_scale,
+            False,  # zero_tensors
+            True,  # causal
+            -1,  # Window_left
+            -1,  # Window right
+            False,  # return softmax
+            None,  # generator
+        )
+        return out2[0]
+    else:
+        input_lengths = seqlen.input_lengths
+        from vllm._C import ops
+
+        use_v1 = max_s <= 8192 and (
+            max_num_partitions == 1 or num_seqs * num_heads > 512
+        )
+        if use_v1:
+            ops.paged_attention_v1(
+                out,
+                query,
+                key_cache,
+                value_cache,
+                kv_head_mapping,
+                softmax_scale,
+                block_tables,
+                input_lengths,
+                block_size,
+                max_s,
+                None,
+                "auto",
+                1.0,
+            )
+        else:
+            # Run PagedAttention V2.
+            assert _PARTITION_SIZE % block_size == 0
+            tmp_output = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions, head_size),
+                dtype=out.dtype,
+                device=out.device,
+            )
+            exp_sums = torch.empty(
+                size=(num_seqs, num_heads, max_num_partitions),
+                dtype=torch.float32,
+                device=out.device,
+            )
+            max_logits = torch.empty_like(exp_sums)
+
+            ops.paged_attention_v2(
+                out,
+                exp_sums,
+                max_logits,
+                tmp_output,
+                query,
+                key_cache,
+                value_cache,
+                kv_head_mapping,
+                softmax_scale,
+                block_tables,
+                input_lengths,
+                block_size,
+                max_s,
+                None,
+                "auto",
+                1.0,
+            )
+    return out
+
+
+try:
+    import flash_attn_2_cuda
+
+    V2 = True
+except ImportError:  # flash-attn v2 extension missing: try the older extension
+    try:
+        import flash_attn_cuda
+
+        V2 = False
+    except ImportError as e:  # neither extension is importable: explain why
+        if major >= 8:
+            architecture_suffix = f"-{SYSTEM}"
+            raise ImportError(
+                "Flash Attention V2 is not installed.\n"
+                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
+                f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
+            ) from e
+        elif is_sm75:
+            raise ImportError(
+                "Flash Attention is not installed.\n"
+                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
+                "or install flash attention with `cd server && make install install-flash-attention`"
+            ) from e
+        else:
+            raise ImportError(
+                f"GPU with CUDA capability {major} {minor} is not supported"
+            ) from e
+
+
+SUPPORTS_WINDOWING = V2
+if V2:
+
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+        causal=True,
+    ):
+        if window_size_left <= 0 and window_size_left != -1:
+            raise ValueError("`window_size_left` must be > 0 or -1")
+        return flash_attn_2_cuda.varlen_fwd(
+            q,
+            k,
+            v,
+            out,
+            cu_seqlens,
+            cu_seqlens,
+            None,
+            None,
+            None,
+            max_s,
+            max_s,
+            0.0,
+            softmax_scale,
+            False,
+            causal,
+            window_size_left,
+            0,
+            False,
+            None,
+        )
+
+else:
+
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+    ):
+        if window_size_left != -1:
+            raise NotImplementedError(
+                "window_size_left is only available with flash attn v2"
+            )
+
+        # Flash attention v1 requires q, k and v to have the same number of heads
+        if k.shape[1] != q.shape[1]:
+            # MQA expand
+            if k.shape[1] == 1:
+                k = k.expand(-1, q.shape[1], -1)
+            # Grouped attention reshape
+            else:
+                original_shape = k.shape
+                k = (
+                    k.unsqueeze(2)
+                    .expand(-1, -1, q.shape[1] // k.shape[1], -1)
+                    .reshape(original_shape[0], -1, original_shape[2])
+                )
+        if v.shape[1] != q.shape[1]:
+            # MQA expand
+            if v.shape[1] == 1:
+                v = v.expand(-1, q.shape[1], -1)
+            # Grouped attention reshape
+            else:
+                original_shape = v.shape
+                v = (
+                    v.unsqueeze(2)
+                    .expand(-1, -1, q.shape[1] // v.shape[1], -1)
+                    .reshape(original_shape[0], -1, original_shape[2])
+                )
+
+        return flash_attn_cuda.fwd(
+            q,
+            k,
+            v,
+            out,
+            cu_seqlens,
+            cu_seqlens,
+            max_s,
+            max_s,
+            0.0,
+            softmax_scale,
+            False,
+            True,
+            False,
+            0,
+            None,
+        )
diff --git a/server/text_generation_server/layers/attention/flash_attn_triton.py b/server/text_generation_server/layers/attention/flash_attn_triton.py
new file mode 100644
index 00000000..3fe32231
--- /dev/null
+++ b/server/text_generation_server/layers/attention/flash_attn_triton.py
@@ -0,0 +1,816 @@
+#!/usr/bin/env python
+"""
+Fused Attention
+===============
+
+This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao
+(https://tridao.me/publications/flash2/flash2.pdf)
+Credits: OpenAI kernel team, AMD ML Frameworks Triton team
+
+Features supported:
+
+1) Fwd with causal masking
+2) Any sequence lengths without padding (currently fwd kernel only)
+3) Support for different sequence lengths for q and k
+4) Nested tensor API currently does not support dropout or bias.
+
+Not currently supported:
+
+1) Non power of two head dims
+
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+torch_dtype: tl.constexpr = torch.float16
+
+
+@triton.jit
+def cdiv_fn(x, y):
+    return (x + y - 1) // y
+
+
+@triton.jit
+def max_fn(x, y):
+    return tl.math.max(x, y)
+
+
+@triton.jit
+def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride):
+    ms = tl.arange(0, m)
+    ns = tl.arange(0, n)
+    return philox_offset + ms[:, None] * stride + ns[None, :]
+
+
+@triton.jit
+def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride):
+    rng_offsets = dropout_offsets(
+        philox_seed, philox_offset, dropout_p, m, n, stride
+    ).to(tl.uint32)
+    # TODO: use tl.randint for better performance
+    return tl.rand(philox_seed, rng_offsets)
+
+
+@triton.jit
+def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride):
+    rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride)
+    rng_keep = rng_output > dropout_p
+    return rng_keep
+
+
+@triton.jit
+def load_fn(block_ptr, first, second, pad):
+    if first and second:
+        tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad)
+    elif first:
+        tensor = tl.load(block_ptr, boundary_check=(0,), padding_option=pad)
+    elif second:
+        tensor = tl.load(block_ptr, boundary_check=(1,), padding_option=pad)
+    else:
+        tensor = tl.load(block_ptr)
+    return tensor
+
+
+@triton.jit
+def _attn_fwd_inner(
+    acc,
+    l_i,
+    m_i,
+    q,
+    K_block_ptr,
+    V_block_ptr,
+    start_m,
+    actual_seqlen_k,
+    dropout_p,
+    philox_seed,
+    batch_philox_offset,
+    encoded_softmax_block_ptr,
+    block_min,
+    block_max,
+    offs_n_causal,
+    masked_blocks,
+    n_extra_tokens,
+    bias_ptr,
+    IS_CAUSAL: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    OFFS_M: tl.constexpr,
+    OFFS_N: tl.constexpr,
+    PRE_LOAD_V: tl.constexpr,
+    MASK_STEPS: tl.constexpr,
+    ENABLE_DROPOUT: tl.constexpr,
+    RETURN_ENCODED_SOFTMAX: tl.constexpr,
+    PADDED_HEAD: tl.constexpr,
+):
+    # loop over k, v, and update accumulator
+    for start_n in range(block_min, block_max, BLOCK_N):
+        # For padded blocks, we will overrun the tensor size if
+        # we load all BLOCK_N. For others, the blocks are all within range.
+        k = load_fn(
+            K_block_ptr,
+            PADDED_HEAD,
+            MASK_STEPS and (n_extra_tokens != 0),
+            "zero",
+        )
+        if PRE_LOAD_V:
+            v = load_fn(
+                V_block_ptr,
+                MASK_STEPS and (n_extra_tokens != 0),
+                PADDED_HEAD,
+                "zero",
+            )
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        # We start from end of seqlen_k so only the first iteration would need
+        # to be checked for padding if it is not a multiple of block_n
+        # TODO: This can be optimized to only be true for the padded block.
+        if MASK_STEPS:  # noqa: SIM102
+            # If this is the last block / iteration, we want to
+            # mask if the sequence length is not a multiple of block size
+            # a solution is to always do BLOCK_M // BLOCK_N + 1 steps
+            # if not is_modulo_mn. last step might get wasted but that is okay.
+            # check if this masking works for that case.
+            if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0):
+                boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32)
+                size_n = start_n + OFFS_N[None, :]
+                mask = size_n < boundary_m[:, None]
+                qk = tl.where(mask, qk, float("-inf"))
+        if IS_CAUSAL:
+            causal_boundary = start_n + offs_n_causal
+            causal_mask = OFFS_M[:, None] >= causal_boundary[None, :]
+            qk = tl.where(causal_mask, qk, float("-inf"))
+        # -- compute qk ----
+        qk += tl.dot(q, k)
+        if bias_ptr is not None:
+            bias = load_fn(
+                bias_ptr, False, MASK_STEPS and (n_extra_tokens != 0), "zero"
+            )
+            # While bias is added after multiplying qk with sm_scale, our
+            # optimization to use 2^x instead of e^x results in an additional
+            # scale factor of log2(e) which we must also multiply the bias with.
+            qk += bias * 1.44269504089
+        m_ij = tl.maximum(m_i, tl.max(qk, 1))
+        qk = qk - m_ij[:, None]
+        p = tl.math.exp2(qk)
+
+        # CAVEAT: Must update l_ij before applying dropout
+        l_ij = tl.sum(p, 1)
+        if ENABLE_DROPOUT:
+            philox_offset = (
+                batch_philox_offset
+                + start_m * BLOCK_M * actual_seqlen_k
+                + start_n
+                - BLOCK_N
+            )
+            keep = dropout_mask(
+                philox_seed,
+                philox_offset,
+                dropout_p,
+                BLOCK_M,
+                BLOCK_N,
+                actual_seqlen_k,
+            )
+            if RETURN_ENCODED_SOFTMAX:
+                tl.store(
+                    encoded_softmax_block_ptr,
+                    tl.where(keep, p, -p).to(encoded_softmax_block_ptr.type.element_ty),
+                )
+            p = tl.where(keep, p, 0.0)
+        elif RETURN_ENCODED_SOFTMAX:
+            tl.store(
+                encoded_softmax_block_ptr,
+                p.to(encoded_softmax_block_ptr.type.element_ty),
+            )
+        # -- update output accumulator --
+        alpha = tl.math.exp2(m_i - m_ij)
+        acc = acc * alpha[:, None]
+        if not PRE_LOAD_V:
+            v = load_fn(
+                V_block_ptr,
+                MASK_STEPS and (n_extra_tokens != 0),
+                PADDED_HEAD,
+                "zero",
+            )
+        # -- update m_i and l_i
+        l_i = l_i * alpha + l_ij
+        # update m_i and l_i
+        m_i = m_ij
+        acc += tl.dot(p.to(V_block_ptr.type.element_ty), v)
+        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+        if bias_ptr is not None:
+            bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N))
+        if RETURN_ENCODED_SOFTMAX:
+            encoded_softmax_block_ptr = tl.advance(
+                encoded_softmax_block_ptr, (0, BLOCK_N)
+            )
+    return acc, l_i, m_i
+
+
+@triton.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_M": 256,
+                "BLOCK_N": 64,
+                "waves_per_eu": 2,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 128,
+                "waves_per_eu": 2,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 256,
+                "BLOCK_N": 128,
+                "waves_per_eu": 2,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "waves_per_eu": 3,
+                "PRE_LOAD_V": True,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "waves_per_eu": 3,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 64,
+                "waves_per_eu": 4,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 32,
+                "BLOCK_N": 32,
+                "waves_per_eu": 4,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=8,
+        ),
+        # TODO: This config fails with head_size not pow2 with data mismatches.
+        #    triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1,
+        #                   'PRE_LOAD_V': False}, num_stages=1, num_warps=4),
+        triton.Config(
+            {
+                "BLOCK_M": 16,
+                "BLOCK_N": 16,
+                "waves_per_eu": 1,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "waves_per_eu": 1,
+                "PRE_LOAD_V": False,
+            },
+            num_stages=1,
+            num_warps=4,
+        ),
+    ],
+    key=["IS_CAUSAL", "dropout_p", "BLOCK_DMODEL"],
+)
+@triton.jit
+def attn_fwd(
+    Q,
+    K,
+    V,
+    bias,
+    sm_scale,
+    L,
+    Out,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vk,
+    stride_vn,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_on,
+    stride_bz,
+    stride_bh,
+    stride_bm,
+    stride_bn,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    dropout_p,
+    philox_seed,
+    philox_offset_base,
+    encoded_softmax,
+    HQ: tl.constexpr,
+    HK: tl.constexpr,
+    ACTUAL_BLOCK_DMODEL: tl.constexpr,
+    MAX_SEQLENS_Q: tl.constexpr,
+    MAX_SEQLENS_K: tl.constexpr,
+    VARLEN: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    PRE_LOAD_V: tl.constexpr,
+    BIAS_TYPE: tl.constexpr,
+    ENABLE_DROPOUT: tl.constexpr,
+    RETURN_ENCODED_SOFTMAX: tl.constexpr,
+):
+    start_m = tl.program_id(0)
+    off_h_q = tl.program_id(1)
+    off_z = tl.program_id(2)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    if VARLEN:
+        cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z)
+        cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1)
+        seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start
+        # We have a one-size-fits-all grid in id(0). Some seqlens might be too
+        # small for all start_m so for those we return early.
+        if start_m * BLOCK_M > seqlen_q:
+            return
+        cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z)
+        cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1)
+        seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start
+    else:
+        cu_seqlens_q_start = 0
+        cu_seqlens_k_start = 0
+        seqlen_q = MAX_SEQLENS_Q
+        seqlen_k = MAX_SEQLENS_K
+
+    # Now we compute whether we need to exit early due to causal masking.
+    # This is because for seqlen_q > seqlen_k, M rows of the attn scores
+    # are completely masked, resulting in 0s written to the output, and
+    # inf written to LSE. We don't need to do any GEMMs in this case.
+    # This block of code determines what N is, and if this WG is operating
+    # on those M rows.
+    n_blocks = cdiv_fn(seqlen_k, BLOCK_N)
+    if IS_CAUSAL:
+        # If seqlen_q == seqlen_k, the attn scores are a square matrix.
+        # If seqlen_q != seqlen_k, attn scores are rectangular which means
+        # the causal mask boundary is bottom right aligned, and ends at either
+        # the top edge (seqlen_q < seqlen_k) or left edge.
+        # This captures the decrease in n_blocks if we have a rectangular attn
+        # matrix
+        n_blocks_seqlen = cdiv_fn(
+            (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N
+        )
+        # This is what adjusts the block_max for the current WG, only
+        # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks
+        n_blocks = min(n_blocks, n_blocks_seqlen)
+        # If we have no blocks after adjusting for seqlen deltas, this WG is
+        # part of the blocks that are all 0. We exit early.
+        if n_blocks <= 0:
+            o_offset = (
+                off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh
+            )
+            O_block_ptr = tl.make_block_ptr(
+                base=Out + o_offset,
+                shape=(seqlen_q, BLOCK_DMODEL),
+                strides=(stride_om, stride_on),
+                offsets=(start_m * BLOCK_M, 0),
+                block_shape=(BLOCK_M, BLOCK_DMODEL),
+                order=(1, 0),
+            )
+            acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty)
+            # We still need to write 0s to the result
+            # tl.store(O_block_ptr,
+            # acc.to(Out.type.element_ty), boundary_check=(0,1))
+            # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q
+            #          + offs_m
+            # We store inf to LSE, not -inf because in the bwd pass,
+            # we subtract this
+            # from qk which makes it -inf, such that exp(qk - inf) = 0
+            # for these masked blocks.
+            # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32)
+            # tl.store(l_ptrs, l)
+            # TODO: Should dropout and return encoded softmax be handled here?
+            return
+
+    # If MQA / GQA, set the K and V head offsets appropriately.
+    GROUP_SIZE: tl.constexpr = HQ // HK
+    if GROUP_SIZE != 1:
+        off_h_k = off_h_q // GROUP_SIZE
+    else:
+        off_h_k = off_h_q
+
+    n_extra_tokens = 0
+    if seqlen_k < BLOCK_N:
+        n_extra_tokens = BLOCK_N - seqlen_k
+    elif seqlen_k % BLOCK_N:
+        n_extra_tokens = seqlen_k % BLOCK_N
+    PADDED_HEAD: tl.constexpr = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL
+
+    # Compute pointers for all the tensors used in this kernel.
+    q_offset = off_z * stride_qz + off_h_q * stride_qh + cu_seqlens_q_start * stride_qm
+    Q_block_ptr = tl.make_block_ptr(
+        base=Q + q_offset,
+        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_qm, stride_qk),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    k_offset = off_z * stride_kz + off_h_k * stride_kh + cu_seqlens_k_start * stride_kn
+    K_block_ptr = tl.make_block_ptr(
+        base=K + k_offset,
+        shape=(ACTUAL_BLOCK_DMODEL, seqlen_k),
+        strides=(stride_kk, stride_kn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_DMODEL, BLOCK_N),
+        order=(0, 1),
+    )
+    v_offset = off_z * stride_vz + off_h_k * stride_vh + cu_seqlens_k_start * stride_vk
+    V_block_ptr = tl.make_block_ptr(
+        base=V + v_offset,
+        shape=(seqlen_k, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_vk, stride_vn),
+        offsets=(0, 0),
+        block_shape=(BLOCK_N, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    if BIAS_TYPE != 0:
+        bias_ptr = tl.make_block_ptr(
+            base=bias + off_h_q * stride_bh,
+            shape=(seqlen_q, seqlen_k),
+            strides=(stride_bm, stride_bn),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_N),
+            order=(1, 0),
+        )
+    else:
+        bias_ptr = None
+    if ENABLE_DROPOUT:
+        batch_philox_offset = (
+            philox_offset_base + (off_z * HQ + off_h_q) * seqlen_q * seqlen_k
+        )
+    else:
+        batch_philox_offset = 0
+    # We can ask to return the dropout mask without actually doing any dropout.
+    # In this case, we return an invalid pointer to indicate the mask is not
+    # valid.
+    # TODO: Fix encoded softmax. It currently uses just h_q in the base offset.
+    if RETURN_ENCODED_SOFTMAX:
+        encoded_softmax_block_ptr = tl.make_block_ptr(
+            base=encoded_softmax + off_h_q * seqlen_q * seqlen_k,
+            shape=(seqlen_q, seqlen_k),
+            strides=(seqlen_k, 1),
+            offsets=(start_m * BLOCK_M, 0),
+            block_shape=(BLOCK_M, BLOCK_N),
+            order=(1, 0),
+        )
+    else:
+        encoded_softmax_block_ptr = 0
+    # initialize pointer to m and l
+    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    # scale sm_scale by log_2(e) and use 2^x in the loop as we do not
+    # have native e^x support in HW.
+    qk_scale = sm_scale * 1.44269504089
+    # Q is loaded once at the beginning and shared by all N blocks.
+    q = load_fn(Q_block_ptr, True, PADDED_HEAD, "zero")
+    q = (q * qk_scale).to(Q_block_ptr.type.element_ty)
+
+    # Here we compute how many full and masked blocks we have.
+    padded_block_k = n_extra_tokens != 0
+    is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0)
+    if IS_CAUSAL:
+        # There are always at least BLOCK_M // BLOCK_N masked blocks.
+        # Additionally there might be one more due to dissimilar seqlens.
+        masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn)
+    else:
+        # Padding on Q does not need to be masked in the FA loop.
+        masked_blocks = padded_block_k
+    # if IS_CAUSAL, not is_modulo_mn does not always result in an additional
+    # block. In this case we might exceed n_blocks so pick the min.
+    masked_blocks = min(masked_blocks, n_blocks)
+    n_full_blocks = n_blocks - masked_blocks
+    block_min = 0
+    block_max = n_blocks * BLOCK_N
+    # Compute for full blocks. Here we set causal to false regardless of its
+    # value because there is no masking. Similarly we do not need padding.
+    if n_full_blocks > 0:
+        block_max = (n_blocks - masked_blocks) * BLOCK_N
+        acc, l_i, m_i = _attn_fwd_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            start_m,
+            seqlen_k,
+            dropout_p,
+            philox_seed,
+            batch_philox_offset,
+            encoded_softmax_block_ptr,
+            # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _
+            block_min,
+            block_max,
+            0,
+            0,
+            0,
+            bias_ptr,
+            # IS_CAUSAL, ....
+            False,
+            BLOCK_M,
+            BLOCK_DMODEL,
+            BLOCK_N,
+            offs_m,
+            offs_n,
+            # _, MASK_STEPS, ...
+            PRE_LOAD_V,
+            False,
+            ENABLE_DROPOUT,
+            RETURN_ENCODED_SOFTMAX,
+            PADDED_HEAD,
+        )
+        block_min = block_max
+        block_max = n_blocks * BLOCK_N
+
+    tl.debug_barrier()
+    # Remaining blocks, if any, are the masked ones (causal edge and/or K padding).
+    if masked_blocks > 0:
+        offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0
+        K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N))
+        V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0))
+        if bias_ptr is not None:
+            bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N))
+        if RETURN_ENCODED_SOFTMAX:
+            encoded_softmax_block_ptr = tl.advance(
+                encoded_softmax_block_ptr, (0, n_full_blocks)
+            )
+        acc, l_i, m_i = _attn_fwd_inner(
+            acc,
+            l_i,
+            m_i,
+            q,
+            K_block_ptr,
+            V_block_ptr,
+            start_m,
+            seqlen_k,
+            dropout_p,
+            philox_seed,
+            batch_philox_offset,
+            encoded_softmax_block_ptr,
+            block_min,
+            block_max,
+            offs_n_causal,
+            masked_blocks,
+            n_extra_tokens,
+            bias_ptr,
+            IS_CAUSAL,
+            BLOCK_M,
+            BLOCK_DMODEL,
+            BLOCK_N,
+            offs_m,
+            offs_n,
+            # _, MASK_STEPS, ...
+            PRE_LOAD_V,
+            True,
+            ENABLE_DROPOUT,
+            RETURN_ENCODED_SOFTMAX,
+            PADDED_HEAD,
+        )
+    # epilogue
+    acc = acc / l_i[:, None]
+    if ENABLE_DROPOUT:
+        acc = acc / (1 - dropout_p)
+    # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M,
+    # then we have one block with a row of all NaNs which come from computing
+    # softmax over a row of all -infs (-inf - inf = NaN). We check for that here
+    # and store 0s where there are NaNs as these rows should've been zeroed out.
+    end_m_idx = (start_m + 1) * BLOCK_M
+    start_m_idx = start_m * BLOCK_M
+    causal_start_idx = seqlen_q - seqlen_k
+    acc = acc.to(Out.type.element_ty)
+    if IS_CAUSAL:  # noqa: SIM102
+        if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx:
+            out_mask_boundary = tl.full(
+                (BLOCK_DMODEL,), causal_start_idx, dtype=tl.int32
+            )
+            mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M)
+            out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :]
+            z = 0.0
+            acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty))
+    # write back LSE
+    # l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m
+    # If seqlen_q not multiple of BLOCK_M, we need to mask out the last
+    # few rows. This is only true for the last M block. For others,
+    # overflow_size will be -ve
+    # overflow_size = end_m_idx - seqlen_q
+    # if overflow_size > 0:
+    #    boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32)
+    #    # This is a > check because mask being 0 blocks the store.
+    #    l_ptrs_mask = boundary > tl.arange(0, BLOCK_M)
+    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask)
+    # else:
+    #    tl.store(l_ptrs, m_i + tl.math.log2(l_i))
+
+    # write back O
+    o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh
+    O_block_ptr = tl.make_block_ptr(
+        base=Out + o_offset,
+        shape=(seqlen_q, ACTUAL_BLOCK_DMODEL),
+        strides=(stride_om, stride_on),
+        offsets=(start_m * BLOCK_M, 0),
+        block_shape=(BLOCK_M, BLOCK_DMODEL),
+        order=(1, 0),
+    )
+    # Need boundary check on this to make sure the padding from the
+    # Q and KV tensors in both dims are not part of what we store back.
+    # TODO: Do the boundary check optionally.
+    tl.store(O_block_ptr, acc, boundary_check=(0, 1))
+
+
+def check_args(
+    q,
+    k,
+    v,
+    o,
+    varlen=True,
+    max_seqlens=None,
+    cu_seqlens_q=None,
+    cu_seqlens_k=None,
+):
+    """Validate attention inputs; raises AssertionError on the first violation.
+
+    varlen: 3-D q/k/v + both cu_seqlens; else 4-D q/k/v + positive max_seqlens.
+    """
+    assert q.dim() == k.dim() and q.dim() == v.dim()
+    if varlen:
+        assert q.dim() == 3
+        assert cu_seqlens_q is not None and cu_seqlens_k is not None
+        assert len(cu_seqlens_q) == len(cu_seqlens_k)
+    else:
+        assert q.dim() == 4
+        assert max_seqlens is not None and max_seqlens > 0
+    nheads_q, nheads_k, head_size = q.shape[1], k.shape[1], q.shape[-1]
+    assert k.shape == v.shape
+    assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1]
+    # TODO: Change assert if we support qkl f8 and v f16
+    assert q.dtype == k.dtype and q.dtype == v.dtype
+    # TODO: Fix assert to check head size <=256 once supported
+    assert head_size <= 128
+    assert o.shape == q.shape
+    assert (nheads_q % nheads_k) == 0
+
+
+class _attention(torch.autograd.Function):
+    """Forward-only autograd wrapper that launches the `attn_fwd` Triton kernel."""
+
+    @staticmethod
+    def forward(
+        ctx,
+        q,
+        k,
+        v,
+        o,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlens_q,
+        max_seqlens_k,
+        causal=False,
+        sm_scale=1.0,
+        bias=None,
+    ):
+        """Run variable-length attention and return `(o, encoded_softmax)`.
+
+        q/k/v are unpacked as (total_tokens, num_heads, head_size). `o` is
+        allocated when None. Dropout and the encoded-softmax output are
+        disabled here, so the second return value is always None.
+        """
+        if o is None:
+            o = torch.empty_like(q, dtype=v.dtype)
+
+        check_args(
+            q,
+            k,
+            v,
+            o,
+            varlen=True,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+        )
+        # Varlen only: strides go in (batch=0, head, token, feature) order.
+        _, nheads_q, head_size = q.shape
+        _, nheads_k, _ = k.shape
+        batch = len(cu_seqlens_q) - 1
+        q_strides = (0, q.stride(1), q.stride(0), q.stride(2))
+        k_strides = (0, k.stride(1), k.stride(0), k.stride(2))
+        v_strides = (0, v.stride(1), v.stride(0), v.stride(2))
+        o_strides = (0, o.stride(1), o.stride(0), o.stride(2))
+
+        # Round head_size up to a power of two, with a floor of 16.
+        padded_d_model = 1 << (head_size - 1).bit_length()
+        padded_d_model = max(padded_d_model, 16)
+
+        grid = lambda META: (
+            triton.cdiv(max_seqlens_q, META["BLOCK_M"]),
+            nheads_q,
+            batch,
+        )
+
+        encoded_softmax = None
+
+        # Seed the RNG so we get reproducible results for testing.
+        philox_seed = 0x1BF52
+        philox_offset = 0x1D4B42
+
+        if bias is not None:
+            bias_strides = (
+                bias.stride(0),
+                bias.stride(1),
+                bias.stride(2),
+                bias.stride(3),
+            )
+        else:
+            bias_strides = (0, 0, 0, 0)
+
+        attn_fwd[grid](
+            q,
+            k,
+            v,
+            bias,
+            sm_scale,
+            None,
+            o,
+            *q_strides,
+            *k_strides,
+            *v_strides,
+            *o_strides,
+            *bias_strides,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            dropout_p=0.0,
+            philox_seed=philox_seed,
+            philox_offset_base=philox_offset,
+            encoded_softmax=encoded_softmax,
+            HQ=nheads_q,
+            HK=nheads_k,
+            ACTUAL_BLOCK_DMODEL=head_size,
+            MAX_SEQLENS_Q=max_seqlens_q,
+            MAX_SEQLENS_K=max_seqlens_k,
+            IS_CAUSAL=causal,
+            VARLEN=True,
+            BLOCK_DMODEL=padded_d_model,
+            BIAS_TYPE=0 if bias is None else 1,
+            ENABLE_DROPOUT=False,
+            RETURN_ENCODED_SOFTMAX=False,
+        )
+
+        ctx.grid = grid
+        ctx.sm_scale = sm_scale
+        ctx.BLOCK_DMODEL = head_size
+        ctx.causal = causal
+        ctx.dropout_p = 0.0
+        ctx.philox_seed = philox_seed
+        ctx.philox_offset = philox_offset
+        ctx.encoded_softmax = encoded_softmax
+        ctx.return_encoded_softmax = False
+        return o, encoded_softmax
+
+
+triton_attention = _attention.apply
diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py
new file mode 100644
index 00000000..45a0a03e
--- /dev/null
+++ b/server/text_generation_server/layers/attention/ipex.py
@@ -0,0 +1,75 @@
+import intel_extension_for_pytorch as ipex
+import torch
+from text_generation_server.models.flash_causal_lm import BLOCK_SIZE
+from text_generation_server.layers.attention import Seqlen
+
+SUPPORTS_WINDOWING = False  # attention() below accepts window_size_left but never uses it
+
+
+def attention(
+    q,
+    k,
+    v,
+    out,
+    cu_seqlens,
+    max_s,
+    softmax_scale,
+    window_size_left=-1,
+    causal=True,
+):
+    """Forward to `ipex.llm.functional.varlen_attention` and return its result.
+
+    `window_size_left` is accepted but not used by this function.
+    """
+    # The same cu_seqlens / max_s pair is passed twice (presumably q side, k side).
+    seq_args = (cu_seqlens, cu_seqlens, max_s, max_s)
+    # Presumably (dropout_p, softmax_scale, zero_tensors, is_causal,
+    # return_softmax, generator) -- TODO confirm against the IPEX signature.
+    tail_args = (0.0, softmax_scale, False, causal, False, None)
+    return ipex.llm.functional.varlen_attention(
+        q,
+        k,
+        v,
+        out,
+        *seq_args,
+        *tail_args,
+    )
+
+
+def reshape_and_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slots: torch.Tensor,
+):
+    """Delegate to IPEX `PagedAttention.reshape_and_cache`; returns None."""
+    paged = ipex.llm.modules.PagedAttention
+    paged.reshape_and_cache(key, value, key_cache, value_cache, slots)
+
+
+def paged_attention(
+    out: torch.Tensor,
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    kv_head_mapping: torch.Tensor,
+    softmax_scale: float,
+    block_tables: torch.Tensor,
+    seqlen: Seqlen,
+    max_s: int,
+):
+    """Call IPEX `single_query_cached_kv_attention` and return `out`.
+
+    Only `seqlen.input_lengths` is read from `seqlen`; the imported `BLOCK_SIZE`
+    is forwarded unchanged.
+    """
+    kernel = ipex.llm.modules.PagedAttention.single_query_cached_kv_attention
+    cache_args = (key_cache, value_cache, kv_head_mapping)
+    length_args = (block_tables, seqlen.input_lengths, BLOCK_SIZE, max_s)
+    # The final positional slot is always filled with None.
+    kernel(
+        out, query, *cache_args, softmax_scale, *length_args, None
+    )
+    # `out` is presumably filled in place by the kernel -- TODO confirm.
+    return out
diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py
new file mode 100644
index 00000000..99c490d5
--- /dev/null
+++ b/server/text_generation_server/layers/attention/rocm.py
@@ -0,0 +1,232 @@
+import os
+import torch
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import FLASH_DECODING
+from text_generation_server.layers.attention import Seqlen
+from loguru import logger
+
+major, minor = torch.cuda.get_device_capability()
+is_sm75 = major == 7 and minor == 5
+_PARTITION_SIZE = 512  # used by paged_attention to count partitions
+
+use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"}
+ENGINE = "triton" if use_triton else "ck"
+
+try:
+    from vllm._C import cache_ops, ops
+except Exception as e:
+    # Chain the original exception so its traceback stays attached as the cause.
+    raise ImportError(
+        f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
+    ) from e
+
+
+def reshape_and_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slots: torch.Tensor,
+):
+    """Store `key`/`value` into the caches at `slots`; returns None."""
+    if not FLASH_DECODING:
+        tensors = (key, value, key_cache, value_cache, slots)
+        cache_ops.reshape_and_cache(*tensors, "auto", 1.0)
+    else:
+        *_, dim_a, dim_b = key_cache.shape  # trailing two dims of key_cache
+        key_cache.view(-1, dim_a, dim_b)[slots] = key
+        value_cache.view(-1, dim_a, dim_b)[slots] = value
+
+
+def paged_attention(
+    out: torch.Tensor,
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    kv_head_mapping: torch.Tensor,
+    softmax_scale: float,
+    block_tables: torch.Tensor,
+    input_lengths: Seqlen,
+    max_s: int,
+):
+    """Run vLLM paged attention (V1 or V2, per the heuristic below); return `out`.
+
+    `input_lengths` is a `Seqlen`; only its `.input_lengths` field is forwarded.
+    """
+    # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
+    # Copyright 2023 The vLLM team. All rights
+    # reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    #
+
+    # value_cache => [num_blocks, num_heads, head_size, block_size]
+    block_size = value_cache.shape[3]
+    num_seqs, num_heads, head_size = query.shape
+    max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+    lengths = input_lengths.input_lengths
+    # NOTE(woosuk): We use a simple heuristic to decide whether to use
+    # PagedAttention V1 or V2. If the number of partitions is 1, we use
+    # V1 to avoid the overhead of reduction. Also, if the number of
+    # sequences or heads is large, we use V1 since there is enough work
+    # to parallelize.
+    use_v1 = max_s <= 8192 and (max_num_partitions == 1 or num_seqs * num_heads > 512)
+    if use_v1:
+        ops.paged_attention_v1(
+            out,
+            query,
+            key_cache,
+            value_cache,
+            kv_head_mapping,
+            softmax_scale,
+            block_tables,
+            lengths,
+            block_size,
+            max_s,
+            None,
+            "auto",
+            1.0,
+        )
+    else:
+        # Run PagedAttention V2.
+        assert _PARTITION_SIZE % block_size == 0
+        tmp_output = torch.empty(
+            size=(num_seqs, num_heads, max_num_partitions, head_size),
+            dtype=out.dtype,
+            device=out.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_heads, max_num_partitions),
+            dtype=torch.float32,
+            device=out.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+        ops.paged_attention_v2(
+            out,
+            exp_sums,
+            max_logits,
+            tmp_output,
+            query,
+            key_cache,
+            value_cache,
+            kv_head_mapping,
+            softmax_scale,
+            block_tables,
+            lengths,
+            block_size,
+            max_s,
+            None,
+            "auto",
+            1.0,
+        )
+    return out
+
+
+if ENGINE != "triton":
+    try:
+        import flash_attn_2_cuda
+
+        logger.info("ROCm: using Flash Attention 2 Composable Kernel implementation.")
+    except ImportError as e:
+        if major >= 8:
+            architecture_suffix = f"-{SYSTEM}"
+            raise ImportError(
+                "Flash Attention V2 is not installed.\n"
+                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
+                f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
+            ) from e
+        elif is_sm75:
+            raise ImportError(
+                "Flash Attention is not installed.\n"
+                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
+                "or install flash attention with `cd server && make install install-flash-attention`"
+            ) from e
+        else:
+            # Name the first GPU that is neither MI210 nor MI250; else fail generically.
+            for idx in range(torch.cuda.device_count()):
+                name = torch.cuda.get_device_name(idx)
+                if "MI210" not in name and "MI250" not in name:
+                    raise ImportError(
+                        f"AMD GPU {name} does not support flash-attention"
+                    ) from e
+            raise ImportError(
+                f"AMD GPU with ROCm capability {major} {minor} is not supported"
+            ) from e
+
+
+SUPPORTS_WINDOWING = False
+if ENGINE == "ck":
+
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+        causal=True,
+    ):
+        """Varlen attention through `flash_attn_2_cuda.varlen_fwd`.
+
+        `window_size_left` is only validated (-1 or > 0); it is not forwarded.
+        """
+        if window_size_left != -1 and window_size_left <= 0:
+            raise ValueError("`window_size_left` must be > 0 or -1")
+
+        # The same cu_seqlens / max_s pair is passed twice (presumably q, then k).
+        seq_args = (cu_seqlens, cu_seqlens, max_s, max_s)
+        # Presumably (dropout_p, softmax_scale, zero_tensors, is_causal,
+        # return_softmax, generator) -- TODO confirm against flash-attn varlen_fwd.
+        tail_args = (0.0, softmax_scale, False, causal, False, None)
+        return flash_attn_2_cuda.varlen_fwd(
+            q,
+            k,
+            v,
+            out,
+            *seq_args,
+            *tail_args,
+        )
+
+elif ENGINE == "triton":
+    from .flash_attn_triton import triton_attention
+
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+        causal=True,
+    ):
+        """Varlen attention through `triton_attention`; `window_size_left` is unused."""
+        # Query and key use the same cumulative lengths and maximum length.
+        seq_args = (cu_seqlens, cu_seqlens, max_s, max_s)
+        # Only the first element of the returned pair is handed back to the caller.
+        attn_out, _unused = triton_attention(
+            q,
+            k,
+            v,
+            out,
+            *seq_args,
+            causal,
+            softmax_scale,
+        )
+        return attn_out
+
+else:
+    raise RuntimeError(f"Unknown attention engine {ENGINE}")
diff --git a/server/text_generation_server/layers/awq/conversion_utils.py b/server/text_generation_server/layers/awq/conversion_utils.py
new file mode 100644
index 00000000..b19eafbb
--- /dev/null
+++ b/server/text_generation_server/layers/awq/conversion_utils.py
@@ -0,0 +1,97 @@
+import torch
+from typing import List
+
+
+AWQ_PACK_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]  # default `order` for apply_order()
+REVERSE_AWQ_PACK_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]  # inverse permutation of AWQ_PACK_ORDER
+
+
+def pack(imatrix: torch.Tensor, direction: str = "column"):
+    """
+    Packs a 4-bit integer matrix into a packed 32-bit integer matrix.
+    Args:
+        imatrix (torch.Tensor): matrix of integers; only the low 4 bits are kept
+        direction (str): direction of packing, either "column" or "row"
+    Returns:
+        qmatrix (torch.Tensor): packed int32 matrix, 8 values per element
+    Raises:
+        ValueError: if `direction` is neither "column" nor "row"
+    """
+    if direction not in ("column", "row"):
+        # Fail early instead of hitting an unbound `qmatrix` further down.
+        raise ValueError(f"direction must be 'column' or 'row', got {direction!r}")
+    shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=imatrix.device)
+    imatrix = imatrix.to(torch.int8) & 0x0F  # eventually correct overflow
+
+    if direction == "column":
+        imatrix = imatrix.view(-1, imatrix.shape[1] // (32 // 4), (32 // 4))
+        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, None, :]).sum(dim=-1)
+    else:
+        imatrix = imatrix.view(imatrix.shape[0] // (32 // 4), (32 // 4), -1)
+        qmatrix = torch.bitwise_left_shift(imatrix, shifts[None, :, None]).sum(dim=1)
+    return qmatrix.to(torch.int32)
+
+
+def unpack(qmatrix: torch.Tensor, direction: str = "column"):
+    """
+    Unpacks a 32-bit packed integer matrix into a 4-bit integer matrix.
+    Args:
+        qmatrix (torch.Tensor): matrix of packed integers
+        direction (str): direction of unpacking, either "column" or "row"
+    Returns:
+        imatrix (torch.Tensor): int8 matrix with every value in [0, 15]
+    Raises:
+        ValueError: if `direction` is neither "column" nor "row"
+    """
+    if direction not in ("column", "row"):
+        raise ValueError(f"direction must be 'column' or 'row', got {direction!r}")
+    shifts = torch.arange(0, 32, 4, device=qmatrix.device)
+    if direction == "column":
+        imatrix = torch.bitwise_right_shift(
+            qmatrix[:, :, None], shifts[None, None, :]
+        ).view(qmatrix.shape[0], -1)
+    else:
+        imatrix = torch.bitwise_right_shift(
+            qmatrix[:, None, :], shifts[None, :, None]
+        ).view(-1, qmatrix.shape[-1])
+    # Keep only the low nibble of every shifted value.
+    return imatrix.to(torch.int8) & 0x0F
+
+
+def apply_order(
+    imatrix: torch.Tensor,
+    direction: str = "column",
+    order: List[int] = AWQ_PACK_ORDER,
+):
+    """
+    Applies `order` to a 4-bit integer matrix viewed in chunks of eight.
+    Args:
+        imatrix (torch.Tensor): matrix of integers
+        direction (str): "column" or "row"; any other value returns the input as is
+        order (List[int]): index order to apply, default is AWQ_PACK_ORDER
+    Returns:
+        imatrix (torch.Tensor): reordered matrix with the input's shape
+    """
+    group = 32 // 4
+    if direction == "column":
+        return imatrix.view(-1, group)[:, order].view(imatrix.shape)
+    if direction == "row":
+        return imatrix.view(group, -1)[order, :].view(imatrix.shape)
+    return imatrix
+
+
+def fast_awq_to_gptq(qweight, qzeros):
+    """Repack AWQ `qweight`/`qzeros` and return them as `(qweight, qzeros)`.
+
+    Both inputs are unpacked column-wise and reordered with
+    REVERSE_AWQ_PACK_ORDER. Zeros are decremented by 1 and packed column-wise
+    again; weights are packed row-wise.
+    """
+    undo = REVERSE_AWQ_PACK_ORDER
+    # Weights: column-unpack, undo the AWQ order, then row-pack (exllama layout).
+    raw_weights = unpack(qweight, direction="column")
+    raw_weights = apply_order(raw_weights, direction="column", order=undo)
+    # Zeros: same unpack/reorder, minus 1 (gptq adds 1 to the zeros).
+    raw_zeros = unpack(qzeros, direction="column")
+    raw_zeros = apply_order(raw_zeros, direction="column", order=undo) - 1
+    return pack(raw_weights, direction="row"), pack(raw_zeros, direction="column")
diff --git a/server/text_generation_server/layers/awq/quantize/qmodule.py b/server/text_generation_server/layers/awq/quantize/qmodule.py
new file mode 100644
index 00000000..c859db1b
--- /dev/null
+++ b/server/text_generation_server/layers/awq/quantize/qmodule.py
@@ -0,0 +1,50 @@
+# Copied logic from https://github.com/mit-han-lab/llm-awq/blob/f084f40bd996f3cf3a0633c1ad7d9d476c318aaa/awq/quantize/qmodule.py
+
+import math
+from typing import Optional
+import torch
+import torch.nn as nn
+import awq_inference_engine  # with CUDA kernels
+
+
+# class ScaledActivation(nn.Module):
+#     def __init__(self, module, scales):
+#         super().__init__()
+#         self.act = module
+#         self.scales = nn.Parameter(scales.data)
+#
+#     def forward(self, x):
+#         return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
+
+
+class WQLinear(nn.Module):
+    """Linear layer over packed 4-bit weights, evaluated by `awq_inference_engine`."""
+
+    def __init__(
+        self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor]
+    ):
+        super().__init__()
+        if w_bit != 4:
+            raise NotImplementedError("Only 4-bit are supported for now.")
+        in_features, packed_cols = qweight.shape[0], qweight.shape[1]
+        out_features = packed_cols * 32 // w_bit
+        if group_size == -1:
+            group_size = in_features  # -1 selects a single group over all inputs
+        # quick sanity check (make sure alignment)
+        assert in_features % group_size == 0
+        assert out_features % (32 // w_bit) == 0
+        self.in_features, self.out_features = in_features, out_features
+        self.w_bit, self.group_size = w_bit, group_size
+        self.qweight, self.qzeros = qweight, qzeros
+        self.scales, self.bias = scales, bias
+
+    @torch.no_grad()
+    def forward(self, x):
+        """Run the AWQ GEMM on `x` flattened to 2-D, add `bias` if set, restore shape."""
+        flat = x.reshape(-1, x.shape[-1])
+        out = awq_inference_engine.gemm_forward_cuda(
+            flat, self.qweight, self.scales, self.qzeros, 8
+        )
+        if self.bias is not None:
+            out = out + self.bias
+        return out.reshape(x.shape[:-1] + (self.out_features,))
diff --git a/server/text_generation_server/layers/bnb.py b/server/text_generation_server/layers/bnb.py
new file mode 100644
index 00000000..ca39919c
--- /dev/null
+++ b/server/text_generation_server/layers/bnb.py
@@ -0,0 +1,106 @@
+import torch
+from loguru import logger
+from functools import lru_cache
+import bitsandbytes as bnb
+from bitsandbytes.nn import Int8Params, Params4bit
+
+
+@lru_cache(1)
+def warn_deprecate_bnb():
+    """Emit the bitsandbytes 8-bit deprecation warning; `lru_cache` limits it to the first call."""
+    msg = "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance"
+    logger.warning(msg)
+
+
+class Linear8bitLt(torch.nn.Module):  # 8-bit linear layer driven by bnb.matmul and a MatmulLtState
+    def __init__(
+        self,
+        weight,
+        bias,
+        has_fp16_weights=True,
+        memory_efficient_backward=False,
+        threshold=0.0,
+        index=None,
+    ):
+        super().__init__()
+        assert (
+            not memory_efficient_backward
+        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
+        self.state = bnb.MatmulLtState()
+        self.index = index
+
+        # Necessary for stacked layers
+        self.state.threshold = threshold
+        self.state.has_fp16_weights = has_fp16_weights
+        self.state.memory_efficient_backward = memory_efficient_backward
+        if threshold > 0.0 and not has_fp16_weights:
+            self.state.use_pool = True
+
+        self.weight = Int8Params(
+            weight.data,
+            has_fp16_weights=has_fp16_weights,
+            requires_grad=has_fp16_weights,
+        )
+        self.weight.cuda(weight.device)  # NOTE(review): result is discarded; presumably updates the parameter in place (confirm)
+        self.bias = bias
+
+    def init_8bit_state(self):  # hand CB/SCB over from the weight to the matmul state, clearing them on the weight
+        self.state.CB = self.weight.CB
+        self.state.SCB = self.weight.SCB
+        self.weight.CB = None
+        self.weight.SCB = None
+
+    def forward(self, x: torch.Tensor):
+        self.state.is_training = self.training  # mirror the module's train/eval flag onto the matmul state
+        if self.weight.CB is not None:  # init_8bit_state() sets weight.CB back to None afterwards
+            self.init_8bit_state()
+
+        # weights are cast automatically as Int8Params, but the bias has to be cast manually
+        if self.bias is not None and self.bias.dtype != x.dtype:
+            self.bias.data = self.bias.data.to(x.dtype)
+
+        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
+
+        if not self.state.has_fp16_weights:
+            if self.state.CB is not None and self.state.CxB is not None:
+                # we converted 8-bit row major to turing/ampere format in the first inference pass
+                # we no longer need the row-major weight
+                del self.state.CB
+                self.weight.data = self.state.CxB
+        return out
+
+
+class Linear4bit(torch.nn.Module):
+    """Linear layer whose weight is wrapped in bitsandbytes `Params4bit` and multiplied with `bnb.matmul_4bit`."""
+
+    def __init__(self, weight, bias, quant_type):
+        super().__init__()
+        self.weight = Params4bit(
+            weight.data,
+            requires_grad=False,
+            compress_statistics=True,
+            quant_type=quant_type,
+        )
+        self.compute_dtype = None
+        self.weight.cuda(weight.device)
+        self.bias = bias
+
+    def forward(self, x: torch.Tensor):
+        # the bias has to be cast manually to the activation dtype
+        if self.bias is not None and self.bias.dtype != x.dtype:
+            self.bias.data = self.bias.data.to(x.dtype)
+
+        if getattr(self.weight, "quant_state", None) is None:
+            # route through the module logger (not stdout) like the other warnings in this file
+            logger.warning(
+                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
+            )
+        inp_dtype = x.dtype
+        if self.compute_dtype is not None:
+            x = x.to(self.compute_dtype)
+
+        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
+        out = bnb.matmul_4bit(
+            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
+        )
+        return out.to(inp_dtype)
diff --git a/server/text_generation_server/layers/conv.py b/server/text_generation_server/layers/conv.py
new file mode 100644
index 00000000..7fb18ab3
--- /dev/null
+++ b/server/text_generation_server/layers/conv.py
@@ -0,0 +1,41 @@
+from accelerate import init_empty_weights
+import torch
+
+
+@classmethod
+def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
+    """Build a `cls` conv layer and attach the `{prefix}.weight` / `{prefix}.bias` tensors from `weights`."""
+    kernel = weights.get_tensor(f"{prefix}.weight")
+    offset = weights.get_tensor(f"{prefix}.bias")
+    shape_kwargs = dict(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+    )
+    with init_empty_weights():
+        layer = cls(**shape_kwargs)
+    layer.weight, layer.bias = torch.nn.Parameter(kernel), torch.nn.Parameter(offset)
+    return layer
+
+
+@classmethod
+def load_conv2d_no_bias(
+    cls, prefix, weights, in_channels, out_channels, kernel_size, stride
+):
+    """Build a `cls` conv layer from the `{prefix}.weight` tensor in `weights`, with `bias` set to None."""
+    kernel = weights.get_tensor(f"{prefix}.weight")
+    shape_kwargs = dict(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        kernel_size=kernel_size,
+        stride=stride,
+    )
+    with init_empty_weights():
+        layer = cls(**shape_kwargs)
+    layer.weight, layer.bias = torch.nn.Parameter(kernel), None
+    return layer
+
+
+torch.nn.Conv2d.load = load_conv2d
+torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
diff --git a/server/text_generation_server/layers/eetq.py b/server/text_generation_server/layers/eetq.py
new file mode 100644
index 00000000..fd22b5c6
--- /dev/null
+++ b/server/text_generation_server/layers/eetq.py
@@ -0,0 +1,25 @@
+import torch
+from EETQ import quant_weights, w8_a16_gemm
+
+
+class EETQLinear(torch.nn.Module):
+    """Linear layer that int8-quantizes its weight with EETQ and runs `w8_a16_gemm`."""
+
+    def __init__(self, weight, bias) -> None:
+        super().__init__()
+        target = weight.device
+        # quant_weights is fed a contiguous, transposed fp16 copy held on the CPU
+        fp16_weight = weight if weight.dtype == torch.float16 else weight.to(dtype=torch.float16)
+        transposed = torch.t(fp16_weight).contiguous().cpu()
+        qweight, qscale = quant_weights(transposed, torch.int8, False)
+
+        # move the quantized tensors (and bias, if any) onto the original device
+        self.weight = qweight.cuda(target)
+        self.scale = qscale.cuda(target)
+        self.bias = None if bias is None else bias.cuda(target)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        result = w8_a16_gemm(input, self.weight, self.scale)
+        if self.bias is not None:
+            result = result + self.bias
+        return result
diff --git a/server/text_generation_server/layers/exl2.py b/server/text_generation_server/layers/exl2.py
new file mode 100644
index 00000000..f6cb729e
--- /dev/null
+++ b/server/text_generation_server/layers/exl2.py
@@ -0,0 +1,23 @@
+import torch
+from dataclasses import dataclass
+
+
+@dataclass
+class Exl2Weight:
+    """
+    Exllama2 exl2 quantized weights; `__post_init__` rescales `q_scale_max` and casts `q_invperm` to short.
+    """
+
+    q_weight: torch.Tensor
+    q_scale: torch.Tensor
+    q_invperm: torch.Tensor
+    q_scale_max: torch.Tensor
+    q_groups: torch.Tensor
+
+    def __post_init__(self):
+        self.q_scale_max /= 256  # in-place division: the tensor object passed in is modified too
+        self.q_invperm = self.q_invperm.short()
+
+    @property
+    def device(self) -> torch.device:
+        return self.q_weight.device  # the device of q_weight stands in for the whole bundle
diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py
new file mode 100644
index 00000000..dd61d081
--- /dev/null
+++ b/server/text_generation_server/layers/fp8.py
@@ -0,0 +1,43 @@
+import torch
+
+
+def fp8_quantize(weight, qdtype=torch.float8_e4m3fn):
+    """
+    Quantize `weight` to `qdtype` using one scale for the whole tensor.
+
+    Returns `(qweight, inv_scale)`: the `qdtype` data and the float32
+    reciprocal of the applied scale, the pair `torch._scaled_mm` takes.
+    """
+    limits = torch.finfo(qdtype)
+    # Scale is dtype max divided by absmax (floored to avoid dividing by zero)
+    scale = limits.max / weight.abs().max().clamp(min=1e-12)
+    # Clamp the scaled tensor into the representable float8 range before
+    # casting, because the default cast is unsaturated
+    qweight = (weight * scale).clamp(min=limits.min, max=limits.max).to(qdtype)
+    inv_scale = scale.float().reciprocal()
+    return qweight, inv_scale
+
+
+class Fp8Linear(torch.nn.Module):
+    """Linear layer that keeps an fp8-quantized weight and multiplies with `torch._scaled_mm`."""
+
+    def __init__(self, weight, bias) -> None:
+        super().__init__()
+        # remember the unquantized dtype: forward() asks _scaled_mm for outputs in it
+        self.dtype = weight.dtype
+        quantized, inv_scale = fp8_quantize(weight)
+        self.qweight = quantized
+        self.scale = inv_scale
+        self.bias = bias
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # activations are quantized on every call with their own scale
+        qinput, input_scale = fp8_quantize(input)
+        mm_kwargs = dict(
+            out_dtype=self.dtype,
+            scale_a=input_scale,
+            scale_b=self.scale,
+            bias=self.bias,
+        )
+        output, _ = torch._scaled_mm(qinput, self.qweight.t(), **mm_kwargs)
+        return output
diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py
new file mode 100644
index 00000000..56080145
--- /dev/null
+++ b/server/text_generation_server/layers/gptq/__init__.py
@@ -0,0 +1,71 @@
+from dataclasses import dataclass
+import os
+from typing import Optional
+import torch
+from text_generation_server.utils.import_utils import (
+    SYSTEM,
+)
+
+
+@dataclass
+class GPTQParams:
+    bits: int
+    checkpoint_format: Optional[str]
+    groupsize: int
+    desc_act: bool
+    quant_method: str
+    sym: bool
+
+
+@dataclass
+class GPTQWeight:
+    qweight: torch.Tensor
+    qzeros: torch.Tensor
+    scales: torch.Tensor
+    g_idx: Optional[torch.Tensor]
+    bits: int
+    groupsize: int
+    use_exllama: bool
+
+    def __post_init__(self):
+        if self.scales.dtype == torch.float:
+            self.scales = self.scales.half()
+
+    @property
+    def device(self) -> torch.device:
+        return self.qweight.device
+
+
+try:
+    major, _minor = torch.cuda.get_device_capability()
+except Exception:
+    major = 1
+
+HAS_EXLLAMA = False
+CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
+V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
+if os.getenv("DISABLE_EXLLAMA") == "True":
+    HAS_EXLLAMA = False
+elif CAN_EXLLAMA:
+    try:
+        if V2:
+            from text_generation_server.layers.gptq.exllamav2 import (
+                QuantLinear as ExllamaQuantLinear,
+                create_exllama_buffers,
+                set_device,
+            )
+
+            HAS_EXLLAMA = "2"
+        else:
+            from text_generation_server.layers.gptq.exllama import (
+                Ex4bitLinear as ExllamaQuantLinear,
+                create_exllama_buffers,
+                set_device,
+            )
+
+            HAS_EXLLAMA = "1"
+
+    except ImportError:
+        pass
+
+from text_generation_server.layers.gptq.quant_linear import QuantLinear
diff --git a/server/text_generation_server/utils/gptq/custom_autotune.py b/server/text_generation_server/layers/gptq/custom_autotune.py
similarity index 98%
rename from server/text_generation_server/utils/gptq/custom_autotune.py
rename to server/text_generation_server/layers/gptq/custom_autotune.py
index 17dff02e..1eb40f1e 100644
--- a/server/text_generation_server/utils/gptq/custom_autotune.py
+++ b/server/text_generation_server/layers/gptq/custom_autotune.py
@@ -88,9 +88,9 @@ class Autotuner(triton.KernelInterface):
             # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
             # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
             return triton.testing.do_bench(
-                kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40
+                kernel_call, quantiles=(0.5, 0.2, 0.8), rep=40
             )
-        except triton.compiler.OutOfResources:
+        except triton.OutOfResources:
             return (float("inf"), float("inf"), float("inf"))
 
     def run(self, *args, **kwargs):
diff --git a/server/text_generation_server/utils/gptq/exllama.py b/server/text_generation_server/layers/gptq/exllama.py
similarity index 76%
rename from server/text_generation_server/utils/gptq/exllama.py
rename to server/text_generation_server/layers/gptq/exllama.py
index 6a1cf117..f27666b7 100644
--- a/server/text_generation_server/utils/gptq/exllama.py
+++ b/server/text_generation_server/layers/gptq/exllama.py
@@ -1,3 +1,4 @@
+from text_generation_server.layers.gptq import GPTQWeight
 import torch
 from exllama_kernels import make_q4, q4_matmul, prepare_buffers, set_tuning_params
 
@@ -37,19 +38,12 @@ def set_device(device):
     DEVICE = device
 
 
-def create_exllama_buffers():
+def create_exllama_buffers(max_total_tokens: int):
     global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE, TEMP_STATE, TEMP_DQ
 
     assert DEVICE is not None, "call set_device first"
 
-    if ACT_ORDER:
-        # TODO: this should be set to rust side `max_total_tokens`, but TGI
-        # does not offer an API to expose this variable to python, as this variable
-        # is handled by the client but it appears the model is initialized by the server.
-        # An alternative could be to initialize the buffers during warmup.
-        # Dummy
-        max_total_tokens = 2048
-    else:
+    if not ACT_ORDER:
         max_total_tokens = 1
 
     # This temp_state buffer is required to reorder X in the act-order case.
@@ -69,26 +63,28 @@ def create_exllama_buffers():
     TEMP_STATE, TEMP_DQ = temp_state, temp_dq
 
 
-class Ex4bitLinear:
+class Ex4bitLinear(torch.nn.Module):
     """Linear layer implementation with per-group 4-bit quantization of the weights"""
 
-    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
+    def __init__(self, weight: GPTQWeight, bias):
+        super().__init__()
         global MAX_DQ, MAX_INNER, ACT_ORDER, DEVICE
-        assert bits == 4
+        assert weight.bits == 4
 
-        self.device = qweight.device
-        self.qweight = qweight
-        self.qzeros = qzeros
-        self.scales = scales
-        self.g_idx = g_idx.cpu() if g_idx is not None else None
+        self.device = weight.qweight.device
+        self.qweight = weight.qweight
+        self.qzeros = weight.qzeros
+        self.scales = weight.scales
+        self.g_idx = weight.g_idx.cpu() if weight.g_idx is not None else None
         self.bias = bias if bias is not None else None
 
         if self.g_idx is not None and (
             (self.g_idx == 0).all()
             or torch.equal(
-                g_idx.cpu(),
+                weight.g_idx.cpu(),
                 torch.tensor(
-                    [i // groupsize for i in range(g_idx.shape[0])], dtype=torch.int32
+                    [i // weight.groupsize for i in range(weight.g_idx.shape[0])],
+                    dtype=torch.int32,
                 ),
             )
         ):
@@ -102,8 +98,8 @@ class Ex4bitLinear:
             self.qweight, self.qzeros, self.scales, self.g_idx, self.device.index
         )
 
-        self.height = qweight.shape[0] * 8
-        self.width = qweight.shape[1]
+        self.height = weight.qweight.shape[0] * 8
+        self.width = weight.qweight.shape[1]
 
         # Infer groupsize from height of qzeros
         self.groupsize = None
@@ -111,7 +107,7 @@ class Ex4bitLinear:
             self.groupsize = (self.qweight.shape[0] * 8) // (self.qzeros.shape[0])
 
         if self.groupsize is not None:
-            assert groupsize == self.groupsize
+            assert weight.groupsize == self.groupsize
 
         # Handle act-order matrix
         if self.g_idx is not None:
diff --git a/server/text_generation_server/layers/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py
new file mode 100644
index 00000000..4d45822b
--- /dev/null
+++ b/server/text_generation_server/layers/gptq/exllamav2.py
@@ -0,0 +1,253 @@
+# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2
+
+from dataclasses import dataclass
+from typing import Optional
+import torch
+import torch.nn as nn
+
+from loguru import logger
+
+from text_generation_server.layers.exl2 import Exl2Weight
+from text_generation_server.layers.gptq import GPTQWeight
+
+try:
+    from exllamav2_kernels import make_q_matrix, gemm_half_q_half
+except ImportError:
+    logger.error("exllamav2_kernels not installed.")
+    raise
+
+# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
+none_tensor = torch.empty((1, 1), device="meta")
+
+
+@dataclass
+class _ExtraTensors:
+    """Additional generated quantizer tensors."""
+
+    q_group_map: Optional[torch.Tensor] = None
+    q_invperm: Optional[torch.Tensor] = None
+    q_perm: Optional[torch.Tensor] = None
+
+
+def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
+    """Multiply `x` (flattened to 2-D) by the quantized matrix behind `q_handle`; returns half precision."""
+    flat = x.view(-1, x.shape[-1])
+    rows = flat.shape[0]
+    result = torch.empty((rows, q4_width), dtype=torch.half, device=flat.device)
+    gemm_half_q_half(flat, q_handle, result, force_cuda)
+    return result.view(x.shape[:-1] + (q4_width,))
+
+
+def make_group_map(q_groups: torch.Tensor, num_qrows: int):
+    """
+    Expand `q_groups`, read as flat (bits, starting qrow) pairs, into a flat short
+    tensor holding (group index, rows left in that group) for every unpacked row.
+    """
+    flat = q_groups.tolist()
+    bit_widths, starts = flat[0::2], flat[1::2]
+    num_groups = len(flat) // 2
+    # a group ends where the next one starts; the last one ends at num_qrows
+    ends = starts[1:num_groups] + [num_qrows]
+
+    entries = []
+    for idx in range(num_groups):
+        rows = (ends[idx] - starts[idx]) * 32 // bit_widths[idx]
+        for remaining in range(rows, 0, -1):
+            entries.extend((idx, remaining))
+    return torch.tensor(entries, dtype=torch.short, device=q_groups.device)
+
+
+# Create Q matrix
+
+
+def ext_make_q_matrix(
+    w: Exl2Weight | GPTQWeight,
+    extra: _ExtraTensors,
+    temp_dq,
+    key: Optional[str] = None,
+):
+    """
+    Create the Q matrix handle with `make_q_matrix`; fills `extra` in place and raises RuntimeError for an unsupported `w`.
+    """
+    # EXL2
+    if isinstance(w, Exl2Weight):
+        extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0])
+        extra.q_perm = torch.argsort(w.q_invperm).short()
+
+        return make_q_matrix(
+            w.q_weight,
+            extra.q_perm,
+            w.q_invperm,
+            w.q_scale,
+            w.q_scale_max,
+            w.q_groups,
+            extra.q_group_map,
+            none_tensor,
+            none_tensor,
+            none_tensor,
+            temp_dq,
+        )
+    # GPTQ
+    elif isinstance(w, GPTQWeight):
+        if w.scales.dtype == torch.float:
+            w.scales = w.scales.half()
+
+        # GPTQ with g_idx (act_order)
+        if w.g_idx is not None and not (w.g_idx == 0).all().item():
+            extra.q_perm = torch.empty(
+                (w.qweight.shape[0] * 8,),
+                dtype=torch.short,
+                device=w.qweight.device,
+            )
+            extra.q_invperm = torch.empty_like(extra.q_perm)
+            # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
+            return make_q_matrix(
+                w.qweight,
+                extra.q_perm,
+                extra.q_invperm,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                w.qzeros,
+                w.scales,
+                w.g_idx.cpu(),
+                temp_dq,
+            )
+        # GPTQ without g_idx
+        else:
+            return make_q_matrix(
+                w.qweight,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                none_tensor,
+                w.qzeros,
+                w.scales,
+                none_tensor,
+                temp_dq,
+            )
+    else:
+        raise RuntimeError("Cannot create handle")
+
+
+DEVICE = None
+LAYERS = []
+
+
+def set_device(device):
+    global DEVICE
+    DEVICE = device
+
+
+def create_exllama_buffers(max_total_tokens: int):
+    """Create one scratch holder sized for the neediest ExLlamaV2 layer and finish initializing every layer."""
+    global LAYERS, DEVICE
+
+    # Nothing registered itself in LAYERS, so no scratch space is needed.
+    if not LAYERS:
+        return
+
+    # The shared scratch space must fit the largest per-layer requirement.
+    per_layer_bytes = [
+        registered.scratch_space_fixed(max_input_len=max_total_tokens, max_batch_size=1)
+        for registered in LAYERS
+    ]
+    shared_scratch = ExLlamaV2DeviceTensors(DEVICE, max(per_layer_bytes))
+
+    for registered in LAYERS:
+        registered.post_init(shared_scratch)
+
+
+class QuantLinear(nn.Module):
+    """Linear layer on the exllamav2 kernels, for EXL2 weights or 4-bit GPTQ weights"""
+
+    QUANT_TYPE = "exllamav2"
+
+    def __init__(
+        self,
+        weight: Exl2Weight | GPTQWeight,
+        bias: torch.Tensor,
+    ):
+        super().__init__()
+        # q_handle stays None until post_init() builds it from a scratch slice
+        self.q_handle = None
+        self.q_tensors = weight
+        self.extra_tensors = _ExtraTensors()
+
+        if isinstance(weight, Exl2Weight):
+            self.infeatures = weight.q_invperm.shape[0]
+            self.outfeatures = weight.q_weight.shape[1]
+        elif isinstance(weight, GPTQWeight):
+            if weight.bits != 4:
+                raise ValueError(
+                    f"Exllamav2 kernel supports only bits=4, requested bits={weight.bits}. Something is wrong in the model initialization."
+                )
+
+            self.infeatures = weight.qweight.shape[0] // weight.bits * 32
+            self.outfeatures = weight.qweight.shape[1]
+        # pad outfeatures up to the next multiple of 32
+        self.padding = -self.outfeatures % 32
+        self.outfeatures = self.outfeatures + self.padding
+
+        self.device = weight.device
+        self.bias = bias
+        # register for create_exllama_buffers(), which calls post_init() on every layer
+        global LAYERS
+        LAYERS.append(self)
+
+    def post_init(self, temp_dq):
+        device = self.q_tensors.device
+        assert device.type == "cuda"
+        assert device.index is not None
+        temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
+
+        # We NEED to keep a pointer on Python side, otherwise the garbage collector will mess with us,
+        # and `Memory access fault by GPU node-2` will EAT you.
+        self.temp_dq = temp_dq
+        self.q_handle = ext_make_q_matrix(self.q_tensors, self.extra_tensors, temp_dq)
+
+    def forward(self, x, force_cuda=False):
+        output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)
+        # the bias is added in place on the freshly produced output
+        if self.bias is not None:
+            output.add_(self.bias)
+        return output
+
+    def temp_dq_size(self):
+        return self.infeatures * self.outfeatures * 2 + 128
+
+    def temp_fwd_size(self, max_input_len, max_batch_size):
+        return self.outfeatures * max_input_len * max_batch_size * 4 + 128
+
+    def scratch_space_fixed(self, max_input_len, max_batch_size):
+        return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size)
+
+
+class ExLlamaV2DeviceTensors:
+    """Lazily allocated half-precision scratch buffer on one device."""
+    device_idx: int
+    scratch_bytes: int
+    scratch_idx: int
+    scratch: Optional[torch.Tensor] = None  # allocated by prepare() on first use
+
+    def __init__(self, device, scratch_bytes):
+        self.device = device
+        self.scratch_bytes = scratch_bytes
+
+    def prepare(self):
+        # scratch_bytes is in bytes; a half element is 2 bytes wide
+        self.scratch = torch.empty(
+            (self.scratch_bytes // 2,), dtype=torch.half, device=self.device
+        )
+
+    def get_scratch_slice(self, size_bytes):
+        """Return the first `size_bytes` (rounded up to a multiple of 128) of scratch, as a half view."""
+        if self.scratch is None:
+            self.prepare()
+
+        size_bytes = ((size_bytes + 127) // 128) * 128
+        size_half = size_bytes // 2
+        return self.scratch.narrow(0, 0, size_half)
diff --git a/server/text_generation_server/layers/gptq/quant_linear.py b/server/text_generation_server/layers/gptq/quant_linear.py
new file mode 100644
index 00000000..f60758b6
--- /dev/null
+++ b/server/text_generation_server/layers/gptq/quant_linear.py
@@ -0,0 +1,356 @@
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.cuda.amp import custom_fwd
+
+import triton
+import triton.language as tl
+from . import custom_autotune
+
+
+# code based https://github.com/fpgaminer/GPTQ-triton
+@custom_autotune.autotune(
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=2,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 64,
+                "BLOCK_SIZE_K": 64,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 32,
+                "BLOCK_SIZE_N": 32,
+                "BLOCK_SIZE_K": 128,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=2,
+            num_warps=4,
+        ),
+    ],
+    key=["M", "N", "K"],
+    nearest_power_of_two=True,
+    prune_configs_by={
+        "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
+        "perf_model": None,
+        "top_k": None,
+    },
+)
+@triton.jit
+def matmul_248_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    scales_ptr,
+    zeros_ptr,
+    g_ptr,
+    M,
+    N,
+    K,
+    bits,
+    maxq,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_scales,
+    stride_zeros,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """
+    Compute the matrix multiplication C = A x B.
+    A is of shape (M, K) float16
+    B is of shape (K//8, N) int32
+    C is of shape (M, N) float16
+    scales is of shape (G, N) float16
+    zeros is of shape (G, N) float16
+    g_ptr is of shape (K) int32
+    """
+    infearure_per_bits = 32 // bits
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (
+        offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
+    )  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
+    a_mask = offs_am[:, None] < M
+    # b_ptrs is set up such that it repeats elements along the K axis 8 times
+    b_ptrs = b_ptr + (
+        (offs_k[:, None] // infearure_per_bits) * stride_bk
+        + offs_bn[None, :] * stride_bn
+    )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
+    g_ptrs = g_ptr + offs_k
+    # shifter is used to extract the N bits of each element in the 32-bit word from B
+    scales_ptrs = scales_ptr + offs_bn[None, :]
+    zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
+
+    shifter = (offs_k % infearure_per_bits) * bits
+    zeros_shifter = (offs_bn % infearure_per_bits) * bits
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    for k in range(0, num_pid_k):
+        g_idx = tl.load(g_ptrs)
+
+        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
+        scales = tl.load(
+            scales_ptrs + g_idx[:, None] * stride_scales
+        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+        zeros = tl.load(
+            zeros_ptrs + g_idx[:, None] * stride_zeros
+        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
+
+        zeros = (zeros >> zeros_shifter[None, :]) & maxq
+        zeros = (zeros + 1) & maxq  # eventually avoid overflow
+
+        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
+        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
+
+        # Now we need to unpack b (which is N-bit values) into 32-bit values
+        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
+        b = (b - zeros) * scales  # Scale and shift
+
+        accumulator += tl.dot(a, b)
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
+        g_ptrs += BLOCK_SIZE_K
+
+    c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
+    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
+    """Launch `matmul_248_kernel`; returns a float16 tensor of shape (input.shape[0], qweight.shape[1])."""
+    rows, inner = input.shape[0], input.shape[1]
+    cols = qweight.shape[1]
+
+    def launch_grid(meta):
+        # 1-D launch: one program per (BLOCK_SIZE_M, BLOCK_SIZE_N) output tile
+        tiles_m = triton.cdiv(rows, meta["BLOCK_SIZE_M"])
+        tiles_n = triton.cdiv(cols, meta["BLOCK_SIZE_N"])
+        return (tiles_m * tiles_n,)
+
+    with torch.cuda.device(input.device):
+        result = torch.empty((rows, cols), device=input.device, dtype=torch.float16)
+        matmul_248_kernel[launch_grid](
+            input,
+            qweight,
+            result,
+            scales,
+            qzeros,
+            g_idx,
+            rows,
+            cols,
+            inner,
+            bits,
+            maxq,
+            input.stride(0), input.stride(1),
+            qweight.stride(0), qweight.stride(1),
+            result.stride(0), result.stride(1),
+            scales.stride(0), qzeros.stride(0),
+        )
+        return result
+
+
+class QuantLinearFunction(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float16)
+    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
+        output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
+        return output
+
+
+class QuantLinear(nn.Module):  # linear layer over int32-packed 2/4/8-bit weights; forward goes through QuantLinearFunction
+    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):  # wraps already-packed tensors as module buffers
+        super().__init__()
+        self.register_buffer("qweight", qweight)
+        self.register_buffer("qzeros", qzeros)
+        self.register_buffer("scales", scales)
+        self.register_buffer("g_idx", g_idx)
+        if bias is not None:
+            self.register_buffer("bias", bias)
+        else:
+            self.bias = None
+        if bits not in [2, 4, 8]:  # note: validated after the buffers above were registered
+            raise NotImplementedError("Only 2,4,8 bits are supported.")
+        self.bits = bits
+        self.maxq = 2**self.bits - 1  # largest unsigned value representable in `bits` bits
+        self.groupsize = groupsize
+
+        self.outfeatures = qweight.shape[1]
+        self.infeatures = qweight.shape[0] * 32 // bits  # each int32 row of qweight holds 32 // bits packed values
+
+    @classmethod
+    def new(cls, bits, groupsize, infeatures, outfeatures, bias):  # build a layer with zero-filled buffers; `bias` is used as a truthy flag
+        if bits not in [2, 4, 8]:
+            raise NotImplementedError("Only 2,4,8 bits are supported.")
+
+        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
+        qzeros = torch.zeros(
+            (math.ceil(infeatures / groupsize), outfeatures // 32 * bits),  # one row per group, columns packed like qweight rows
+            dtype=torch.int32,
+        )
+        scales = torch.zeros(
+            (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
+        )
+        g_idx = torch.tensor(
+            [i // groupsize for i in range(infeatures)], dtype=torch.int32  # input feature i belongs to group i // groupsize
+        )
+        if bias:
+            bias = torch.zeros((outfeatures), dtype=torch.float16)
+        else:
+            bias = None
+        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
+
+    def pack(self, linear, scales, zeros, g_idx=None):  # quantize linear.weight with scales/zeros and store packed qweight/qzeros on self
+        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx  # keep the existing g_idx when none is supplied
+
+        scales = scales.t().contiguous()
+        zeros = zeros.t().contiguous()
+        scale_zeros = zeros * scales
+        self.scales = scales.clone().half()
+        if linear.bias is not None:
+            self.bias = linear.bias.clone().half()
+
+        intweight = []
+        for idx in range(self.infeatures):  # one input feature at a time, using that feature's group parameters
+            intweight.append(
+                torch.round(
+                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
+                    / self.scales[self.g_idx[idx]]
+                ).to(torch.int)[:, None]
+            )
+        intweight = torch.cat(intweight, dim=1)
+        intweight = intweight.t().contiguous()
+        intweight = intweight.numpy().astype(np.uint32)  # Tensor.numpy() requires a CPU tensor, so packing runs on the host
+        qweight = np.zeros(
+            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
+        )
+        i = 0
+        row = 0
+        while row < qweight.shape[0]:  # fill one packed row per iteration from 32 // bits consecutive intweight rows
+            if self.bits in [2, 4, 8]:
+                for j in range(i, i + (32 // self.bits)):
+                    qweight[row] |= intweight[j] << (self.bits * (j - i))  # value j lands at bit offset bits * (j - i)
+                i += 32 // self.bits
+                row += 1
+            else:
+                raise NotImplementedError("Only 2,4,8 bits are supported.")
+
+        qweight = qweight.astype(np.int32)
+        self.qweight = torch.from_numpy(qweight)
+
+        zeros -= 1  # in-place: zero points are stored offset by one
+        zeros = zeros.numpy().astype(np.uint32)
+        qzeros = np.zeros(
+            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
+        )
+        i = 0
+        col = 0
+        while col < qzeros.shape[1]:  # same packing scheme as above, applied along columns
+            if self.bits in [2, 4, 8]:
+                for j in range(i, i + (32 // self.bits)):
+                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
+                i += 32 // self.bits
+                col += 1
+            else:
+                raise NotImplementedError("Only 2,4,8 bits are supported.")
+
+        qzeros = qzeros.astype(np.int32)
+        self.qzeros = torch.from_numpy(qzeros)
+
+    def forward(self, x):  # returns a tensor shaped like x with the last dim replaced by outfeatures
+        out_shape = x.shape[:-1] + (self.outfeatures,)
+        out = QuantLinearFunction.apply(
+            x.reshape(-1, x.shape[-1]),  # collapse all leading dims into one row dimension
+            self.qweight,
+            self.scales,
+            self.qzeros,
+            self.g_idx,
+            self.bits,
+            self.maxq,
+        )
+        out = out + self.bias if self.bias is not None else out
+        return out.reshape(out_shape)
diff --git a/server/text_generation_server/utils/gptq/quantize.py b/server/text_generation_server/layers/gptq/quantize.py
similarity index 99%
rename from server/text_generation_server/utils/gptq/quantize.py
rename to server/text_generation_server/layers/gptq/quantize.py
index 9547d534..8d029817 100644
--- a/server/text_generation_server/utils/gptq/quantize.py
+++ b/server/text_generation_server/layers/gptq/quantize.py
@@ -12,7 +12,7 @@ from huggingface_hub import HfApi
 from accelerate import init_empty_weights
 from text_generation_server.utils import initialize_torch_distributed, Weights
 from text_generation_server.utils.hub import weight_files
-from text_generation_server.utils.gptq.quant_linear import QuantLinear
+from text_generation_server.layers.gptq.quant_linear import QuantLinear
 from loguru import logger
 from typing import Optional
 
@@ -578,7 +578,9 @@ def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code):
     return trainloader, valenc
 
 
-def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False):
+def get_loaders(
+    name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False
+):
     if "wikitext2" in name:
         return get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code)
     if "ptb" in name:
@@ -927,7 +929,7 @@ def quantize(
         seed=seed,
         model_id=model_id,
         seqlen=model.seqlen,
-        trust_remote_code=trust_remote_code
+        trust_remote_code=trust_remote_code,
     )
 
     tick = time.time()
diff --git a/server/text_generation_server/layers/layernorm.py b/server/text_generation_server/layers/layernorm.py
new file mode 100644
index 00000000..ce5289f9
--- /dev/null
+++ b/server/text_generation_server/layers/layernorm.py
@@ -0,0 +1,184 @@
+import torch
+from torch import nn
+from accelerate import init_empty_weights
+from text_generation_server.utils.import_utils import (
+    SYSTEM,
+)
+
+
+# Monkey patching
+@classmethod
+def load_layer_norm(cls, prefix, weights, eps):  # build `cls` from the `{prefix}.weight` and `{prefix}.bias` tensors of `weights`
+    weight = weights.get_tensor(f"{prefix}.weight")
+    bias = weights.get_tensor(f"{prefix}.bias")
+    with init_empty_weights():  # construct the module under accelerate's init_empty_weights context
+        ln = cls(weight.shape, eps=eps)
+
+    ln.weight = torch.nn.Parameter(weight)  # replace the constructed parameters with the loaded tensors
+    ln.bias = torch.nn.Parameter(bias)
+    return ln
+
+
+@classmethod
+def load_layer_norm_no_bias(cls, prefix, weights, eps):  # like load_layer_norm, but reads only `{prefix}.weight`
+    weight = weights.get_tensor(f"{prefix}.weight")
+    with init_empty_weights():  # construct the module under accelerate's init_empty_weights context
+        ln = cls(weight.shape, eps=eps)
+
+    ln.weight = torch.nn.Parameter(weight)
+    ln.bias = None  # no bias tensor is read; the module is left without one
+    return ln
+
+
+torch.nn.LayerNorm.load = load_layer_norm  # attaches a `load` classmethod to torch.nn.LayerNorm itself (inherited by subclasses)
+torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias  # variant that sets bias to None instead of loading it
+
+if SYSTEM == "cuda":  # FastLayerNorm is defined per backend; no definition exists for any other SYSTEM value
+    import dropout_layer_norm
+
+    class FastLayerNorm(nn.LayerNorm):
+        def forward(self, hidden_states, residual=None):  # returns (normed_hidden_states, residual)
+            if hidden_states.shape[-1] > 8192:  # wide inputs skip the fused kernel and use nn.LayerNorm.forward
+                if residual is not None:
+                    hidden_states += residual  # in-place add: the caller's hidden_states tensor is modified
+                residual = hidden_states
+
+                return super(FastLayerNorm, self).forward(hidden_states), residual
+            else:
+                (
+                    normed_hidden_states,
+                    residual,
+                    *rest,
+                ) = dropout_layer_norm.dropout_add_ln_fwd(
+                    hidden_states,
+                    residual,
+                    self.weight,
+                    self.bias,
+                    None,
+                    None,
+                    None,
+                    None,
+                    0.0,
+                    self.eps,
+                    1.0,
+                    0,
+                    None,
+                    False,
+                    False,  # this slot is True in FastRMSNorm, where it activates RMSNorm
+                )
+                if residual is None:  # kernel returned no residual: fall back to the input tensor
+                    residual = hidden_states
+
+                return normed_hidden_states, residual
+
+elif SYSTEM == "rocm":
+    from vllm._C import ops  # `ops` is also used by the rocm branch of FastRMSNorm.forward
+
+    class FastLayerNorm(nn.LayerNorm):
+        def forward(self, hidden_states, residual=None):  # returns (normed_hidden_states, residual)
+            if residual is not None:
+                hidden_states += residual  # in-place add: the caller's hidden_states tensor is modified
+            residual = hidden_states
+
+            return super().forward(hidden_states), residual
+
+elif SYSTEM == "ipex":
+    import intel_extension_for_pytorch as ipex  # `ipex` is also used by the ipex branch of FastRMSNorm.forward
+
+    class FastLayerNorm(nn.LayerNorm):
+        def forward(self, hidden_states, residual=None):  # returns (normed output, residual or hidden_states)
+            out = ipex.llm.functional.add_layer_norm(
+                residual,
+                hidden_states,
+                self.weight,
+                self.bias,
+                self.eps,
+                residual is not None,  # NOTE(review): presumably asks ipex to write the sum back into `residual` - confirm
+            )
+            return out, residual if residual is not None else hidden_states
+
+
+class FastRMSNorm(nn.Module):  # RMS normalisation with an optional fused residual add; implementation chosen by SYSTEM
+    def __init__(self, weight: torch.Tensor, eps: float):
+        super().__init__()
+
+        self.weight = nn.Parameter(weight)
+        self.variance_epsilon = eps
+
+    @classmethod
+    def load(cls, prefix, weights, eps=1e-6):  # build from the `{prefix}.weight` tensor of `weights`
+        weight = weights.get_tensor(f"{prefix}.weight")
+        return cls(weight, eps)
+
+    def forward(self, hidden_states, residual=None):  # returns (normed_hidden_states, residual)
+        if SYSTEM == "ipex":  # checked first, so ipex never reaches the > 8192 fallback below
+            out = ipex.llm.functional.add_rms_norm(
+                residual,
+                hidden_states,
+                self.weight,
+                None,
+                self.variance_epsilon,
+                residual is not None,  # NOTE(review): presumably asks ipex to write the sum back into `residual` - confirm
+            )
+            return out, residual if residual is not None else hidden_states
+        elif hidden_states.shape[-1] > 8192:  # wide inputs use this pure-PyTorch path on every non-ipex system
+            if residual is not None:
+                hidden_states += residual  # in-place add: the caller's hidden_states tensor is modified
+            residual = hidden_states
+
+            hidden_states = hidden_states.to(torch.float32)  # statistics are computed in float32
+            variance = hidden_states.pow(2).mean(-1, keepdim=True)
+            hidden_states = hidden_states * torch.rsqrt(
+                variance + self.variance_epsilon
+            )
+
+            # convert into half-precision if necessary
+            if self.weight.dtype in [torch.float16, torch.bfloat16]:
+                hidden_states = hidden_states.to(self.weight.dtype)
+
+            return self.weight * hidden_states, residual
+        elif SYSTEM == "cuda":
+            # faster post attention rms norm
+            (
+                normed_hidden_states,
+                res,
+                *rest,
+            ) = dropout_layer_norm.dropout_add_ln_fwd(
+                hidden_states,
+                residual,
+                self.weight,
+                None,
+                None,
+                None,
+                None,
+                None,
+                0.0,
+                self.variance_epsilon,
+                1.0,
+                0,
+                None,
+                False,
+                True,  # Activate RMSNorm
+            )
+            if res is None:  # kernel returned no residual: fall back to the input tensor
+                res = hidden_states
+
+            return normed_hidden_states, res
+        elif SYSTEM == "rocm":
+            # We use VLLM RMSNorm kernel that can be compiled for RoCm, instead of Flash Attention ones that can not.
+            if residual is not None:
+                hidden_states += residual  # in-place add: the caller's hidden_states tensor is modified
+            residual = hidden_states
+
+            out = torch.empty_like(hidden_states)  # destination buffer handed to ops.rms_norm
+            ops.rms_norm(
+                out,
+                hidden_states,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+            return out, residual
+        else:
+            raise ValueError(
+                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+            )
diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py
new file mode 100644
index 00000000..e94e5465
--- /dev/null
+++ b/server/text_generation_server/layers/linear.py
@@ -0,0 +1,256 @@
+from typing import Optional
+import torch
+from torch.nn import functional as F
+from text_generation_server.utils.import_utils import SYSTEM
+
+if SYSTEM == "rocm":
+    try:
+        from vllm import _custom_C
+    except Exception as e:
+        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
+
+
+class FastLinear(torch.nn.Module):  # thin module around F.linear with frozen (requires_grad=False) parameters
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        super().__init__()
+        self.weight = torch.nn.Parameter(weight, requires_grad=False)
+        if bias is not None:
+            self.bias = torch.nn.Parameter(bias, requires_grad=False)
+        else:
+            self.bias = None
+
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):  # `config` is accepted but not used in this method
+        weight = weights.get_tensor(f"{prefix}.weight")
+        if bias:  # `bias` arrives as a flag and is rebound to the loaded tensor (or None)
+            bias = weights.get_tensor(f"{prefix}.bias")
+        else:
+            bias = None
+        return cls(weight, bias)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.linear(input, self.weight, self.bias)
+
+
+class FastLinearROCm(torch.nn.Module):  # linear layer that uses vllm's LLMM1 kernel for single-row inputs on ROCm, F.linear otherwise
+    def __init__(
+        self,
+        weight,
+        bias,
+    ) -> None:
+        super().__init__()
+        self.weight = torch.nn.Parameter(weight)
+        if bias is not None:
+            self.bias = torch.nn.Parameter(bias)
+        else:
+            self.bias = None
+
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):  # `config` is accepted but not used in this method
+        weight = weights.get_tensor(f"{prefix}.weight")
+        if bias:  # `bias` arrives as a flag and is rebound to the loaded tensor (or None)
+            bias = weights.get_tensor(f"{prefix}.bias")
+        else:
+            bias = None
+        return cls(weight, bias)
+
+    def forward(self, inp: torch.Tensor) -> torch.Tensor:  # output has inp's shape with the last dim replaced by weight.shape[0]
+        weight = self.weight
+        bias = self.bias
+
+        if SYSTEM == "rocm" and inp.numel() // inp.shape[-1] == 1:  # fast path only when inp holds exactly one row
+            batched = False
+            inp_shape = inp.shape  # original shape, needed to restore the leading dims afterwards
+
+            if inp.dim() != 2:  # flatten any non-2-D single-row input (3-D as before, and also 1-D) to (1, k)
+                inp = inp.view(-1, inp_shape[-1])
+                batched = True
+
+            m, k = weight.shape[0], inp.shape[-1]  # read k from the flattened input, not from a leading dim of the original
+            out = torch.empty(
+                inp.shape[0], weight.shape[0], dtype=inp.dtype, device=inp.device
+            )
+            if (k == 8192 and (m == 1280 or m == 7168)) or (k == 3584 and m == 8192):
+                _custom_C.LLMM1(weight, inp, out, 8)
+            elif k <= 8192 and k % 8 == 0 and m % 4 == 0:
+                _custom_C.LLMM1(weight, inp, out, 4)
+            else:
+                out = F.linear(inp, weight)  # shapes the custom kernel is not used for
+
+            if batched:
+                out = out.view(*inp_shape[:-1], out.shape[-1])  # view() is not in-place: the result must be rebound
+
+            if bias is not None:
+                out = out + bias  # bias is added after the matmul on this path
+            return out
+        return F.linear(inp, self.weight, self.bias)
+
+
+def get_linear(weight, bias, quantize):  # build the linear implementation matching `quantize` (None means unquantized)
+    if quantize is None:
+        if SYSTEM == "rocm":
+            linear = FastLinearROCm(weight, bias)
+        else:
+            linear = FastLinear(weight, bias)
+    elif quantize == "eetq":
+        try:
+            from text_generation_server.layers.eetq import EETQLinear
+
+            linear = EETQLinear(weight, bias)
+        except ImportError:
+            raise ImportError(
+                "Please install EETQ from https://github.com/NetEase-FuXi/EETQ"
+            )
+    elif quantize == "fp8":
+        from text_generation_server.layers.fp8 import Fp8Linear
+
+        linear = Fp8Linear(weight, bias)
+    elif quantize == "bitsandbytes":
+        try:
+            from text_generation_server.layers.bnb import (
+                warn_deprecate_bnb,
+                Linear8bitLt,
+            )
+        except ImportError:
+            raise NotImplementedError(
+                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
+            )
+        warn_deprecate_bnb()
+        linear = Linear8bitLt(
+            weight,
+            bias,
+            has_fp16_weights=False,
+            threshold=6.0,
+        )
+        if bias is not None:
+            linear.bias = torch.nn.Parameter(bias)  # this module imports `torch` only; a bare `nn` name is undefined here
+    elif quantize == "bitsandbytes-fp4":
+        try:
+            from text_generation_server.layers.bnb import Linear4bit
+        except ImportError:
+            raise NotImplementedError(
+                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
+            )
+        linear = Linear4bit(
+            weight,
+            bias,
+            quant_type="fp4",
+        )
+    elif quantize == "bitsandbytes-nf4":
+        try:
+            from text_generation_server.layers.bnb import Linear4bit
+        except ImportError:
+            raise NotImplementedError(
+                f"Bitsandbytes is missing install it with `pip install bitsandbytes`."
+            )
+        linear = Linear4bit(
+            weight,
+            bias,
+            quant_type="nf4",
+        )
+    elif quantize == "exl2":
+        from text_generation_server.layers.exl2 import Exl2Weight
+
+        if not isinstance(weight, Exl2Weight):
+            raise NotImplementedError(
+                f"The passed weight is not `exl2` compatible, loader needs to be updated."
+            )
+
+        from text_generation_server.layers.gptq import ExllamaQuantLinear
+
+        linear = ExllamaQuantLinear(weight, bias)
+
+    elif quantize == "gptq":
+        from text_generation_server.layers.gptq import GPTQWeight
+        from text_generation_server.layers.marlin import (
+            GPTQMarlinLinear,
+            GPTQMarlinWeight,
+        )
+
+        if isinstance(weight, GPTQMarlinWeight):  # the weight's type selects the kernel family
+            linear = GPTQMarlinLinear(
+                weight=weight,
+                bias=bias,
+            )
+        elif isinstance(weight, GPTQWeight):
+            if weight.use_exllama:
+                try:
+                    from text_generation_server.layers.gptq import (
+                        ExllamaQuantLinear,
+                    )
+                except ImportError:
+                    raise NotImplementedError(
+                        f"Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`"
+                    )
+
+                linear = ExllamaQuantLinear(weight, bias)
+            else:
+                from text_generation_server.layers.gptq.quant_linear import QuantLinear
+
+                linear = QuantLinear(
+                    weight.qweight,
+                    weight.qzeros,
+                    weight.scales,
+                    weight.g_idx,
+                    bias,
+                    weight.bits,
+                    weight.groupsize,
+                )
+        else:
+            raise NotImplementedError(
+                f"The passed weight is not `gptq` compatible, loader needs to be updated."
+            )
+
+    elif quantize == "awq":
+        from text_generation_server.layers.gptq import GPTQWeight
+
+        if not isinstance(weight, GPTQWeight):  # AWQ weights are carried in the GPTQWeight container
+            raise NotImplementedError(
+                f"The passed weight is not `awq` compatible, loader needs to be updated."
+            )
+        if SYSTEM == "rocm":
+            raise NotImplementedError(
+                "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
+                "to use Exllama/GPTQ kernels for AWQ inference."
+            )
+        try:
+            from text_generation_server.layers.awq.quantize.qmodule import WQLinear
+
+            linear = WQLinear(
+                w_bit=weight.bits,
+                group_size=weight.groupsize,
+                qweight=weight.qweight,
+                qzeros=weight.qzeros,
+                scales=weight.scales,
+                bias=bias,
+            )
+        except ImportError:
+            raise NotImplementedError(
+                "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `--quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
+            )
+    elif quantize == "marlin":
+        from text_generation_server.layers.marlin import (
+            GPTQMarlin24Linear,
+            GPTQMarlin24Weight,
+            MarlinLinear,
+            MarlinWeight,
+        )
+
+        if isinstance(weight, GPTQMarlin24Weight):
+            linear = GPTQMarlin24Linear(
+                weight=weight,
+                bias=bias,
+            )
+        elif isinstance(weight, MarlinWeight):
+            linear = MarlinLinear(weight=weight, bias=bias)
+        else:
+            raise NotImplementedError(
+                f"The passed weight is not `marlin` compatible, loader needs to be updated."
+            )
+    else:
+        raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
+    return linear
diff --git a/server/text_generation_server/layers/lora.py b/server/text_generation_server/layers/lora.py
new file mode 100644
index 00000000..0bb6db41
--- /dev/null
+++ b/server/text_generation_server/layers/lora.py
@@ -0,0 +1,286 @@
+import math
+import os
+from typing import TYPE_CHECKING, Optional, Tuple, List
+
+import torch
+import torch.distributed
+from accelerate import init_empty_weights
+from torch import nn
+from torch.nn import functional as F
+from torch.distributed import ProcessGroup
+
+from text_generation_server.utils.sgmv import (
+    add_lora_a_bgmv,
+    add_lora_b_bgmv,
+    has_sgmv,
+    lora_a_sgmv_cutlass,
+    lora_b_sgmv_cutlass,
+    orient_for_rank,
+)
+
+if TYPE_CHECKING:
+    from text_generation_server.adapters import AdapterBatchData
+    from text_generation_server.adapters.lora import BatchLoraWeights
+
+
+class LoraLinear(nn.Module):  # base class adding LoRA deltas to a wrapped layer's output; subclasses implement collect_lora_a
+    def __init__(
+        self, base_layer: nn.Module, layer_id: int, process_group: ProcessGroup
+    ):
+        super().__init__()
+        self.base_layer = base_layer
+        self.layer_id = layer_id
+        self.process_group = process_group
+
+    def forward_layer_type(  # adds the LoRA contribution for `layer_type` into result[:, start_idx:end_idx] and returns result
+        self,
+        result: torch.Tensor,
+        input: torch.Tensor,
+        adapter_data: "AdapterBatchData",
+        layer_type: str,
+        start_idx: int,
+        end_idx: int,
+    ) -> torch.Tensor:
+        if adapter_data is None:  # no adapters in this batch: leave the base output untouched
+            return result
+        data = adapter_data.data.get(layer_type)
+        data: Optional["BatchLoraWeights"] = (
+            data.get("lora") if data is not None else None
+        )
+
+        if has_sgmv() and data is not None and data.can_vectorize(self.process_group):
+            # In tensor-parallel configurations, each GPU processes a specific segment of the output.
+            # The 'result' tensor represents the full output, which can vary in size based on
+            # the layer type (e.g., attention vs. feed-forward layers). We define the current
+            # segment using start_idx and end_idx. If the segment size doesn't match this GPU's
+            # slice of 'result', we create a zero tensor of the correct size for LoRA computation.
+            # This approach ensures accurate LoRA application across various layer sizes and
+            # configurations, adapting to different model architectures and parallelization strategies.
+            #
+            # Example scenarios where this is necessary:
+            # 1. The adapter's size doesn't evenly divide across GPUs.
+            # 2. We're processing the last segment which might be smaller.
+            # 3. Different projection layers (q, k, v) have different sizes.
+            if end_idx - start_idx != result.shape[1]:
+                proj = torch.zeros_like(result[:, start_idx:end_idx])  # scratch buffer, added back into result below
+            else:
+                proj = result  # segment spans the whole width: the kernels are handed result directly
+
+            for r, rank_segments in data.rank_data.items():  # rank_data is keyed by r, which sizes the BGMV buffer below
+                lora_a_ptr = rank_segments.lora_a_ptr
+                lora_b_ptr = rank_segments.lora_b_ptr
+
+                if lora_a_ptr is None or lora_b_ptr is None:
+                    raise ValueError("LoRA data is missing")
+
+                if data.use_sgmv:
+                    # Use SGMV for prefill
+                    v = lora_a_sgmv_cutlass(
+                        input,
+                        rank_segments.tmp_shrink,
+                        lora_a_ptr,
+                        rank_segments.segment_starts,
+                        rank_segments.segment_ends,
+                        self.layer_id,
+                        r,
+                    )
+
+                    if self.process_group.size() > 1:  # cross-rank step between the A and B products
+                        v = self.collect_lora_a(v)
+
+                    lora_b_sgmv_cutlass(
+                        proj,
+                        v,
+                        rank_segments.tmp_expand,
+                        lora_b_ptr,
+                        rank_segments.segment_starts,
+                        rank_segments.segment_ends,
+                        self.layer_id,
+                    )
+                else:
+                    # Use BGMV for decode
+                    v = torch.zeros(
+                        (input.size(0), r), dtype=input.dtype, device=input.device
+                    )
+                    # TODO: error with [-1, 0], but not [0, -1]
+                    add_lora_a_bgmv(
+                        v,
+                        input,
+                        lora_a_ptr,
+                        rank_segments.indices,
+                        self.layer_id,
+                    )
+
+                    if self.process_group.size() > 1:  # cross-rank step between the A and B products
+                        v = self.collect_lora_a(v)
+
+                    add_lora_b_bgmv(
+                        proj,
+                        v,
+                        lora_b_ptr,
+                        rank_segments.indices,
+                        self.layer_id,
+                    )
+
+            if end_idx - start_idx != result.shape[1]:  # scratch buffer was used: fold it into the target slice
+                result[:, start_idx:end_idx] += proj
+        else:
+            for adapter_index in adapter_data.meta.adapter_set:  # fallback path: one dense matmul pair per adapter
+                if data is not None and data.has_adapter(adapter_index):
+                    adapter_mask = (
+                        (adapter_data.meta.adapter_indices == adapter_index)
+                        .to(input.dtype)
+                        .view(-1, 1)
+                    )
+                    layer_result = self.forward_lora(
+                        input, data, adapter_index, adapter_mask
+                    )
+                    result[:, start_idx:end_idx] += layer_result
+
+        return result
+
+    def forward_lora(  # dense LoRA for one adapter: ((input @ A) @ B) * adapter_mask
+        self,
+        input: torch.Tensor,
+        data: "BatchLoraWeights",
+        adapter_index: int,
+        adapter_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        lora_a = data.lora_a[adapter_index][self.layer_id, :, :]  # select this layer's slice of the stacked weights
+        lora_b = data.lora_b[adapter_index][self.layer_id, :, :]
+
+        lora_a = orient_for_rank(lora_a, lora_b.size(0))
+
+        a_out = input @ lora_a
+        if self.process_group.size() > 1:
+            a_out = self.collect_lora_a(a_out)
+
+        result = (a_out @ lora_b) * adapter_mask  # the mask multiplies each row by its 0/1 entry for this adapter
+        return result
+
+    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:  # hook for the cross-rank combination of input @ A
+        raise NotImplementedError("Implemented in subclasses")
+
+
+class TensorParallelMultiAdapterLinear(LoraLinear):  # LoRA wrapper for a layer whose output concatenates several named projections
+    def __init__(
+        self,
+        base_layer: nn.Module,
+        layer_id: int,
+        layer_names: List[str],
+        sizes: List[int],
+        process_group: ProcessGroup,
+    ):
+        super().__init__(base_layer, layer_id, process_group)
+        self.layer_names = layer_names
+        self.sizes = sizes
+
+    @classmethod
+    def load(
+        cls,
+        base_layer: nn.Module,
+        layer_id: int,
+        layer_names: List[str],
+        sizes: List[int],
+        process_group: ProcessGroup,
+    ):
+        return cls(  # construct via cls so a subclass calling load() gets an instance of itself
+            base_layer, layer_id, layer_names, sizes, process_group
+        )
+
+    def forward(
+        self, input: torch.Tensor, adapter_data: "AdapterBatchData"
+    ) -> torch.Tensor:
+        result = self.base_layer(input)
+
+        # noop if no layer names are provided (e.g. for models without adapters)
+        if self.layer_names is None:
+            return result
+
+        # handle models like Bloom that have inputs of shape
+        # (batch_size, sequence_length, hidden_size)
+        # we need to reshape them to (batch_size * sequence_length, hidden_size)
+        # for the LoRA computation, then reshape back
+        prev_shape = result.shape
+        is_3d = len(input.shape) >= 3
+        if is_3d:
+            input = input.reshape(-1, input.shape[-1])
+            result = result.reshape(-1, result.shape[-1])
+
+        offset = 0  # running sum of full projection sizes; divided by the group size to get this rank's indices
+        for i, layer_name in enumerate(self.layer_names):
+            start_idx = offset // self.process_group.size()
+            # The 'sizes' parameter is essential in tensor-parallel setups for handling multiple
+            # projection layers (q_proj, k_proj, v_proj) by defining their output dimensions. It
+            # ensures correct slicing of the result tensor, accommodating variations like grouped-query
+            # attention where k_proj and v_proj differ from q_proj. This allows precise application of
+            # LoRA adapters to each sub-component of the multi-head attention mechanism, managing the
+            # different projection sizes across layers and model architectures.
+            if self.sizes is not None:
+                offset += self.sizes[i]
+                end_idx = offset // self.process_group.size()
+            else:
+                end_idx = result.shape[1]  # without sizes, each layer name spans up to the full width
+
+            result = self.forward_layer_type(
+                result, input, adapter_data, layer_name, start_idx, end_idx
+            )
+
+        if is_3d:
+            result = result.reshape(prev_shape)
+
+        return result
+
+    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
+        # Tensor parallel implementation of X @ A@B, where A and B are sharded column-wise.
+        # We use an all-gather between X@A and (X@A)@B to ensure alignment across ranks.
+        #
+        # TODO(travis): this is not very efficient as we do an all-gather for every adapter,
+        #   instead we could pre-allocate a (B, a, r) tensor for all adapters with the same
+        #   rank, compute `a_out` on each, and then slice them into the buffer as shown here:
+        #   https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609
+        gathered_tensors = [
+            torch.empty_like(a_out) for _ in range(self.process_group.size())
+        ]
+        torch.distributed.all_gather(gathered_tensors, a_out, group=self.process_group)  # gather over the group that sized the list
+        return torch.cat(gathered_tensors, dim=1)
+
+
+class TensorParallelAdapterRowLinear(LoraLinear):  # LoRA wrapper for a single named layer; each rank updates its own slice of the output
+    def __init__(self, base_layer, layer_id, layer_name, process_group):
+        super().__init__(base_layer, layer_id, process_group)
+        self.layer_name = layer_name
+
+    @classmethod
+    def load(cls, base_layer, layer_id, layer_name, process_group):
+        return cls(base_layer, layer_id, layer_name, process_group)
+
+    def forward(
+        self, input: torch.Tensor, adapter_data: "AdapterBatchData"
+    ) -> torch.Tensor:
+        result = self.base_layer(input)
+
+        if self.layer_name is None:  # no adapter target configured: return the base output unchanged
+            return result
+
+        # Fused all-gather + all-reduce from S-LoRA paper: https://arxiv.org/abs/2311.03285
+        stride = result.shape[-1] // self.process_group.size()  # width of the output slice owned by each rank
+        start_idx = self.process_group.rank() * stride
+        end_idx = (self.process_group.rank() + 1) * stride
+
+        self.forward_layer_type(  # return value ignored: relies on forward_layer_type updating `result` in place
+            result, input, adapter_data, self.layer_name, start_idx, end_idx
+        )
+
+        return result
+
+    def collect_lora_a(self, a_out: torch.Tensor) -> torch.Tensor:
+        # Tensor parallel implementation of X @ A@B, where A and B are sharded row-wise.
+        # We use an all-reduce between X@A and (X@A)@B to ensure alignment across ranks.
+        #
+        # TODO(travis): this is not very efficient as we do an all-reduce for every adapter,
+        #   instead we could pre-allocate a (B, a, r) tensor for all adapters with the same
+        #   rank, compute `a_out` on each, and then slice them into the buffer as shown here:
+        #   https://discuss.pytorch.org/t/concatenate-tensors-without-memory-copying/34609
+        torch.distributed.all_reduce(a_out, group=self.process_group)  # in-place reduction: a_out itself is modified and returned
+        return a_out
diff --git a/server/text_generation_server/layers/marlin.py b/server/text_generation_server/layers/marlin.py
new file mode 100644
index 00000000..a1af67a3
--- /dev/null
+++ b/server/text_generation_server/layers/marlin.py
@@ -0,0 +1,410 @@
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from text_generation_server.layers.gptq import GPTQParams
+from text_generation_server.utils.import_utils import SYSTEM
+
+# The Marlin kernels are an optional dependency: `marlin_kernels` is left as
+# None when the package cannot be imported, and callers check for that.
+try:
+    import marlin_kernels
+except ImportError:
+    marlin_kernels = None
+
+# `has_sm_8_0` is True only when the current CUDA device reports a compute
+# capability with major version >= 8. Any failure of the query is treated as
+# "capability not available" instead of failing at import time.
+try:
+    major, _minor = torch.cuda.get_device_capability()
+    has_sm_8_0 = major >= 8
+except Exception:
+    has_sm_8_0 = False
+
+
+# Bit widths and group sizes accepted by `can_use_gptq_marlin` and
+# `repack_gptq_for_marlin`.
+GPTQ_MARLIN_BITS = [4, 8]
+GPTQ_MARLIN_GROUP_SIZES = [-1, 32, 64, 128]
+# Factor used to recover `in_features` from the first dimension of a packed
+# Marlin weight tensor.
+MARLIN_TILE_SIZE = 16
+
+
+def can_use_gptq_marlin(gptq_params: GPTQParams, quantize: str) -> bool:
+    """
+    Tell whether a GPTQ checkpoint can be served with the GPTQ-Marlin kernels.
+
+    All of the following must hold: the system is CUDA, `marlin_kernels` was
+    imported, the device capability check passed (`has_sm_8_0`), both
+    `quantize` and `gptq_params.quant_method` are "gptq", the bit width and
+    group size are in the supported lists, and `gptq_params.sym` is truthy.
+    """
+    # Runtime requirements come first, so the parameters are not inspected on
+    # systems that cannot run the kernels anyway.
+    if not (SYSTEM == "cuda" and marlin_kernels is not None and has_sm_8_0):
+        return False
+    if quantize != "gptq" or gptq_params.quant_method != "gptq":
+        return False
+    if gptq_params.bits not in GPTQ_MARLIN_BITS:
+        return False
+    if gptq_params.groupsize not in GPTQ_MARLIN_GROUP_SIZES:
+        return False
+    # Only symmetric quantization is supported; the flag is returned as-is.
+    return gptq_params.sym
+
+
+def _check_marlin_kernels():
+    """
+    Raise `NotImplementedError` unless the Marlin kernels can be used here.
+
+    The hardware check (CUDA system and `has_sm_8_0`) runs first; the check
+    that `marlin_kernels` was imported runs second.
+    """
+    if SYSTEM != "cuda" or not has_sm_8_0:
+        raise NotImplementedError(
+            "Using quantized Marlin models requires a GPU with CUDA capability 8.0 or later."
+        )
+
+    if marlin_kernels is not None:
+        return
+
+    raise NotImplementedError(
+        "marlin is not installed, install it with: pip install server/marlin"
+    )
+
+
+def _check_valid_shape(in_features: int, out_features: int):
+    """
+    Raise `ValueError` when the weight shape fits neither supported tiling.
+
+    A shape is accepted when `in_features` is a multiple of 128 and
+    `out_features` a multiple of 64, or when `in_features` is a multiple of 64
+    and `out_features` a multiple of 128.
+    """
+    fits_128_by_64 = in_features % 128 == 0 and out_features % 64 == 0
+    fits_64_by_128 = in_features % 64 == 0 and out_features % 128 == 0
+    if fits_128_by_64 or fits_64_by_128:
+        return
+
+    raise ValueError(
+        f"The GPTQ Marlin kernel does not have a valid thread configuration for weight matrix with shape ({out_features}, {in_features})."
+        " The shape elements must be divisible by (128, 64) or (64, 128)."
+    )
+
+
+# https://github.com/IST-DASLab/marlin/blob/2f6d7c10e124b3c5fa29ff8d77d568bd7af3274c/marlin/__init__.py#L40C1-L68C54
+def _get_perms() -> Tuple[List[int], List[int]]:
+    """
+    Build the two index permutations applied to scales by `permute_scales`.
+
+    Returns:
+        A pair `(scale_perm, scale_perm_single)`: a permutation of
+        `range(64)` and a permutation of `range(32)`.
+    """
+    # Column-major walk over an 8x8 grid of indices.
+    scale_perm = [row + 8 * col for row in range(8) for col in range(8)]
+    # Four interleaved runs built from a fixed 8-element offset pattern.
+    offsets = [0, 1, 8, 9, 16, 17, 24, 25]
+    scale_perm_single = [2 * block + off for block in range(4) for off in offsets]
+    return scale_perm, scale_perm_single
+
+
+# Computed once at import time and shared by every `permute_scales` call.
+_scale_perm, _scale_perm_single = _get_perms()
+
+
+def permute_scales(scales: torch.Tensor):
+    """
+    Reorder the entries of `scales` with the module-level scale permutations.
+
+    A tensor with a single row uses `_scale_perm_single`; any other row count
+    uses `_scale_perm`. The result is contiguous and keeps the original number
+    of columns.
+    """
+    n_columns = scales.shape[1]
+    perm = _scale_perm_single if scales.shape[0] == 1 else _scale_perm
+    # View the data as rows of len(perm) entries and shuffle within each row.
+    shuffled = scales.reshape((-1, len(perm)))[:, perm]
+    return shuffled.reshape((-1, n_columns)).contiguous()
+
+
+@dataclass
+class GPTQMarlinWeight:
+    """
+    GPTQ weights after repacking for the GPTQ-Marlin kernels.
+
+    Attributes:
+        qweight (torch.Tensor): repacked quantized weights, must be int32.
+        scales (torch.Tensor): quantization scales, must be float16.
+        g_idx (torch.Tensor): group indices, must be int32.
+        perm (torch.Tensor): permutation indices, must be int32.
+        bits (int): quantized weight size.
+        is_full_k (bool): flag passed through to the GEMM kernel.
+    """
+
+    qweight: torch.Tensor
+    scales: torch.Tensor
+    g_idx: torch.Tensor
+    perm: torch.Tensor
+    bits: int
+    is_full_k: bool
+
+    def __post_init__(self):
+        # Reject tensors with an unexpected dtype at construction time.
+        expected_dtypes = (
+            (self.qweight, torch.int32),
+            (self.scales, torch.float16),
+            (self.g_idx, torch.int32),
+            (self.perm, torch.int32),
+        )
+        for tensor, dtype in expected_dtypes:
+            assert tensor.dtype == dtype
+
+
+def repack_gptq_for_marlin(
+    *,
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    g_idx: torch.Tensor,
+    bits: int,
+    desc_act: bool,
+    groupsize: int,
+    sym: bool,
+    sharded_infeatures: bool,
+) -> GPTQMarlinWeight:
+    """
+    Repack GPTQ tensors into the layout expected by the GPTQ-Marlin kernels.
+
+    Raises:
+        NotImplementedError: the Marlin kernels are unavailable
+            (see `_check_marlin_kernels`).
+        RuntimeError: `bits` or `groupsize` is unsupported, or `sym` is false.
+        ValueError: the input feature count is not divisible by `groupsize`.
+    """
+    _check_marlin_kernels()
+    assert marlin_kernels is not None
+
+    if bits not in GPTQ_MARLIN_BITS:
+        supported_bits = ", ".join(map(str, GPTQ_MARLIN_BITS))
+        raise RuntimeError(
+            f"Repacking {bits}-bit GPTQ weights as Marlin is not supported, must be one of: {supported_bits}"
+        )
+
+    if groupsize not in GPTQ_MARLIN_GROUP_SIZES:
+        supported_sizes = ", ".join(map(str, GPTQ_MARLIN_GROUP_SIZES))
+        raise RuntimeError(
+            f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
+        )
+
+    if not sym:
+        raise RuntimeError(
+            "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
+        )
+
+    # Each int32 of `qweight` packs `32 // bits` quantized values along dim 0.
+    pack_factor = 32 // bits
+    in_features = qweight.shape[0] * pack_factor
+    out_features = qweight.shape[1]
+
+    if in_features % groupsize != 0:
+        raise ValueError(
+            f"Number of input features ({in_features}) not divisible by group size ({groupsize})"
+        )
+
+    needs_reorder = desc_act and groupsize != -1
+    if needs_reorder:
+        # Sort the group indices and remember the permutation that sorts them.
+        perm = torch.argsort(g_idx).to(torch.int)
+        g_idx = g_idx[perm]
+    else:
+        # Without reordering, both tensors are passed on as empty placeholders.
+        perm = torch.empty(0, dtype=torch.int, device=qweight.device)
+        g_idx = torch.empty(0, dtype=torch.int, device=qweight.device)
+
+    marlin_qweight = marlin_kernels.gptq_marlin_repack(
+        qweight, perm, in_features, out_features, bits
+    )
+    marlin_scales = permute_scales(scales)
+
+    return GPTQMarlinWeight(
+        qweight=marlin_qweight,
+        scales=marlin_scales,
+        g_idx=g_idx,
+        perm=perm,
+        bits=bits,
+        is_full_k=not desc_act or not sharded_infeatures,
+    )
+
+
+class GPTQMarlinLinear(nn.Module):
+    """
+    Linear layer that multiplies its input with repacked GPTQ weights through
+    the GPTQ-Marlin GEMM kernel.
+    """
+
+    def __init__(
+        self,
+        *,
+        weight: GPTQMarlinWeight,
+        bias: Optional[torch.Tensor],
+    ):
+        """
+        Raises:
+            NotImplementedError: the Marlin kernels are unavailable.
+            ValueError: the weight shape has no valid thread configuration
+                (see `_check_valid_shape`).
+        """
+        super().__init__()
+
+        _check_marlin_kernels()
+        assert marlin_kernels is not None
+
+        n_in = weight.qweight.shape[0] * MARLIN_TILE_SIZE
+        n_out = weight.scales.shape[1]
+        _check_valid_shape(in_features=n_in, out_features=n_out)
+
+        self.bits, self.is_full_k = weight.bits, weight.is_full_k
+        self.qweight, self.scales = weight.qweight, weight.scales
+        self.g_idx, self.perm = weight.g_idx, weight.perm
+        # `bias` may be None, in which case forward() skips the addition.
+        self.bias = bias
+
+        # Zero-initialised int buffer handed to the kernel on every call.
+        self.workspace = torch.zeros(
+            n_out // 64 * 16, dtype=torch.int, device=weight.qweight.device
+        )
+
+    def forward(self, A: torch.Tensor) -> torch.Tensor:
+        """Apply the layer to `A`; leading dimensions of `A` are preserved."""
+        assert marlin_kernels is not None
+
+        # The kernel receives a 2D view; the leading dimensions are restored below.
+        flat = A.view(-1, A.shape[-1])
+        n_out = self.scales.shape[1]
+        out = marlin_kernels.gptq_marlin_gemm(
+            flat,
+            self.qweight,
+            self.scales,
+            self.g_idx,
+            self.perm,
+            self.workspace,
+            self.bits,
+            flat.shape[0],
+            n_out,
+            flat.shape[1],
+            self.is_full_k,
+        )
+        out = out.reshape(A.shape[:-1] + (n_out,))
+
+        if self.bias is not None:
+            out += self.bias
+
+        return out
+
+
+# Constants for the GPTQ-Marlin 2:4 sparse layer (`GPTQMarlin24Linear`).
+# Output features must be a multiple of MIN_THREAD_N and input features a
+# multiple of MIN_THREAD_K.
+GPTQ_MARLIN_24_MIN_THREAD_N = 128
+GPTQ_MARLIN_24_MIN_THREAD_K = 128
+# Together with MIN_THREAD_N, determines the size of the kernel workspace buffer.
+GPTQ_MARLIN_24_MAX_PARALLEL = 64
+# Bit widths and group sizes supported for 2:4 sparse weights.
+GPTQ_MARLIN_24_SUPPORTED_NUM_BITS = [4, 8]
+GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128]
+
+
+@dataclass
+class GPTQMarlin24Weight:
+    """
+    Weights for the GPTQ-Marlin 2:4 sparse kernel.
+
+    Attributes:
+        B (torch.Tensor): quantized weights packed into int32.
+        B_meta (torch.Tensor): int16 metadata for 2:4 sparsity.
+        s (torch.Tensor): float16 scales.
+        bits: quantized weight size.
+    """
+
+    B: torch.Tensor
+    B_meta: torch.Tensor
+    s: torch.Tensor
+    bits: int
+
+    def __post_init__(self):
+        # Reject tensors with an unexpected dtype at construction time.
+        expected_dtypes = (
+            (self.B, torch.int32),
+            (self.B_meta, torch.int16),
+            (self.s, torch.float16),
+        )
+        for tensor, dtype in expected_dtypes:
+            assert tensor.dtype == dtype
+
+
+class GPTQMarlin24Linear(nn.Module):
+    """
+    Linear layer that multiplies its input with 2:4 sparse GPTQ weights through
+    the GPTQ-Marlin 2:4 GEMM kernel.
+    """
+
+    def __init__(self, *, weight: GPTQMarlin24Weight, bias: Optional[torch.Tensor]):
+        """
+        Validate the weight layout and allocate the kernel workspace.
+
+        Raises:
+            NotImplementedError: the Marlin kernels are unavailable.
+            RuntimeError: the bit width or the inferred group size is unsupported.
+            AssertionError: the feature counts are not multiples of the
+                minimum thread tile sizes.
+            ValueError: the input feature count is not divisible by the
+                group size.
+        """
+        super().__init__()
+
+        _check_marlin_kernels()
+        assert marlin_kernels is not None
+
+        # Validate against the 2:4-specific list rather than the dense GPTQ-Marlin one.
+        if weight.bits not in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS:
+            supported_bits = ", ".join(
+                str(b) for b in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+            )
+            raise RuntimeError(
+                f"{weight.bits}-bit GPTQ Sparse 2:4 Marlin is not supported, must be one of: {supported_bits}"
+            )
+
+        in_features = weight.B.shape[0] * MARLIN_TILE_SIZE * 2
+        out_features = weight.s.shape[1]
+        # A single row of scales means one group spanning all input features (-1).
+        groupsize = -1 if weight.s.shape[0] == 1 else in_features // weight.s.shape[0]
+
+        if groupsize not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES:
+            supported_sizes = ", ".join(
+                str(b) for b in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
+            )
+            raise RuntimeError(
+                f"Group size {groupsize} is not supported, must be one of: {supported_sizes}"
+            )
+
+        self.bits = weight.bits
+        weights_per_int32 = 32 // self.bits
+
+        assert (
+            out_features % GPTQ_MARLIN_24_MIN_THREAD_N == 0
+        ), f"Number of output features ({out_features}) not divisable by {GPTQ_MARLIN_24_MIN_THREAD_N} threads"
+        assert (
+            out_features % weights_per_int32 == 0
+        ), f"Number of output features ({out_features}) not divisable by weights per int32 ({weights_per_int32})"
+
+        # This check is on the input dimension, so the message reports in_features.
+        assert (
+            in_features % GPTQ_MARLIN_24_MIN_THREAD_K == 0
+        ), f"Number of input features ({in_features}) not divisable by {GPTQ_MARLIN_24_MIN_THREAD_K} threads"
+        if groupsize != -1 and in_features % groupsize != 0:
+            raise ValueError(
+                f"Number of input features ({in_features}) not divisable by group size ({groupsize})"
+            )
+
+        self.B = weight.B
+        self.B_meta = weight.B_meta
+        self.s = weight.s
+        # `bias` may be None, in which case forward() skips the addition.
+        self.bias = bias
+
+        # Zero-initialised int buffer handed to the kernel on every call.
+        self.workspace = torch.zeros(
+            (out_features // GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL,
+            dtype=torch.int,
+            device=weight.B.device,
+        )
+
+    def forward(self, A: torch.Tensor) -> torch.Tensor:
+        """Apply the layer to `A`; leading dimensions of `A` are preserved."""
+        assert marlin_kernels is not None
+
+        # The size arguments must describe the flattened 2D view that is passed
+        # to the kernel, not the original (possibly >2D) input.
+        A_flat = A.view(-1, A.shape[-1])
+        out_features = self.s.shape[1]
+        C = marlin_kernels.gptq_marlin_24_gemm(
+            A_flat,
+            self.B,
+            self.B_meta,
+            self.s,
+            self.workspace,
+            self.bits,
+            A_flat.shape[0],
+            out_features,
+            A_flat.shape[1],
+        )
+
+        C = C.reshape(A.shape[:-1] + (out_features,))
+
+        if self.bias is not None:
+            C += self.bias
+
+        return C
+
+
+@dataclass
+class MarlinWeight:
+    """
+    Weights in Marlin layout.
+
+    Attributes:
+        B (torch.Tensor): int4-quantized weights packed into int32.
+        s (torch.Tensor): float16 scales.
+    """
+
+    B: torch.Tensor
+    s: torch.Tensor
+
+    def __post_init__(self):
+        # Reject tensors with an unexpected dtype at construction time.
+        for tensor, dtype in ((self.B, torch.int32), (self.s, torch.float16)):
+            assert tensor.dtype == dtype
+
+
+class MarlinLinear(nn.Module):
+    """
+    Linear layer that multiplies its input with Marlin-format weights through
+    the Marlin GEMM kernel.
+    """
+
+    def __init__(self, *, weight: MarlinWeight, bias: Optional[torch.Tensor]):
+        """
+        Validate the weight layout and allocate the kernel workspace.
+
+        Raises:
+            NotImplementedError: the Marlin kernels are unavailable.
+            AssertionError: the input features are not a multiple of 128, the
+                output features are not a multiple of 256, or the inferred
+                group size is neither -1 nor 128.
+        """
+        super().__init__()
+
+        _check_marlin_kernels()
+        assert marlin_kernels is not None
+
+        in_features = weight.B.shape[0] * MARLIN_TILE_SIZE
+        out_features = weight.s.shape[1]
+        assert (
+            in_features % 128 == 0
+        ), f"Number of input features ({in_features}) not divisable by 128"
+        assert (
+            out_features % 256 == 0
+        ), f"Number of output features ({out_features}) not divisable by 256"
+
+        # A single row of scales means one group spanning all input features (-1).
+        groupsize = -1 if weight.s.shape[0] == 1 else in_features // weight.s.shape[0]
+        assert groupsize in {
+            -1,
+            128,
+        }, f"Group size must be -1 or 128, was {groupsize}"
+
+        self.B = weight.B
+        self.s = weight.s
+        # `bias` may be None, in which case forward() skips the addition.
+        self.bias = bias
+
+        # Zero-initialised int buffer handed to the kernel on every call.
+        self.workspace = torch.zeros(
+            out_features // 64 * 16, dtype=torch.int, device=weight.B.device
+        )
+
+    def forward(self, A: torch.Tensor) -> torch.Tensor:
+        """Apply the layer to `A`; leading dimensions of `A` are preserved."""
+        assert marlin_kernels is not None
+
+        # The size arguments must describe the flattened 2D view that is passed
+        # to the kernel, not the original (possibly >2D) input.
+        A_flat = A.view(-1, A.shape[-1])
+        out_features = self.s.shape[1]
+        C = marlin_kernels.marlin_gemm(
+            A_flat,
+            self.B,
+            self.s,
+            self.workspace,
+            A_flat.shape[0],
+            out_features,
+            A_flat.shape[1],
+        )
+        C = C.reshape(A.shape[:-1] + (out_features,))
+
+        if self.bias is not None:
+            C += self.bias
+
+        return C
diff --git a/server/text_generation_server/layers/medusa.py b/server/text_generation_server/layers/medusa.py
new file mode 100644
index 00000000..7579ccdb
--- /dev/null
+++ b/server/text_generation_server/layers/medusa.py
@@ -0,0 +1,189 @@
+import torch
+from torch import nn
+from typing import Tuple, Optional
+from text_generation_server.utils.speculate import get_speculate
+from text_generation_server.layers.linear import FastLinear
+from text_generation_server.layers.tensor_parallel import (
+    TensorParallelHead,
+    TensorParallelColumnLinear,
+)
+
+
+class ResBlock(torch.nn.Module):
+    """Residual block computing `x + SiLU(linear(x))`."""
+
+    def __init__(self, config, prefix, weights):
+        super().__init__()
+        # The linear layer (with bias) is loaded from `<prefix>.linear`.
+        linear_prefix = f"{prefix}.linear"
+        self.linear = FastLinear.load(
+            config, prefix=linear_prefix, weights=weights, bias=True
+        )
+        self.act = torch.nn.SiLU()
+
+    def forward(self, x):
+        """Return the input plus its activated linear projection."""
+        update = self.act(self.linear(x))
+        return x + update
+
+
+class MedusaModel(torch.nn.Module):
+    """Collection of Medusa heads, one per speculated token (`get_speculate()`)."""
+
+    def __init__(self, config, medusa_config, weights):
+        super().__init__()
+        # Head `i` loads its weights under the prefix "i".
+        heads = []
+        for i in range(get_speculate()):
+            heads.append(
+                MedusaHead(config, medusa_config, prefix=f"{i}", weights=weights)
+            )
+        self.heads = torch.nn.ModuleList(heads)
+
+    def forward(self, x):
+        """Run every head on `x` and stack the outputs along a new dimension 1."""
+        per_head = [head(x) for head in self.heads]
+        return torch.stack(per_head, dim=1)
+
+
+class MedusaHead(torch.nn.Module):
+    """A stack of `ResBlock`s followed by a bias-free output projection."""
+
+    def __init__(self, config, medusa_config, prefix, weights):
+        super().__init__()
+        num_layers = medusa_config["medusa_num_layers"]
+        self.blocks = torch.nn.ModuleList(
+            [
+                ResBlock(config, prefix=f"{prefix}.{layer}", weights=weights)
+                for layer in range(num_layers)
+            ]
+        )
+        # The output projection is stored right after the last block, i.e. at
+        # index len(self.blocks) under the same prefix.
+        out_index = len(self.blocks)
+        self.out = FastLinear.load(
+            config, prefix=f"{prefix}.{out_index}", weights=weights, bias=False
+        )
+
+    def forward(self, x):
+        """Apply the residual blocks in order, then the output projection."""
+        hidden = x
+        for block in self.blocks:
+            hidden = block(hidden)
+        return self.out(hidden)
+
+
+class MedusaHeadV1(nn.Module):
+    """LM head paired with a `MedusaModel` that produces speculative logits."""
+
+    def __init__(self, lm_head, medusa):
+        super().__init__()
+        self.lm_head = lm_head
+        self.medusa = medusa
+
+    @staticmethod
+    def load(config, prefix: str, weights):
+        """
+        Build the head from the speculator described by `config.speculator`.
+
+        Reads `config.json` from the speculator path, registers every tensor
+        name found in the speculator files in `weights.routing`, then loads the
+        Medusa heads and the LM head.
+
+        Raises:
+            RuntimeError: a tensor name is already routed to a different file.
+        """
+        from pathlib import Path
+        from safetensors import safe_open
+        import json
+
+        speculator = config.speculator
+
+        path = speculator["path"]
+
+        # Parse the config exactly once, before the loop. Doing it inside the
+        # loop rebinds the path variable to the parsed dict, so a second file
+        # would try to open() a dict, and an empty file list would pass the
+        # path string on as the config.
+        config_path = str(Path(path) / "config.json")
+        with open(config_path, "r") as f:
+            medusa_config = json.load(f)
+
+        routing = weights.routing
+        for fname in speculator["model_paths"]:
+            filename = str(Path(path) / fname)
+
+            with safe_open(filename, framework="pytorch") as f:
+                for k in f.keys():
+                    if k in routing and routing[k] != filename:
+                        raise RuntimeError(
+                            f"Key {k} was found in multiple files: {filename} and {routing[k]}"
+                        )
+                    routing[k] = filename
+
+        medusa = MedusaModel(config, medusa_config, weights)
+        lm_head = TensorParallelHead.load(config, prefix, weights)
+        return MedusaHeadV1(lm_head, medusa)
+
+    def forward(
+        self, input: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Return `(logits, speculative_logits)`.
+
+        `speculative_logits` is `None` when `input` has more than 128 rows.
+        """
+        logits = self.lm_head(input)
+        # If we have too many tokens, we skip speculative logits
+        if input.shape[0] > 128:
+            return logits, None
+
+        speculative_logits = self.medusa(input)
+        return logits, speculative_logits
+
+
+class MedusaHeadV2(nn.Module):
+    """
+    Medusa head that evaluates all heads with one fused, column-parallel linear
+    layer and reuses the LM head for both regular and speculative logits.
+
+    Only single-layer Medusa heads are supported (`medusa_num_layers == 1`).
+    """
+
+    def __init__(self, config, prefix, weights):
+        super().__init__()
+        from pathlib import Path
+        from safetensors import safe_open
+        import json
+
+        speculator_dir = Path(config.speculator["path"])
+        config_path = str(speculator_dir / "config.json")
+        filename = str(speculator_dir / "medusa_lm_head.safetensors")
+
+        with open(config_path, "r") as f:
+            medusa_config = json.load(f)
+
+        # Register every tensor of the Medusa file in the weight routing table,
+        # refusing names that are already served from another file.
+        routing = weights.routing
+        with safe_open(filename, framework="pytorch") as f:
+            for key in f.keys():
+                if key in routing:
+                    owner = routing[key]
+                    if owner != filename:
+                        raise RuntimeError(
+                            f"Key {key} was found in multiple files: {filename} and {owner}"
+                        )
+                routing[key] = filename
+
+        self.n_medusa_heads = get_speculate()
+
+        assert medusa_config["medusa_num_layers"] == 1
+        head_prefixes = [f"{i}.0.linear" for i in range(self.n_medusa_heads)]
+        self.linear = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=head_prefixes,
+            dim=0,
+            weights=weights,
+            bias=True,
+        )
+        self.process_group = weights.process_group
+        self.world_size = self.process_group.size()
+        self.rank = self.process_group.rank()
+
+        self.act = torch.nn.SiLU()
+
+        self.lm_head = TensorParallelHead.load(config, prefix, weights)
+
+    def forward(self, x):
+        """
+        Return `(logits, speculative_logits)`.
+
+        `speculative_logits` is `None` when `x` has more than 128 rows.
+        """
+        # If we have too many tokens, we skip speculative logits
+        if x.shape[0] > 128:
+            return self.lm_head(x), None
+
+        # Slice of the last dimension handled by this rank (ceil division).
+        hidden = x.shape[-1]
+        shard = (hidden + self.world_size - 1) // self.world_size
+        lo, hi = self.rank * shard, (self.rank + 1) * shard
+        x_shard = x[:, lo:hi]
+
+        # Compute all medusa heads at the same time, then reshape and move the n_medusa_heads dim to dim 1
+        heads_out = self.act(self.linear(x)).reshape(
+            *x_shard.shape[:-1], self.n_medusa_heads, x_shard.shape[-1]
+        )
+
+        # Apply all residual medusa heads
+        residual = x_shard.unsqueeze(-2) + heads_out
+
+        # Gather medusa heads
+        gathered = [
+            torch.empty_like(residual) for _ in range(self.process_group.size())
+        ]
+        torch.distributed.all_gather(gathered, residual, group=self.process_group)
+        full_residual = torch.cat(gathered, dim=-1)
+
+        # Stack x and medusa residual x
+        stacked = torch.cat([x.unsqueeze(-2), full_residual], dim=-2)
+
+        # Compute lm head on x + medusa residual x
+        all_logits = self.lm_head(stacked)
+
+        # Finally, split logits from speculative logits
+        logits, speculative_logits = torch.split(
+            all_logits, [1, self.n_medusa_heads], dim=-2
+        )
+
+        # Squeeze added dimension
+        return logits.squeeze(-2), speculative_logits
diff --git a/server/text_generation_server/layers/mlp.py b/server/text_generation_server/layers/mlp.py
new file mode 100644
index 00000000..f08cb673
--- /dev/null
+++ b/server/text_generation_server/layers/mlp.py
@@ -0,0 +1,176 @@
+import torch
+import math
+from torch import nn
+from torch.nn import functional as F
+from typing import Optional, Tuple
+from text_generation_server.layers import TensorParallelEmbedding, FastLinear
+from text_generation_server.layers.tensor_parallel import TensorParallelHead
+from text_generation_server.utils.speculate import get_speculate
+
+
+class MLPSpeculatorLayerNorm(nn.Module):
+    """
+    Root-mean-square style normalization over the last axis, followed by a
+    learned elementwise scale and shift.
+    ...
+    Args
+    ----
+    prefix : str
+        Weight name prefix; `{prefix}.weight` and `{prefix}.bias` are loaded.
+    config
+        Accepted for interface compatibility; not used by this class.
+    weights
+        Weight store exposing `get_tensor(name)`.
+    eps : float
+        Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8).
+    """
+
+    def __init__(
+        self,
+        prefix,
+        config,
+        weights,
+        eps=1e-06,
+    ):
+        super().__init__()
+        self.weight = weights.get_tensor(f"{prefix}.weight")
+        self.bias = weights.get_tensor(f"{prefix}.bias")
+        self.eps = eps
+
+    def forward(self, x):
+        """Return `weight * x / sqrt(mean(x^2) + eps) + bias`."""
+        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+        normed = (x * inv_rms).type_as(x)
+        return self.weight * normed + self.bias
+
+
+class MLPSpeculatorModel(torch.nn.Module):
+    """
+    MLP speculator with `get_speculate()` prediction heads.
+
+    Head `i` mixes the running state with the embedding of the previously
+    selected token, normalizes and activates it, and produces a distribution
+    over the vocabulary. The top-1 token of each head feeds the next head.
+    """
+
+    def __init__(self, config, prefix, weights):
+        super().__init__()
+        self.config = config
+        self.n_predict = get_speculate()
+        self.hidden_size = config.hidden_size
+        self.emb = nn.ModuleList(
+            [
+                TensorParallelEmbedding(f"{prefix}.emb.{i}", weights)
+                for i in range(self.n_predict)
+            ]
+        )
+        # Kept in an nn.ModuleList like `emb`, `head` and `ln`, so that the
+        # projections are registered as submodules instead of sitting in a
+        # plain Python list that nn.Module does not track.
+        self.proj = nn.ModuleList(
+            [
+                FastLinear.load(
+                    config,
+                    prefix=f"{prefix}.proj.{i}",
+                    weights=weights,
+                    bias=False,
+                )
+                for i in range(self.n_predict)
+            ]
+        )
+        self.head = nn.ModuleList(
+            [
+                FastLinear.load(config, f"{prefix}.head.{i}", weights, bias=False)
+                for i in range(self.n_predict)
+            ]
+        )
+        self.ln = nn.ModuleList(
+            [
+                MLPSpeculatorLayerNorm(
+                    prefix=f"{prefix}.ln.{i}",
+                    config=config,
+                    weights=weights,
+                )
+                for i in range(self.n_predict)
+            ]
+        )
+
+        # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation
+        self.state_weight = 0.5 ** (0.5 / self.n_predict)
+        self.emb_weight = math.sqrt(1 - self.state_weight**2)
+        self.activation = nn.GELU()
+        # TODO
+        self.vsize = config.vocab_size
+        self.inner_dim = config.speculator_config["inner_dim"]
+        # Only the single best token of each head is carried to the next head.
+        self.top_k_tokens_per_head = [1] * self.n_predict
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        input_ids: torch.Tensor,
+    ):
+        """
+        Compute the per-head token distributions.
+
+        Returns:
+            A tensor of shape `(b, n_predict, vsize)` where `b` is
+            `hidden_states.size(0)`. Despite the variable name, the values are
+            probabilities (`exp` of the log-softmax), not raw logits.
+        """
+        top_k_tokens_per_head = self.top_k_tokens_per_head
+
+        # k indicates # of candidates
+        # h indicates # of generated tokens
+        state = hidden_states
+        b = state.size(0)
+        ind = input_ids.unsqueeze(0)
+        all_probs = torch.empty(
+            b, self.n_predict, self.vsize, device=state.device
+        )  # b k h v
+        assert (
+            len(top_k_tokens_per_head) == self.n_predict
+        ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(top_k_tokens_per_head)} provided)"
+        for i in range(self.n_predict):
+            # Project and predict
+            z = self.emb[i](ind)
+            z = z.mul(self.emb_weight * math.sqrt(self.inner_dim / 2))  # b k d
+            state = self.proj[i](state) * self.state_weight + z
+            state = self.activation(self.ln[i](state))  # b k d
+            probs = F.log_softmax(self.head[i](state), dim=-1)  # b k v
+            # Only the indices of the top-k tokens are needed below.
+            _, preds = probs.topk(top_k_tokens_per_head[i], dim=-1)  # b k k'
+
+            # Update distribution set with new logits
+            all_probs[:, i] = probs.exp()
+
+            # Update state, log_probs and ind for new predictions
+            state = state.unsqueeze(2).expand(
+                -1, -1, top_k_tokens_per_head[i], -1
+            )  # b k k' d
+            state = state.reshape(-1, b, state.size(3))  # b kk' d
+            ind = preds.view(-1, b)  # b kk'
+
+        speculative_logits = all_probs
+        return speculative_logits
+
+
+class MLPSpeculatorHead(nn.Module):
+    """LM head paired with an `MLPSpeculatorModel` for speculative decoding."""
+
+    def __init__(self, lm_head, mlp_speculator):
+        super().__init__()
+        self.lm_head = lm_head
+        self.mlp_speculator = mlp_speculator
+
+    def forward(
+        self, input: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Return `(logits, speculative_logits)`.
+
+        `speculative_logits` is `None` when `input` has more than 128 rows.
+        Otherwise the speculator is seeded with the argmax token of `logits`.
+        """
+        logits = self.lm_head(input)
+        # If we have too many tokens, we skip speculative logits
+        too_many_tokens = input.shape[0] > 128
+        if too_many_tokens:
+            return logits, None
+
+        greedy_ids = logits.argmax(dim=-1)
+        return logits, self.mlp_speculator(input, greedy_ids)
+
+    @staticmethod
+    def load(config, prefix: str, weights):
+        """
+        Build the head from the speculator files listed in `config.speculator`.
+
+        Every tensor name found in those files is registered in
+        `weights.routing` before the speculator and the LM head are loaded.
+
+        Raises:
+            RuntimeError: a tensor name is already routed to a different file.
+        """
+        from pathlib import Path
+        from safetensors import safe_open
+
+        speculator_path = config.speculator["path"]
+
+        for fname in config.speculator["model_paths"]:
+            filename = str(Path(speculator_path) / fname)
+            routing = weights.routing
+            with safe_open(filename, framework="pytorch") as f:
+                for key in f.keys():
+                    if key in routing:
+                        owner = routing[key]
+                        if owner != filename:
+                            raise RuntimeError(
+                                f"Key {key} was found in multiple files: {filename} and {owner}"
+                            )
+                    routing[key] = filename
+
+        speculator = MLPSpeculatorModel(config, "speculator", weights)
+        head = TensorParallelHead.load(config, prefix, weights)
+        return MLPSpeculatorHead(head, speculator)
diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py
new file mode 100644
index 00000000..87a61e82
--- /dev/null
+++ b/server/text_generation_server/layers/rotary.py
@@ -0,0 +1,423 @@
+import os
+import torch
+from torch import nn
+
+from text_generation_server.utils.import_utils import SYSTEM
+
+if SYSTEM == "cuda":
+    from flash_attn.layers.rotary import RotaryEmbedding
+    import rotary_emb
+elif SYSTEM == "rocm":
+    from vllm._C import ops
+elif SYSTEM == "ipex":
+    import intel_extension_for_pytorch as ipex
+
+
+def _create_inv_freq(dim, base, device):
+    # Inverse frequencies 1 / base^(i / dim) for the even indices i in [0, dim).
+    exponents = torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim
+    frequencies = base**exponents
+    return 1.0 / frequencies
+
+
+def _get_rope_config(config):
+    # The ROPE_SCALING environment variable overrides the model config; once it
+    # is set, ROPE_FACTOR is read unconditionally and converted to float.
+    env_rope_type = os.getenv("ROPE_SCALING", None)
+    if env_rope_type is None:
+        return getattr(config, "rope_scaling", None)
+    factor = float(os.environ["ROPE_FACTOR"])
+    return {"type": env_rope_type, "factor": factor}
+
+
+class PositionRotaryEmbedding(nn.Module):
+    def __init__(self, inv_freq, scaling_factor):
+        super().__init__()
+        self.inv_freq = inv_freq
+        self._seq_len_cached = 0
+        self._cos_cached = None
+        self._sin_cached = None
+        self._cos_k_cached = None
+        self._sin_k_cached = None
+        self.scaling_factor = scaling_factor
+        self.dynamic_args = None
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ):
+        # Such controlflows may add some overhead.
+        if SYSTEM == "cuda":
+            rotary_dim = cos.shape[-1]
+            q1 = query[..., :rotary_dim]
+            q2 = query[..., rotary_dim : 2 * rotary_dim]
+            # q1/q2 are slices of query passed as both inputs and outputs, so query is updated in place.
+            rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)
+
+            k1 = key[..., :rotary_dim]
+            k2 = key[..., rotary_dim : 2 * rotary_dim]
+            # Same in-place update for key.
+            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
+        elif SYSTEM == "rocm":
+            # NOTE: On RoCm systems, we use a ROPE implementation adapted from VLLM which launches a single kernel for both query/key, contrary to flash-attn implementation used on NVIDIA systems.
+            # Compiling flash-attn rotary on RoCm, it appears hipcc is unable to unroll loops, resulting in an even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
+
+            head_size = query.shape[-1]
+
+            # Inplace operation, updating query and key.
+            ops.rotary_embedding(query, key, head_size, cos, sin, True)
+        elif SYSTEM == "ipex":  # NOTE(review): (sin, cos) order differs from the other backends - confirm against the ipex API
+            ipex.llm.functional.rotary_embedding(
+                query, key, sin, cos, query.size(-1), True
+            )
+        else:
+            raise ValueError(
+                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+            )
+
+    @classmethod
+    def static(cls, config, dim, base, device):
+        """Build a rotary embedding for `dim`/`base`, honouring the rope scaling config."""
+        inv_freq = _create_inv_freq(dim, base, device)
+        scaling_factor = None
+        rope_scaling = _get_rope_config(config)
+        if rope_scaling is not None:
+            if rope_scaling["type"] == "linear":
+                # Positions are divided by this factor in `_update_cos_sin_cache`.
+                scaling_factor = rope_scaling["factor"]
+            elif rope_scaling["type"] == "dynamic":
+                scaling_factor = rope_scaling["factor"]
+                return DynamicPositionRotaryEmbedding(
+                    dim=dim,
+                    max_position_embeddings=config.max_position_embeddings,
+                    base=base,
+                    device=inv_freq.device,
+                    scaling_factor=scaling_factor,
+                )
+            elif rope_scaling["type"] == "yarn":
+                scaling_factor = rope_scaling["factor"]
+                return YarnPositionRotaryEmbedding(
+                    dim=2 * inv_freq.shape[0],
+                    max_position_embeddings=rope_scaling[
+                        "original_max_position_embeddings"
+                    ],
+                    base=10000.0,  # NOTE(review): ignores the `base` argument - confirm intended
+                    device=inv_freq.device,
+                    scaling_factor=scaling_factor,
+                    extrapolation_factor=1,
+                    attn_factor=1,
+                    beta_fast=32,
+                    beta_slow=1,
+                )
+            elif rope_scaling["type"] in ["su", "longrope"]:
+                short_factor = torch.tensor(
+                    rope_scaling["short_factor"], dtype=torch.float32, device=device
+                )
+                short_inv_freq = 1.0 / (
+                    short_factor
+                    * base
+                    ** (
+                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
+                        / dim
+                    )
+                )
+                long_factor = torch.tensor(
+                    rope_scaling["long_factor"], dtype=torch.float32, device=device
+                )
+                long_inv_freq = 1.0 / (
+                    long_factor
+                    * base
+                    ** (
+                        torch.arange(0, dim, 2, device=device, dtype=torch.float32)
+                        / dim
+                    )
+                )
+                original_max_position_embeddings = (
+                    config.original_max_position_embeddings
+                )
+                max_position_embeddings = config.max_position_embeddings
+                if max_position_embeddings <= original_max_position_embeddings:
+                    scaling_factor = 1.0
+                else:
+                    scale = max_position_embeddings / original_max_position_embeddings
+                    scaling_factor = math.sqrt(
+                        1 + math.log(scale) / math.log(original_max_position_embeddings)
+                    )
+                return SuRotaryEmbedding(
+                    short_inv_freq=short_inv_freq,
+                    long_inv_freq=long_inv_freq,
+                    scaling_factor=scaling_factor,
+                    original_max_position_embeddings=original_max_position_embeddings,
+                )
+            else:
+                raise NotImplementedError(
+                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
+                )
+        return cls(inv_freq, scaling_factor)
+
+    @classmethod
+    def load(cls, config, prefix, weights):
+        # XXX: Always load this in float32 !
+        dtype = weights.dtype
+        weights.dtype = torch.float32
+        inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
+        weights.dtype = dtype
+
+        scaling_factor = None
+        rope_scaling = _get_rope_config(config)
+        if rope_scaling is not None:
+            scaling_factor = rope_scaling["factor"]
+            if rope_scaling["type"] == "linear":
+                pass
+            elif rope_scaling["type"] == "dynamic":
+                return DynamicPositionRotaryEmbedding(
+                    dim=2 * inv_freq.shape[0],
+                    max_position_embeddings=config.max_position_embeddings,
+                    base=10000.0,
+                    device=inv_freq.device,
+                    scaling_factor=scaling_factor,
+                )
+            elif rope_scaling["type"] == "yarn":
+                return YarnPositionRotaryEmbedding(
+                    dim=2 * inv_freq.shape[0],
+                    max_position_embeddings=rope_scaling[
+                        "original_max_position_embeddings"
+                    ],
+                    base=10000.0,
+                    device=inv_freq.device,
+                    scaling_factor=scaling_factor,
+                    extrapolation_factor=1,
+                    attn_factor=1,
+                    beta_fast=32,
+                    beta_slow=1,
+                )
+            else:
+                raise NotImplementedError(
+                    f"rope scaling type {rope_scaling['type']} is not implemented or invalid"
+                )
+        return cls(inv_freq, scaling_factor)
+
+    def _update_cos_sin_cache(self, dtype, device, seqlen):
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+        ):
+            self._seq_len_cached = seqlen
+            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+            if self.scaling_factor is not None:
+                t /= self.scaling_factor
+            # Don't do einsum, it converts fp32 to fp16
+            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
+            self._cos_cached = torch.cos(freqs).to(dtype)
+            self._sin_cached = torch.sin(freqs).to(dtype)
+
+    def get_cos_sin(self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype):
+        """
+        Return cos and sin for the asked position ids
+        """
+        if SYSTEM == "rocm":
+            # For RoCm, we always use float cos/sin to avoid a cast.
+            # For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
+            # But later on goes and cast cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
+            dtype = torch.float32
+
+        self._update_cos_sin_cache(dtype, position_ids.device, max_s)
+
+        cos = torch.index_select(self._cos_cached, 0, position_ids)
+        sin = torch.index_select(self._sin_cached, 0, position_ids)
+
+        # Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
+        return cos.unsqueeze(1), sin.unsqueeze(1)
+
+
+class SuRotaryEmbedding(PositionRotaryEmbedding):
+    def __init__(
+        self,
+        short_inv_freq,
+        long_inv_freq,
+        scaling_factor,
+        original_max_position_embeddings,
+    ):
+        super(PositionRotaryEmbedding, self).__init__()
+        self.short_inv_freq = short_inv_freq
+        self.long_inv_freq = long_inv_freq
+        self.scaling_factor = scaling_factor
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self._seq_len_cached = 0
+        self._cos_cached = None
+        self._sin_cached = None
+        self._cos_k_cached = None
+        self._sin_k_cached = None
+        self.dynamic_args = None
+
+    def _update_cos_sin_cache(self, dtype, device, seqlen):
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+        ):
+            self._seq_len_cached = seqlen
+
+            t = torch.arange(seqlen, device=device, dtype=self.short_inv_freq.dtype)
+            short_freqs = torch.outer(
+                t[: self.original_max_position_embeddings],
+                self.short_inv_freq.to(device=t.device),
+            )
+            long_freqs = torch.outer(
+                t[self.original_max_position_embeddings :],
+                self.long_inv_freq.to(device=t.device),
+            )
+
+            freqs = torch.cat([short_freqs, long_freqs])
+
+            self._cos_cached = (torch.cos(freqs) * self.scaling_factor).to(dtype)
+            self._sin_cached = (torch.sin(freqs) * self.scaling_factor).to(dtype)
+
+
+class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
+    def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
+        inv_freq = _create_inv_freq(dim, base, device)
+        super().__init__(inv_freq, scaling_factor)
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+
+    def _update_cos_sin_cache(self, dtype, device, seqlen):
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+        ):
+            if seqlen > self.max_position_embeddings:
+                newbase = self.base * (
+                    (self.scaling_factor * seqlen / self.max_position_embeddings)
+                    - (self.scaling_factor - 1)
+                ) ** (self.dim / (self.dim - 2))
+                self.inv_freq = _create_inv_freq(
+                    self.dim, newbase, self.inv_freq.device
+                )
+            self._seq_len_cached = seqlen
+            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+            # Don't do einsum, it converts fp32 to fp16
+            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
+            self._cos_cached = torch.cos(freqs).to(dtype)
+            self._sin_cached = torch.sin(freqs).to(dtype)
+
+
+# Inverse dim formula to find dim based on number of rotations
+import math
+
+
+def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048):
+    # Inverse of the rotation-count formula: returns a (fractional) dimension index.
+    ratio = max_position_embeddings / (num_rotations * 2 * math.pi)
+    return (dim * math.log(ratio)) / (2 * math.log(base))
+
+
+# Find dim range bounds based on rotations
+def find_correction_range(
+    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
+):
+    low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
+    high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
+    return max(low, 0), min(high, dim - 1)  # Clamp values just in case
+
+
+def linear_ramp_mask(min, max, dim):
+    # Ramp over `dim` float32 entries: 0 at index <= min, 1 at index >= max.
+    if min == max:
+        max += 0.001  # Prevent singularity
+    positions = torch.arange(dim, dtype=torch.float32)
+    ramp = (positions - min) / (max - min)
+    return ramp.clamp(0, 1)
+
+
+def get_mscale(scale=1):
+    # Magnitude scale: 1.0 while `scale` is at most 1, otherwise it grows with
+    # the natural logarithm of `scale`.
+    return 1.0 if scale <= 1 else 0.1 * math.log(scale) + 1.0
+
+
+class YarnPositionRotaryEmbedding(PositionRotaryEmbedding):
+    def __init__(
+        self,
+        dim,
+        max_position_embeddings,
+        base,
+        device,
+        scaling_factor,
+        *,
+        extrapolation_factor,
+        attn_factor,
+        beta_fast,
+        beta_slow,
+    ):
+        inv_freq = _create_inv_freq(dim, base, device)
+        super().__init__(inv_freq, scaling_factor)
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.mscale = float(
+            get_mscale(self.scaling_factor) * self.attn_factor
+        )  # Get n-d magnitude scaling corrected for interpolation
+
+    def _update_cos_sin_cache(self, dtype, device, seqlen):
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        if (
+            seqlen > self._seq_len_cached
+            or self._cos_cached.device != device
+            or self._cos_cached.dtype != dtype
+        ):
+            if seqlen > self.max_position_embeddings:
+                inv_freq_extrapolation = _create_inv_freq(
+                    self.dim, self.base, self.inv_freq.device
+                )
+                freqs = 1.0 / inv_freq_extrapolation
+                inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs)
+                low, high = find_correction_range(
+                    self.beta_fast,
+                    self.beta_slow,
+                    self.dim,
+                    self.base,
+                    self.max_position_embeddings,
+                )
+                inv_freq_mask = (
+                    1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device)
+                ) * self.extrapolation_factor  # Get n-d rotational scaling corrected for extrapolation
+                inv_freq = (
+                    inv_freq_interpolation * (1 - inv_freq_mask)
+                    + inv_freq_extrapolation * inv_freq_mask
+                )
+
+                self.inv_freq = inv_freq
+                self.mscale = float(
+                    get_mscale(self.scaling_factor) * self.attn_factor
+                )  # Get n-d magnitude scaling corrected for interpolation
+
+            self._seq_len_cached = seqlen
+            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+            # Don't do einsum, it converts fp32 to fp16
+            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+
+            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
+            self._cos_cached = (torch.cos(freqs) * self.mscale).to(dtype)
+            self._sin_cached = (torch.sin(freqs) * self.mscale).to(dtype)
diff --git a/server/text_generation_server/layers/speculative.py b/server/text_generation_server/layers/speculative.py
new file mode 100644
index 00000000..4b977a56
--- /dev/null
+++ b/server/text_generation_server/layers/speculative.py
@@ -0,0 +1,52 @@
+import torch
+import json
+from typing import Tuple, Optional
+from text_generation_server.layers.tensor_parallel import TensorParallelHead
+from text_generation_server.layers.medusa import MedusaHeadV1, MedusaHeadV2
+from text_generation_server.layers.mlp import MLPSpeculatorHead
+
+
+class SpeculativeHead(torch.nn.Module):
+    def __init__(self, lm_head, speculator):
+        super().__init__()
+        self.head = lm_head
+        self.speculator = speculator
+
+    @staticmethod
+    def load(config, prefix: str, weights):
+        """Load the speculator described by `config.speculator`, or else a plain LM head."""
+        speculator = None
+        if config.speculator:
+            speculator_path = config.speculator["path"]
+            speculator_config = str(speculator_path / "config.json")
+            with open(speculator_config, "r") as f:
+                speculator_config = json.load(f)
+            config.speculator_config = speculator_config
+            try:
+                architecture = speculator_config["architectures"][0]
+                if architecture == "MLPSpeculatorPreTrainedModel":
+                    speculator = MLPSpeculatorHead.load(config, prefix, weights)
+                # Any other architecture is unsupported: `speculator` stays None.
+            except KeyError:
+                # No "architectures" entry in the config: try the Medusa loaders.
+                try:
+                    speculator = MedusaHeadV1.load(config, prefix, weights)
+                except Exception:
+                    speculator = MedusaHeadV2(config, prefix, weights)
+
+        # `forward` delegates entirely to the speculator when one is loaded, so the
+        # plain LM head is only needed (and must exist) when there is none.
+        lm_head = None
+        if speculator is None:
+            lm_head = TensorParallelHead.load(config, prefix, weights)
+        return SpeculativeHead(lm_head, speculator)
+
+    def forward(
+        self, input: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        if self.speculator is not None:
+            return self.speculator(input)
+
+        assert self.head is not None
+        logits = self.head(input)
+        return logits, None
diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py
new file mode 100644
index 00000000..038de258
--- /dev/null
+++ b/server/text_generation_server/layers/tensor_parallel.py
@@ -0,0 +1,264 @@
+import torch
+from torch.nn import functional as F
+from typing import Iterable, List
+from text_generation_server.layers.linear import get_linear, FastLinear
+from text_generation_server.layers.exl2 import Exl2Weight
+from text_generation_server.utils.import_utils import SYSTEM
+
+if SYSTEM == "ipex":
+    import intel_extension_for_pytorch as ipex
+
+
+class LayerConcat(torch.nn.Module):
+    """
+    Apply multiple layers to the input and concatenate their outputs.
+    """
+
+    def __init__(self, layers: Iterable[torch.nn.Module], dim: int = -1):
+        """
+        `layers` is copied into a list so one-shot iterables survive repeated calls;
+        `dim` is the dimension along which layer outputs are concatenated.
+        """
+        super().__init__()
+        self.layers = list(layers)
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor):
+        outputs = [layer(x) for layer in self.layers]
+        return torch.cat(outputs, self.dim)
+
+
+class SuperLayer(torch.nn.Module):
+    def __init__(self, linear):
+        super().__init__()
+        self.linear = linear
+
+    def forward(self, x):
+        return self.linear.forward(x)
+
+
+class TensorParallelHead(SuperLayer):
+    def __init__(self, linear, process_group, should_gather: bool):
+        super().__init__(linear)
+        self.process_group = process_group
+        self.should_gather = should_gather
+
+    @staticmethod
+    def load(config, prefix: str, weights):
+        """Load the LM head under `prefix` and decide whether its output must be gathered."""
+        if config.quantize == "exl2":
+            try:
+                # If the piece and LM head embeddings are shared, we have
+                # non-quantized weights...
+                weight = weights.get_tensor(f"{prefix}.weight")
+            except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit through
+                # ...otherwise they are quantized.
+                weight = weights.get_weights_col(prefix, config.quantize)
+            should_gather = weights.process_group.size() > 1
+        elif weights.process_group.size() > 1:
+            try:
+                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
+                should_gather = True
+            except AssertionError:
+                # If the vocab size is not divisible by number of shards
+                # just load the entire thing.
+                weight = weights.get_tensor(f"{prefix}.weight")
+                should_gather = False
+        else:
+            weight = weights.get_tensor(f"{prefix}.weight")
+            should_gather = False
+
+        # GPTQ, AWQ, EETQ and Marlin don't quantize heads (nor embeddings)
+        if config.quantize in ["gptq", "awq", "eetq", "marlin"]:
+            quantize = None
+        # See above, exl2 LM head can be quantized or not.
+        elif config.quantize == "exl2" and not isinstance(weight, Exl2Weight):
+            quantize = None
+        else:
+            quantize = config.quantize
+        return TensorParallelHead(
+            get_linear(weight, bias=None, quantize=quantize),
+            process_group=weights.process_group,
+            should_gather=should_gather,
+        )
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if not self.should_gather:
+            return super().forward(input)
+
+        world_size = self.process_group.size()
+        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
+            out_dim = self.linear.weight.shape[0]
+
+            if input.shape[0] == 1:
+                world_out = input.new_empty(1, out_dim * world_size)
+                local_out = input.new_empty(1, out_dim)
+                gather_input = local_out
+            else:
+                world_out = input.new_empty(out_dim * world_size, input.shape[0])
+                gather_input = input.new_empty(out_dim, input.shape[0])
+                local_out = gather_input.T
+
+            torch.mm(input, self.linear.weight.T, out=local_out)
+            if SYSTEM == "ipex":
+                ipex.distributed.all_gather_into_tensor(
+                    world_out, gather_input, group=self.process_group
+                )
+            else:
+                torch.distributed.all_gather_into_tensor(
+                    world_out, gather_input, group=self.process_group
+                )
+
+            if input.shape[0] == 1:
+                return world_out
+            return world_out.T
+
+        output = super().forward(input)
+        world_output = [
+            torch.empty_like(output) for _ in range(self.process_group.size())
+        ]
+        if SYSTEM == "ipex":
+            ipex.distributed.all_gather(world_output, output, group=self.process_group)
+        else:
+            torch.distributed.all_gather(world_output, output, group=self.process_group)
+        world_output = torch.cat(world_output, dim=-1)
+        return world_output
+
+
+class TensorParallelColumnLinear(SuperLayer):
+    @classmethod
+    def load_gate_up(cls, config, prefix: str, weights, bias: bool):
+        """Specific method when the gate/up weights were packed together after the fact"""
+        weight = weights.get_weights_col_packed_gate_up(
+            prefix, quantize=config.quantize
+        )
+        if bias:
+            raise NotImplementedError("packed_gate_up only implemented without bias")
+        else:
+            bias = None
+        linear = get_linear(weight, bias, config.quantize)
+        return cls(linear)
+
+    @classmethod
+    def load_qkv(
+        cls,
+        config,
+        prefix: str,
+        weights,
+        bias: bool,
+        num_heads: int,
+        num_key_value_heads: int,
+    ):
+        """Specific method when the QKV was joined after the fact"""
+        weight = weights.get_weights_col_packed_qkv(
+            prefix,
+            quantize=config.quantize,
+            num_heads=num_heads,
+            num_key_value_heads=num_key_value_heads,
+        )
+        if bias:
+            raise NotImplementedError("packed_qkv only implemented for baichuan")
+        else:
+            bias = None
+        linear = get_linear(weight, bias, config.quantize)
+        return cls(linear)
+
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        weight = weights.get_weights_col(prefix, config.quantize)
+        if bias:
+            bias = weights.get_sharded(f"{prefix}.bias", dim=0)
+        else:
+            bias = None
+        linear = get_linear(weight, bias, config.quantize)
+        return cls(linear)
+
+    @classmethod
+    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
+        if config.quantize == "exl2":
+            linears = []
+            for prefix in prefixes:
+                weight = weights.get_weights_col(prefix, config.quantize)
+                b = weights.get_tensor(f"{prefix}.bias") if bias else None
+                linears.append(get_linear(weight, b, config.quantize))
+            linear = LayerConcat(linears)
+        else:
+            weight = weights.get_multi_weights_col(
+                prefixes, quantize=config.quantize, dim=dim
+            )
+            if bias:
+                b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
+                bias = torch.cat(b, dim=dim)
+            else:
+                bias = None
+            linear = get_linear(weight, bias, config.quantize)
+        return cls(linear)
+
+
+class TensorParallelRowLinear(SuperLayer):
+    def __init__(self, linear, process_group):
+        super().__init__(linear)
+        self.process_group = process_group
+
+    @classmethod
+    def load(cls, config, prefix: str, weights, bias: bool):
+        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
+        # Only rank 0 loads the bias, every other rank gets None; presumably so it
+        # is counted once when `forward` all-reduces the outputs - TODO confirm.
+        if bias and weights.process_group.rank() == 0:
+            bias = weights.get_tensor(f"{prefix}.bias")
+        else:
+            bias = None
+        return cls(
+            get_linear(weight, bias, config.quantize),
+            process_group=weights.process_group,
+        )
+
+    def forward(self, input: torch.Tensor, reduce: bool = True) -> torch.Tensor:
+        out = super().forward(input)
+        if self.process_group.size() > 1 and reduce:
+            if SYSTEM == "ipex":
+                ipex.distributed.all_reduce(out, group=self.process_group)
+            else:
+                torch.distributed.all_reduce(out, group=self.process_group)
+        return out
+
+
+class TensorParallelEmbedding(torch.nn.Module):
+    def __init__(self, prefix: str, weights, reduce=True):
+        super().__init__()
+        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
+        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
+
+        process_group = weights.process_group
+
+        world_size = process_group.size()
+        rank = process_group.rank()
+
+        block_size = (num_embeddings + world_size - 1) // world_size
+        self.min_id = rank * block_size
+        self.max_id = min(num_embeddings, (rank + 1) * block_size)
+        self.null_idx = weight.shape[
+            0
+        ]  # Usually block_size, might be less in non even vocab_size.
+        self.process_group = weights.process_group
+        self.reduce = reduce
+
+        """Additional 0 entry used for masking"""
+        self.weight = torch.nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
+        # translate for [0, self.max_id - self.min_id[
+        input = torch.where(
+            (self.min_id > input) | (input >= self.max_id),
+            self.null_idx,
+            input - self.min_id,
+        )
+        out = torch.nn.functional.embedding(input, self.weight)
+        if self.reduce and self.process_group.size() > 1:
+            if SYSTEM == "ipex":
+                ipex.distributed.all_reduce(out, group=self.process_group)
+            else:
+                torch.distributed.all_reduce(out, group=self.process_group)
+        return out
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index 932ab32e..58131a3a 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -1,23 +1,39 @@
-import os
 import torch
+import enum
+import os
 
 from loguru import logger
 from transformers.configuration_utils import PretrainedConfig
 from transformers.models.auto import modeling_auto
-from typing import Optional
+from huggingface_hub import hf_hub_download, HfApi
+from typing import Optional, List
+from pathlib import Path
 
+from text_generation_server.utils.speculate import get_speculate, set_speculate
 from text_generation_server.models.model import Model
-from text_generation_server.models.causal_lm import CausalLM
-from text_generation_server.models.flash_causal_lm import FlashCausalLM
-from text_generation_server.models.bloom import BLOOMSharded
-from text_generation_server.models.mpt import MPTSharded
+from text_generation_server.models.causal_lm import CausalLM, CausalLMBatchKeysLast
+from text_generation_server.models.custom_modeling.opt_modeling import OPTForCausalLM
+from text_generation_server.models.custom_modeling.mpt_modeling import (
+    MPTForCausalLM,
+)
+from text_generation_server.models.bloom import BloomCausalLMBatch
+from text_generation_server.models.custom_modeling.bloom_modeling import (
+    BloomForCausalLM,
+)
 from text_generation_server.models.seq2seq_lm import Seq2SeqLM
-from text_generation_server.models.rw import RW
-from text_generation_server.models.opt import OPTSharded
-from text_generation_server.models.galactica import GalacticaSharded
-from text_generation_server.models.santacoder import SantaCoder
-from text_generation_server.models.t5 import T5Sharded
-from text_generation_server.models.gpt_neox import GPTNeoxSharded
+from text_generation_server.models.galactica import GalacticaCausalLMBatch
+from text_generation_server.models.custom_modeling.neox_modeling import (
+    GPTNeoxForCausalLM,
+)
+from text_generation_server.models.custom_modeling.phi_modeling import (
+    PhiConfig,
+    PhiForCausalLM,
+)
+from text_generation_server.models.custom_modeling.t5_modeling import (
+    T5ForConditionalGeneration,
+)
+
+from text_generation_server.utils.import_utils import SYSTEM
 
 # The flag below controls whether to allow TF32 on matmul. This flag defaults to False
 # in PyTorch 1.12 and later.
@@ -33,51 +49,263 @@ __all__ = [
     "Model",
     "BLOOMSharded",
     "CausalLM",
-    "FlashCausalLM",
     "GalacticaSharded",
     "Seq2SeqLM",
-    "SantaCoder",
-    "OPTSharded",
-    "T5Sharded",
     "get_model",
 ]
 
 FLASH_ATT_ERROR_MESSAGE = "{} requires Flash Attention enabled models."
 
 FLASH_ATTENTION = True
+
 try:
-    from text_generation_server.models.flash_rw import FlashRWSharded
-    from text_generation_server.models.flash_neox import FlashNeoXSharded
-    from text_generation_server.models.flash_llama import (
-        FlashLlama,
+    from text_generation_server.models.flash_causal_lm import FlashCausalLM
+    from text_generation_server.models.vlm_causal_lm import VlmCausalLM
+    from text_generation_server.models.custom_modeling.flash_llama_modeling import (
+        FlashLlamaForCausalLM,
     )
-    from text_generation_server.models.flash_santacoder import (
-        FlashSantacoderSharded,
+    from text_generation_server.models.custom_modeling.flash_cohere_modeling import (
+        FlashCohereForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
+        FlashGemmaForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_gemma2_modeling import (
+        FlashGemma2ForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_dbrx_modeling import (
+        FlashDbrxForCausalLM,
+        DbrxConfig,
+    )
+    from text_generation_server.models.custom_modeling.flash_rw_modeling import (
+        RWConfig,
+        FlashRWForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_neox_modeling import (
+        FlashGPTNeoXForCausalLM,
+    )
+    from text_generation_server.models.pali_gemma import (
+        PaliGemmaBatch,
+    )
+    from text_generation_server.models.custom_modeling.flash_pali_gemma_modeling import (
+        PaliGemmaForConditionalGeneration,
+    )
+    from text_generation_server.models.custom_modeling.flash_phi_modeling import (
+        FlashPhiForCausalLM,
     )
     from text_generation_server.models.idefics import IDEFICSSharded
+    from text_generation_server.models.custom_modeling.llava_next import (
+        LlavaNextForConditionalGeneration,
+    )
 
+    from text_generation_server.models.custom_modeling.flash_santacoder_modeling import (
+        FlashSantacoderForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_starcoder2_modeling import (
+        FlashStarcoder2ForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
+        Qwen2ForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
+        FlashMistralForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_mixtral_modeling import (
+        FlashMixtralForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_gpt2_modeling import (
+        FlashGPT2ForCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.idefics2 import (
+        Idefics2ForConditionalGeneration,
+    )
+    from text_generation_server.layers.attention import SUPPORTS_WINDOWING
 except ImportError as e:
     logger.warning(f"Could not import Flash Attention enabled models: {e}")
+    SUPPORTS_WINDOWING = False
     FLASH_ATTENTION = False
 
 if FLASH_ATTENTION:
-    __all__.append(FlashNeoXSharded)
-    __all__.append(FlashRWSharded)
-    __all__.append(FlashSantacoderSharded)
-    __all__.append(FlashLlama)
-    __all__.append(IDEFICSSharded)
+    __all__.append("FlashCausalLM")  # `__all__` entries must be names (str), not class objects
+    __all__.append("IDEFICSSharded")  # a non-str entry makes `from ... import *` raise TypeError
 
+MAMBA_AVAILABLE = True  # flipped to False below when the optional Mamba import fails
+try:
+    from text_generation_server.models.mamba import Mamba
+except ImportError as e:
+    logger.warning(f"Could not import Mamba: {e}")
+    MAMBA_AVAILABLE = False
+
+if MAMBA_AVAILABLE:
+    __all__.append("Mamba")  # export by name: `__all__` entries must be str, not the class object
+
+
+class ModelType(enum.Enum):  # one member per supported architecture; each value is a metadata dict
+    IDEFICS2 = {  # keys: "type" (compared with config `model_type` in get_model), "name", "url", optional "multimodal"
+        "type": "idefics2",
+        "name": "Idefics 2",
+        "url": "https://huggingface.co/HuggingFaceM4/idefics2-8b",
+        "multimodal": True,
+    }
+    LLAVA_NEXT = {
+        "type": "llava_next",
+        "name": "Llava Next (1.6)",
+        "url": "https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf",
+        "multimodal": True,
+    }
+    LLAMA = {
+        "type": "llama",
+        "name": "Llama",
+        "url": "https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct",
+    }
+    PHI3 = {
+        "type": "phi3",
+        "name": "Phi 3",
+        "url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct",
+    }
+    GEMMA = {
+        "type": "gemma",
+        "name": "Gemma",
+        "url": "https://huggingface.co/google/gemma-7b",
+    }
+    PALIGEMMA = {
+        "type": "paligemma",
+        "name": "PaliGemma",
+        "url": "https://huggingface.co/google/paligemma-3b-pt-224",
+    }
+    GEMMA2 = {
+        "type": "gemma2",
+        "name": "Gemma2",
+        "url": "https://huggingface.co/google/gemma-2-9b",  # Hub repo id is `gemma-2-9b` (hyphenated)
+    }
+    COHERE = {
+        "type": "cohere",
+        "name": "Cohere",
+        "url": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
+    }
+    DBRX = {
+        "type": "dbrx",
+        "name": "Dbrx",
+        "url": "https://huggingface.co/databricks/dbrx-instruct",
+    }
+    MAMBA = {
+        "type": "ssm",  # get_model() assigns "ssm" when the config has no model_type but has "ssm_cfg"
+        "name": "Mamba",
+        "url": "https://huggingface.co/state-spaces/mamba-2.8b-slimpj",
+    }
+    MISTRAL = {
+        "type": "mistral",
+        "name": "Mistral",
+        "url": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2",
+    }
+    MIXTRAL = {
+        "type": "mixtral",
+        "name": "Mixtral",
+        "url": "https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1",
+    }
+    GPT_BIGCODE = {
+        "type": "gpt_bigcode",
+        "name": "Gpt Bigcode",
+        "url": "https://huggingface.co/bigcode/gpt_bigcode-santacoder",
+    }
+    PHI = {
+        "type": "phi",
+        "name": "Phi",
+        "url": "https://huggingface.co/microsoft/phi-1_5",
+    }
+    BAICHUAN = {
+        "type": "baichuan",
+        "name": "Baichuan",
+        "url": "https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat",
+    }
+    FALCON = {
+        "type": "falcon",
+        "name": "Falcon",
+        "url": "https://huggingface.co/tiiuae/falcon-7b-instruct",
+    }
+    STARCODER2 = {
+        "type": "starcoder2",
+        "name": "StarCoder 2",
+        "url": "https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1",
+    }
+    QWEN2 = {
+        "type": "qwen2",
+        "name": "Qwen 2",
+        "url": "https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f",
+    }
+    OPT = {
+        "type": "opt",
+        "name": "Opt",
+        "url": "https://huggingface.co/facebook/opt-6.7b",
+    }
+    T5 = {
+        "type": "t5",
+        "name": "T5",
+        "url": "https://huggingface.co/google/flan-t5-xxl",
+    }
+    GALACTICA = {
+        "type": "galactica",
+        "name": "Galactica",
+        "url": "https://huggingface.co/facebook/galactica-120b",
+    }
+    SANTACODER = {
+        "type": "santacoder",
+        "name": "SantaCoder",
+        "url": "https://huggingface.co/bigcode/santacoder",
+    }
+    BLOOM = {
+        "type": "bloom",
+        "name": "Bloom",
+        "url": "https://huggingface.co/bigscience/bloom-560m",
+    }
+    MPT = {
+        "type": "mpt",
+        "name": "Mpt",
+        "url": "https://huggingface.co/mosaicml/mpt-7b-instruct",
+    }
+    GPT2 = {
+        "type": "gpt2",
+        "name": "Gpt2",
+        "url": "https://huggingface.co/openai-community/gpt2",
+    }
+    GPT_NEOX = {
+        "type": "gpt_neox",
+        "name": "Gpt Neox",
+        "url": "https://huggingface.co/EleutherAI/gpt-neox-20b",
+    }
+    IDEFICS = {
+        "type": "idefics",
+        "name": "Idefics",
+        "url": "https://huggingface.co/HuggingFaceM4/idefics-9b",
+        "multimodal": True,
+    }
+
+
+__GLOBALS = locals()  # at module scope `locals()` is the module namespace, so writes below define module-level names
+for data in ModelType:
+    __GLOBALS[data.name] = data.value["type"]  # e.g. LLAMA = "llama"; get_model() compares `model_type` against these
+
 
 def get_model(
     model_id: str,
+    lora_adapter_ids: Optional[List[str]],
     revision: Optional[str],
     sharded: bool,
     quantize: Optional[str],
+    speculate: Optional[int],
     dtype: Optional[str],
     trust_remote_code: bool,
+    max_input_tokens: int,
 ) -> Model:
+    global FLASH_ATTENTION
     if dtype is None:
-        dtype = torch.float16
+        if quantize in ["awq", "exl2", "gptq", "marlin"]:
+            # These quantizers only work with float16 params.
+            dtype = torch.float16
+        else:
+            # Keep it as default for now and let
+            # every model resolve their own default dtype.
+            dtype = None
     elif dtype == "float16":
         dtype = torch.float16
     elif dtype == "bfloat16":
@@ -85,206 +313,719 @@ def get_model(
     else:
         raise RuntimeError(f"Unknown dtype {dtype}")
 
-    if "facebook/galactica" in model_id:
-        return GalacticaSharded(
-            model_id,
-            revision,
-            quantize=quantize,
-            dtype=dtype,
-            trust_remote_code=trust_remote_code,
-        )
-
-    if model_id.startswith("bigcode/"):
-        if FLASH_ATTENTION:
-            return FlashSantacoderSharded(
-                model_id,
-                revision,
-                quantize=quantize,
-                dtype=dtype,
-                trust_remote_code=trust_remote_code,
-            )
-        elif sharded:
-            raise NotImplementedError(
-                FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder")
-            )
-        else:
-            return SantaCoder(
-                model_id,
-                revision,
-                quantize=quantize,
-                dtype=dtype,
-                trust_remote_code=trust_remote_code,
-            )
+    if speculate is not None:
+        set_speculate(speculate)
+    else:
+        set_speculate(0)
 
     config_dict, _ = PretrainedConfig.get_config_dict(
         model_id, revision=revision, trust_remote_code=trust_remote_code
     )
-    model_type = config_dict["model_type"]
+    model_type = config_dict.get("model_type", None)
 
-    if model_type == "gpt_bigcode":
+    speculator = None
+    if "medusa_num_heads" in config_dict:
+        medusa_model_id = model_id
+        medusa_revision = revision
+        model_id = config_dict["base_model_name_or_path"]
+        revision = "main"
+        speculate_medusa = config_dict["medusa_num_heads"]
+        if speculate is not None:
+            if speculate > speculate_medusa:
+                raise RuntimeError(
+                    f"Speculate is set to `{speculate}` but this medusa models only has `{speculate_medusa}` heads, please make them match"
+                )
+            else:
+                set_speculate(speculate)
+        else:
+            set_speculate(speculate_medusa)
+
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model_id, revision=revision, trust_remote_code=trust_remote_code
+        )
+        # Reload model type from parent.
+        model_type = config_dict.get("model_type", None)
+        is_local = Path(medusa_model_id).exists()
+        if not is_local:
+            medusa_config = hf_hub_download(
+                medusa_model_id, revision=medusa_revision, filename="config.json"
+            )
+            hf_hub_download(
+                medusa_model_id,
+                revision=medusa_revision,
+                filename="medusa_lm_head.safetensors",
+            )
+            speculator = {
+                "path": Path(medusa_config).parent,
+                "model_paths": ["medusa_lm_head.safetensors"],
+            }
+        else:
+            speculator = {
+                "path": Path(medusa_model_id),
+                "model_paths": ["medusa_lm_head.safetensors"],
+            }
+
+        method = "medusa"
+    elif model_type == "mlp_speculator":
+        mlp_model_id = model_id
+        mlp_revision = revision
+        model_id = config_dict["base_model_name_or_path"]
+        revision = "main"
+        speculate_mlp = config_dict["n_predict"]
+        if speculate is not None:
+            if speculate > speculate_mlp:
+                raise RuntimeError(
+                    f"Speculate is set to `{speculate}` but this mlp_speculator models only has `{speculate_mlp}` heads, please make them match"
+                )
+            else:
+                set_speculate(speculate)
+        else:
+            set_speculate(speculate_mlp)
+
+        config_dict, _ = PretrainedConfig.get_config_dict(
+            model_id, revision=revision, trust_remote_code=trust_remote_code
+        )
+        # Reload model type from parent.
+        model_type = config_dict.get("model_type", None)
+        is_local = Path(mlp_model_id).exists()
+        extension = ".safetensors"
+        if not is_local:
+            mlp_speculator_config = hf_hub_download(
+                mlp_model_id, revision=mlp_revision, filename="config.json"
+            )
+            api = HfApi()
+            info = api.model_info(mlp_model_id, revision=mlp_revision)
+            filenames = [
+                s.rfilename
+                for s in info.siblings
+                if s.rfilename.endswith(extension)
+                and len(s.rfilename.split("/")) == 1
+                and "arguments" not in s.rfilename
+                and "args" not in s.rfilename
+                and "training" not in s.rfilename
+            ]
+            for filename in filenames:
+                hf_hub_download(
+                    mlp_model_id,
+                    revision=mlp_revision,
+                    filename=filename,
+                )
+            speculator = {
+                "path": Path(mlp_speculator_config).parent,
+                "model_paths": filenames,
+            }
+        else:
+            speculator = Path(mlp_model_id)
+            filenames = [p for p in os.listdir(speculator) if p.endswith(extension)]
+            speculator = {"path": speculator, "model_paths": filenames}
+        method = "mlp_speculator"
+    else:
+        method = "n-gram"
+
+    speculate = get_speculate()
+    if speculate > 0:
+        logger.info(f"Using speculation {method} with {speculate} input ids.")
+
+    if model_type is None:
+        # TODO: fix how we determine model type for Mamba
+        if "ssm_cfg" in config_dict:
+            # *only happens in Mamba case
+            model_type = "ssm"
+        else:
+            raise RuntimeError(
+                f"Could not determine model type for {model_id} revision {revision}"
+            )
+    quantization_config = config_dict.get("quantization_config", None)
+    if quantization_config is not None and quantize is None:
+        method = quantization_config.get("quant_method", None)
+        if method in {"gptq", "awq", "exl2"}:
+            logger.info(f"Auto selecting quantization method {method}")
+            quantize = method
+        else:
+            logger.info(f"Unknown quantization method {method}")
+
+    if quantize == "exl2" and sharded:
+        raise RuntimeError(
+            "Sharding is currently not supported with `exl2` quantization"
+        )
+    sliding_window = config_dict.get("sliding_window", -1)
+
+    if (
+        (sliding_window is not None and sliding_window != -1)
+        and not SUPPORTS_WINDOWING
+        and max_input_tokens > sliding_window
+    ):
+        raise ValueError(
+            f"The backend {SYSTEM} does not support sliding window attention that is used by the model type {model_type}. To use this model nonetheless with the {SYSTEM} backend, please launch TGI with the argument `--max-input-tokens` smaller than sliding_window={sliding_window} (got here max_input_tokens={max_input_tokens})."
+        )
+
+    if model_type == MAMBA:
+        return Mamba(
+            model_id,
+            revision,
+            quantize=quantize,
+            speculator=speculator,
+            dtype=dtype,
+            trust_remote_code=trust_remote_code,
+        )
+
+    if model_id.startswith("facebook/galactica"):
+        return CausalLM(
+            model_id=model_id,
+            # Yes galactica is just an OPT model.
+            model_class=OPTForCausalLM,
+            revision=revision,
+            quantize=quantize,
+            speculator=speculator,
+            dtype=dtype,
+            trust_remote_code=trust_remote_code,
+            batch_class=GalacticaCausalLMBatch,
+        )
+
+    if (
+        model_type == GPT_BIGCODE
+        or model_type == GPT2
+        and model_id.startswith("bigcode/")
+    ):
         if FLASH_ATTENTION:
-            return FlashSantacoderSharded(
-                model_id,
-                revision,
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashSantacoderForCausalLM,
+                revision=revision,
                 quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+                aliases={"transformer.wte.weight": ["lm_head.weight"]},
+                num_kv_heads=1,
             )
         elif sharded:
             raise NotImplementedError(
                 FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder")
             )
         else:
-            return SantaCoder(
-                model_id,
-                revision,
+            return CausalLM.fallback(
+                model_id=model_id,
+                revision=revision,
                 quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
 
-    if model_type == "bloom":
-        return BLOOMSharded(
-            model_id,
-            revision,
+    if model_type == BLOOM:
+        return CausalLM(
+            model_id=model_id,
+            model_class=BloomForCausalLM,
+            revision=revision,
             quantize=quantize,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
+            batch_class=BloomCausalLMBatch,
         )
-    elif model_type == "mpt":
-        return MPTSharded(
-            model_id, revision, quantize=quantize, trust_remote_code=trust_remote_code
+    elif model_type == MPT:
+        return CausalLM(
+            model_id=model_id,
+            model_class=MPTForCausalLM,
+            revision=revision,
+            quantize=quantize,
+            speculator=speculator,
+            dtype=dtype,
+            trust_remote_code=trust_remote_code,
+            batch_class=CausalLMBatchKeysLast,
         )
-
-    elif model_type == "gpt_neox":
+    elif model_type == GPT2:
         if FLASH_ATTENTION:
-            return FlashNeoXSharded(
+            try:
+                return FlashCausalLM(
+                    model_id=model_id,
+                    model_class=FlashGPT2ForCausalLM,
+                    revision=revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                    lora_adapter_ids=lora_adapter_ids,
+                )
+            except RuntimeError as e:
+                # Lots of legacy models with various weight names.
+                logger.warning(f"Couldn't load flash gpt2 variant: {e}")
+                return CausalLM.fallback(
+                    model_id,
+                    revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-2"))
+        else:
+            return CausalLM.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
-        elif sharded:
-            return GPTNeoxSharded(
-                model_id,
-                revision,
+    elif model_type == GPT_NEOX:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashGPTNeoXForCausalLM,
+                revision=revision,
                 quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif sharded:
+            return CausalLM(
+                model_id=model_id,
+                model_class=GPTNeoxForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
         else:
-            return CausalLM(
+            return CausalLM.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
 
-    elif model_type == "llama":
+    elif model_type == PHI:
         if FLASH_ATTENTION:
-            return FlashLlama(
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashPhiForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        else:
+            return CausalLM.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
+
+    elif model_type == "phi-msft":
+        if FLASH_ATTENTION:
+            raise NotImplementedError(
+                "Legacy phi-msft is not supported with Flash Attention"
+            )
+        else:
+            return CausalLM(
+                model_id=model_id,
+                model_class=PhiForCausalLM,
+                config_class=PhiConfig,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+
+    elif model_type == LLAMA or model_type == BAICHUAN or model_type == PHI3:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashLlamaForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Llama"))
         else:
-            return CausalLM(
+            return CausalLM.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+    if model_type == GEMMA:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashGemmaForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                # Works better for these models
+                default_dtype=torch.bfloat16,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+    elif model_type == GEMMA2:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashGemma2ForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                # Works better for these models
+                default_dtype=torch.bfloat16,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
 
-    if model_type in ["RefinedWeb", "RefinedWebModel", "falcon"]:
+    if model_type == COHERE:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashCohereForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Cohere"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+
+    if model_type == DBRX:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashDbrxForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                # Dbrx works better in bfloat16.
+                default_dtype=torch.bfloat16,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+                config_class=DbrxConfig,
+            )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded DBRX"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+
+    if model_type in ["RefinedWeb", "RefinedWebModel", FALCON]:
         if sharded:
             if FLASH_ATTENTION:
                 if config_dict.get("alibi", False):
                     raise NotImplementedError("sharded is not supported for this model")
-                return FlashRWSharded(
-                    model_id,
-                    revision,
+                return FlashCausalLM(
+                    model_id=model_id,
+                    model_class=FlashRWForCausalLM,
+                    revision=revision,
                     quantize=quantize,
+                    speculator=speculator,
                     dtype=dtype,
+                    aliases={
+                        "lm_head.weight": ["transformer.word_embeddings.weight"],
+                        "transformer.word_embeddings.weight": ["lm_head.weight"],
+                    },
                     trust_remote_code=trust_remote_code,
+                    lora_adapter_ids=lora_adapter_ids,
+                    config_class=RWConfig,
                 )
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format(f"Sharded Falcon"))
         else:
             if FLASH_ATTENTION and not config_dict.get("alibi", False):
-                return FlashRWSharded(
-                    model_id,
-                    revision,
+                return FlashCausalLM(
+                    model_id=model_id,
+                    model_class=FlashRWForCausalLM,
+                    revision=revision,
                     quantize=quantize,
+                    speculator=speculator,
                     dtype=dtype,
                     trust_remote_code=trust_remote_code,
+                    lora_adapter_ids=lora_adapter_ids,
+                    config_class=RWConfig,
                 )
             else:
-                return RW(
+                return CausalLM.fallback(
                     model_id,
                     revision,
                     quantize=quantize,
+                    speculator=speculator,
                     dtype=dtype,
                     trust_remote_code=trust_remote_code,
                 )
 
-    elif model_type == "opt":
-        return OPTSharded(
-            model_id,
-            revision,
+    if model_type == MISTRAL:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashMistralForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mistral"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+
+    if model_type == MIXTRAL:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashMixtralForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mixtral"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+
+    if model_type == STARCODER2:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=FlashStarcoder2ForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif sharded:
+            raise NotImplementedError(
+                FLASH_ATT_ERROR_MESSAGE.format("Sharded Starcoder2")
+            )
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+
+    if model_type == QWEN2:
+        if FLASH_ATTENTION:
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=Qwen2ForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif sharded:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Qwen2"))
+        else:
+            return CausalLM.fallback(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+
+    if model_type == OPT:
+        return CausalLM(
+            model_id=model_id,
+            model_class=OPTForCausalLM,
+            revision=revision,
             quantize=quantize,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
 
-    elif model_type == "t5":
-        return T5Sharded(
-            model_id,
-            revision,
+    if model_type == T5:
+        return Seq2SeqLM(
+            model_id=model_id,
+            model_class=T5ForConditionalGeneration,
+            revision=revision,
             quantize=quantize,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
+            aliases={
+                "shared.weight": [
+                    "encoder.embed_tokens.weight",
+                    "decoder.embed_tokens.weight",
+                ]
+            },
         )
-    elif model_type == "idefics":
+    if model_type == IDEFICS:
         if FLASH_ATTENTION:
-           return IDEFICSSharded(
-               model_id,
-               revision,
-               quantize=quantize,
-               dtype=dtype,
-               trust_remote_code=trust_remote_code,
-           )
+            return IDEFICSSharded(
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+        else:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
+    if model_type == IDEFICS2:
+        if FLASH_ATTENTION:
+            return VlmCausalLM(
+                model_id=model_id,
+                model_class=Idefics2ForConditionalGeneration,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+                # XXX: Extremely important to cap resolution in order to limit
+                # VRAM usage.
+                processor_kwargs={"size": {"longest_edge": 448, "shortest_edge": 378}},
+            )
+        else:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics2"))
+    if model_type == PALIGEMMA:
+        if FLASH_ATTENTION:
+            return VlmCausalLM(
+                model_id=model_id,
+                model_class=PaliGemmaForConditionalGeneration,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                # Works better for these models
+                default_dtype=torch.bfloat16,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+                batch_class=PaliGemmaBatch,
+            )
         else:
-            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Idefics"))
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("PaliGemma"))
 
+    if model_type == LLAVA_NEXT:
+        if FLASH_ATTENTION:
+            return VlmCausalLM(
+                model_class=LlavaNextForConditionalGeneration,
+                model_id=model_id,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+        else:
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("LlavaNext"))
+
     if sharded:
-        raise ValueError("sharded is not supported for AutoModel")
+        raise NotImplementedError("sharded is not supported for AutoModel")
     if quantize == "gptq":
-        raise ValueError(
+        raise NotImplementedError(
             "gptq quantization is not supported for AutoModel, you can try to quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
         )
+    if quantize == "awq":
+        raise NotImplementedError("awq quantization is not supported for AutoModel")
     elif (quantize == "bitsandbytes-fp4") or (quantize == "bitsandbytes-nf4"):
-        raise ValueError(
-            "4bit quantization is not supported for AutoModel"
-        )
+        raise NotImplementedError("4bit quantization is not supported for AutoModel")
+    elif quantize == "eetq":
+        raise NotImplementedError("Eetq quantization is not supported for AutoModel")
+    elif quantize == "exl2":
+        raise NotImplementedError("exl2 quantization is not supported for AutoModel")
     if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
-        return CausalLM(
+        return CausalLM.fallback(
             model_id,
             revision,
             quantize=quantize,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
     if model_type in modeling_auto.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
-        return Seq2SeqLM(
+        return Seq2SeqLM.fallback(
             model_id,
             revision,
             quantize=quantize,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
@@ -292,18 +1033,20 @@ def get_model(
     auto_map = config_dict.get("auto_map", None)
     if trust_remote_code and auto_map is not None:
         if "AutoModelForCausalLM" in auto_map.keys():
-            return CausalLM(
+            return CausalLM.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
         if "AutoModelForSeq2SeqLM" in auto_map.keys():
-            return Seq2SeqLM(
+            return Seq2SeqLM.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
+                speculator=speculator,
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py
index 79fb60c6..732b4c53 100644
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -4,22 +4,12 @@ import torch.distributed
 from typing import Optional, Type
 
 from transformers import (
-    AutoTokenizer,
-    AutoConfig,
     PreTrainedTokenizerBase,
 )
 
-from text_generation_server.models.custom_modeling.bloom_modeling import (
-    BloomForCausalLM,
-)
 from text_generation_server.models import CausalLM
 from text_generation_server.models.causal_lm import CausalLMBatch
 from text_generation_server.pb import generate_pb2
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
 
 
 class BloomCausalLMBatch(CausalLMBatch):
@@ -37,61 +27,6 @@ class BloomCausalLMBatch(CausalLMBatch):
 
 
 class BLOOMSharded(CausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            device = torch.device("cpu")
-            dtype = torch.float32
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-
-        config = AutoConfig.from_pretrained(
-            model_id,
-            revision=revision,
-            slow_but_exact=False,
-            tp_parallel=True,
-            trust_remote_code=trust_remote_code,
-        )
-        config.pad_token_id = 3
-        config.quantize = quantize
-
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames, device=device, dtype=dtype, process_group=self.process_group
-        )
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        model = BloomForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(CausalLM, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=True,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
-
     @property
     def batch_type(self) -> Type[CausalLMBatch]:
         return BloomCausalLMBatch
@@ -99,7 +34,7 @@ class BLOOMSharded(CausalLM):
     def forward(
         self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
     ):
-        outputs = self.model.forward(
+        outputs, speculative_logits = self.model.forward(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
@@ -108,4 +43,4 @@ class BLOOMSharded(CausalLM):
         )
 
         logits = outputs.logits
-        return logits, outputs.past_key_values
+        return logits, speculative_logits, outputs.past_key_values
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 4e338263..868a3cc0 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -1,19 +1,31 @@
-from text_generation_server.utils.tokens import batch_top_tokens
 import torch
-import inspect
+import time
+import torch.distributed
 
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    PreTrainedTokenizerBase,
+)
 from typing import Optional, Tuple, List, Type, Dict
 
+from text_generation_server.utils import (
+    initialize_torch_distributed,
+    weight_files,
+    Weights,
+)
 from text_generation_server.models import Model
+from text_generation_server.utils.chunks import concat_text_chunks
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.models.types import (
     Batch,
-    PrefillTokens,
+    Tokens,
     Generation,
     GeneratedText,
-    TopTokens,
 )
 from text_generation_server.pb import generate_pb2
 from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling
@@ -87,8 +99,11 @@ class CausalLMBatch(Batch):
         max_decode_tokens = 0
         for i, r in enumerate(pb.requests):
             requests_idx_mapping[r.id] = i
-            inputs.append(r.inputs)
-            next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
+            inputs.append(concat_text_chunks(r.input_chunks.chunks))
+
+            next_token_choosers.append(
+                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
+            )
             stopping_criteria = StoppingCriteria.from_pb(
                 r.stopping_parameters, tokenizer
             )
@@ -414,14 +429,14 @@ class CausalLMBatch(Batch):
                 # We slice the keys to remove the padding from previous batches
                 past_seq_len = batch.max_input_length - 1
                 if batch.keys_head_dim_last:
-                    padded_past_keys[
-                        start_index:end_index, :, -past_seq_len:, :
-                    ] = past_keys[:, :, -past_seq_len:, :]
+                    padded_past_keys[start_index:end_index, :, -past_seq_len:, :] = (
+                        past_keys[:, :, -past_seq_len:, :]
+                    )
                 else:
                     # BLOOM case
-                    padded_past_keys[
-                        start_index:end_index, :, :, -past_seq_len:
-                    ] = past_keys[:, :, :, -past_seq_len:]
+                    padded_past_keys[start_index:end_index, :, :, -past_seq_len:] = (
+                        past_keys[:, :, :, -past_seq_len:]
+                    )
                 del past_keys
 
                 start_index = end_index
@@ -439,9 +454,9 @@ class CausalLMBatch(Batch):
                 end_index = start_index + len(batch)
                 # We slice the past values to remove the padding from previous batches
                 past_seq_len = batch.max_input_length - 1
-                padded_past_values[
-                    start_index:end_index, :, -past_seq_len:, :
-                ] = past_values[:, :, -past_seq_len:, :]
+                padded_past_values[start_index:end_index, :, -past_seq_len:, :] = (
+                    past_values[:, :, -past_seq_len:, :]
+                )
                 del past_values
 
                 # Update values
@@ -475,15 +490,97 @@ class CausalLMBatch(Batch):
         return len(self.requests)
 
 
+@dataclass
+class CausalLMBatchKeysLast(CausalLMBatch):  # concrete batch; only the key layout flag differs
+    keys_head_dim_last: bool = False  # False -> past keys are sliced on the last axis
+
 class CausalLM(Model):
     def __init__(
         self,
         model_id: str,
+        model_class,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        speculator: Optional[str] = None,
+        dtype: Optional[torch.dtype] = None,
+        default_dtype=torch.float16,
+        trust_remote_code: bool = False,
+        tokenizer_class=AutoTokenizer,
+        config_class=AutoConfig,
+        batch_class=CausalLMBatch,
+    ):
+        self.batch_class = batch_class  # handed back by the batch_type property
+        self.process_group, rank, world_size = initialize_torch_distributed()
+        if torch.cuda.is_available():
+            device = torch.device(f"cuda:{rank}")
+            dtype = default_dtype if dtype is None else dtype  # an explicit dtype always wins
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+                dtype = default_dtype if dtype is None else dtype
+            else:
+                device = torch.device("cpu")
+                # Float16 doesn't exist on target.
+                dtype = torch.bfloat16 if dtype is None else dtype
+        else:
+            device = torch.device("cpu")
+            dtype = torch.float32 if dtype is None else dtype
+
+        tokenizer = tokenizer_class.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+
+        config = config_class.from_pretrained(
+            model_id,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+        )
+        config.quantize = quantize  # stashed on the config that is passed to model_class
+        config.speculator = speculator
+        if tokenizer.pad_token_id is None:  # fall back to the pad token id from the config
+            tokenizer.pad_token_id = config.pad_token_id
+
+        torch.distributed.barrier(group=self.process_group)  # wait for every rank before resolving weight files
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+        weights = Weights(
+            filenames, device=device, dtype=dtype, process_group=self.process_group
+        )
+        if config.quantize in ["awq", "exl2", "gptq", "marlin"]:  # all four go through the GPTQ-params helper
+            weights._set_gptq_params(model_id, revision)
+
+        prefix = ""  # an empty prefix is passed as model_class's first argument
+        model = model_class(prefix, config, weights)
+
+        torch.distributed.barrier(group=self.process_group)  # wait for every rank to finish building the model
+        super().__init__(
+            model_id=model_id,
+            model=model,
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            rank=rank,
+            world_size=world_size,
+        )
+
+    @classmethod
+    def fallback(
+        cls,
+        model_id: str,
+        revision: Optional[str] = None,
+        quantize: Optional[str] = None,
+        speculator: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
+        if speculator:
+            raise RuntimeError("Speculator decoding is not enabled for AutoModel")
+
         if torch.cuda.is_available():
             device = torch.device("cuda")
             dtype = torch.float16 if dtype is None else dtype
@@ -492,7 +589,7 @@ class CausalLM(Model):
                 raise ValueError("quantization is not available on CPU")
 
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
@@ -505,13 +602,19 @@ class CausalLM(Model):
             model_id,
             revision=revision,
             torch_dtype=dtype,
-            device_map="auto"
-            if torch.cuda.is_available() and torch.cuda.device_count() > 1
-            else None,
+            device_map=(
+                "auto"
+                if torch.cuda.is_available() and torch.cuda.device_count() > 1
+                else None
+            ),
             load_in_8bit=quantize == "bitsandbytes",
             trust_remote_code=trust_remote_code,
         )
-        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
+        if (
+            torch.cuda.is_available()
+            and torch.cuda.device_count() == 1
+            and quantize != "bitsandbytes"
+        ):
             model = model.cuda()
 
         if tokenizer.pad_token_id is None:
@@ -524,26 +627,30 @@ class CausalLM(Model):
             else:
                 tokenizer.add_special_tokens({"pad_token": "[PAD]"})
 
-        super(CausalLM, self).__init__(
+        self = cls.__new__(
+            cls,
+        )
+        self.batch_class = CausalLMBatch
+        super().__init__(
+            self,
+            model_id=model_id,
             model=model,
             tokenizer=tokenizer,
             requires_padding=True,
             dtype=dtype,
             device=device,
         )
+        return self
 
     @property
     def batch_type(self) -> Type[CausalLMBatch]:
-        return CausalLMBatch
-
-    def decode(self, generated_ids: List[int]) -> str:
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
+        return self.batch_class
 
     def forward(
         self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
-    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+    ) -> Tuple[
+        torch.Tensor, Optional[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]
+    ]:
         # Model Forward
         kwargs = {
             "input_ids": input_ids,
@@ -556,16 +663,21 @@ class CausalLM(Model):
             kwargs["position_ids"] = position_ids
 
         outputs = self.model.forward(**kwargs)
-        return outputs.logits, outputs.past_key_values
+        if isinstance(outputs, tuple):
+            outputs, speculative_logits = outputs
+        else:
+            speculative_logits = None
+        return outputs.logits, speculative_logits, outputs.past_key_values
 
     @tracer.start_as_current_span("generate_token")
     def generate_token(
         self, batch: CausalLMBatch
-    ) -> Tuple[List[Generation], Optional[CausalLMBatch]]:
+    ) -> Tuple[List[Generation], Optional[CausalLMBatch], Tuple[int, int]]:
+        start = time.time_ns()
         # slice the attention mask to the correct shape
         attention_mask = batch.attention_mask[:, : -batch.padding_right_offset]
 
-        logits, past = self.forward(
+        logits, speculative_logits, past = self.forward(
             batch.input_ids,
             attention_mask,
             batch.position_ids,
@@ -576,12 +688,17 @@ class CausalLM(Model):
         generations: List[Generation] = []
         stopped = True
 
+        # Speculation is not active for causal
+        accepted_ids = torch.ones_like(batch.input_ids)[:, 0]
         batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
             batch.top_n_tokens,
             batch.top_n_tokens_tensor,
-            torch.softmax(logits[:, -1], -1),
+            torch.log_softmax(logits[:, -1], -1),
+            accepted_ids,
         )
 
+        start_decode = time.time_ns()
+
         # Zipped iterator
         iterator = zip(
             batch.requests,
@@ -641,8 +758,14 @@ class CausalLM(Model):
             if i % self.world_size == self.rank:
                 if stop:
                     # Decode generated tokens
-                    output_text = self.decode(
-                        all_input_ids[-stopping_criteria.current_tokens :, 0]
+                    output_text, _, _ = self.decode_token(
+                        all_input_ids[:, 0],
+                        prefix_offset=len(all_input_ids)
+                        - stopping_criteria.current_tokens
+                        - 1,
+                        read_offset=len(all_input_ids)
+                        - stopping_criteria.current_tokens,
+                        skip_special_tokens=True,
                     )
                     # Get seed
                     if isinstance(next_token_chooser.choice, Sampling):
@@ -670,37 +793,49 @@ class CausalLM(Model):
                         clean_up_tokenization_spaces=False,
                         skip_special_tokens=False,
                     )
-                    prefill_tokens = PrefillTokens(
-                        prefill_token_ids, prefill_logprobs, prefill_texts
+                    prefill_tokens = Tokens(
+                        prefill_token_ids,
+                        prefill_logprobs,
+                        prefill_texts,
+                        is_special=[],
                     )
                 else:
                     prefill_tokens = None
 
                 if top_n_tokens > 0:
-                    toptoken_texts = self.tokenizer.batch_decode(
-                        top_token_ids,
-                        clean_up_tokenization_spaces=False,
-                        skip_special_tokens=False,
-                    )
-                    special_toptokens = [
-                        token_id in self.all_special_ids for token_id in top_token_ids
-                    ]
-                    top_tokens = TopTokens(
-                        top_token_ids,
-                        top_token_logprobs,
-                        toptoken_texts,
-                        special_toptokens,
-                    )
+                    all_top_tokens = []
+                    for top_token_ids, top_token_logprobs in zip(
+                        top_token_ids, top_token_logprobs
+                    ):
+                        toptoken_texts = self.tokenizer.batch_decode(
+                            top_token_ids,
+                            clean_up_tokenization_spaces=False,
+                            skip_special_tokens=False,
+                        )
+                        special_toptokens = [
+                            token_id in self.all_special_ids
+                            for token_id in top_token_ids
+                        ]
+                        top_tokens = Tokens(
+                            top_token_ids,
+                            top_token_logprobs,
+                            toptoken_texts,
+                            special_toptokens,
+                        )
+                        all_top_tokens.append(top_tokens)
+                    top_tokens = all_top_tokens
                 else:
                     top_tokens = None
 
                 generation = Generation(
                     request.id,
                     prefill_tokens,
-                    next_token_id_squeezed,
-                    next_token_logprob,
-                    next_token_text,
-                    next_token_id_squeezed.item() in self.all_special_ids,
+                    Tokens(
+                        [next_token_id_squeezed],
+                        [next_token_logprob],
+                        [next_token_text],
+                        [next_token_id_squeezed.item() in self.all_special_ids],
+                    ),
                     generated_text,
                     top_tokens,
                 )
@@ -708,6 +843,9 @@ class CausalLM(Model):
                 generations.append(generation)
 
             # Update values
+            batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar(
+                next_token_id_squeezed.item()
+            )
             batch.input_ids[i, 0] = next_token_id
             batch.all_input_ids[i] = all_input_ids
             batch.input_lengths[i] = new_input_length
@@ -717,7 +855,9 @@ class CausalLM(Model):
 
         # We finished all generations in the batch; there is no next batch
         if stopped:
-            return generations, None
+            forward_ns = start_decode - start
+            decode_ns = time.time_ns() - start_decode
+            return generations, None, (forward_ns, decode_ns)
 
         # Slice unused values from prefill
         batch.input_ids = batch.input_ids[:, :1]
@@ -733,4 +873,6 @@ class CausalLM(Model):
         # Update past key values
         batch.past_key_values = past
 
-        return generations, batch
+        forward_ns = start_decode - start
+        decode_ns = time.time_ns() - start_decode
+        return generations, batch, (forward_ns, decode_ns)
diff --git a/server/text_generation_server/models/custom_modeling/bloom_modeling.py b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
index 047a1872..77b89c5b 100644
--- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
@@ -32,15 +32,18 @@ from transformers.modeling_outputs import (
 )
 from transformers import BloomConfig, PreTrainedModel
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
 )
 
 CUSTOM_KERNELS_ENABLED = False
-if not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
+if (
+    torch.cuda.is_available()
+    and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
+):
     try:
         from custom_kernels import fused_bloom_attention_cuda
 
@@ -813,11 +816,11 @@ class BloomModel(BloomPreTrainedModel):
 
 
 class BloomForCausalLM(BloomPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__(config)
         self.transformer = BloomModel(config, weights)
 
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
             config,
             prefix="word_embeddings",
             weights=weights,
@@ -867,7 +870,7 @@ class BloomForCausalLM(BloomPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         **deprecated_arguments,
-    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
@@ -901,17 +904,20 @@ class BloomForCausalLM(BloomPreTrainedModel):
         )
         hidden_states = transformer_outputs[0]
 
-        lm_logits = self.lm_head(hidden_states)
+        logits, speculative_logits = self.lm_head(hidden_states)
         loss = None
 
         if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
+            output = (logits,) + transformer_outputs[1:]
             return ((loss,) + output) if loss is not None else output
 
-        return CausalLMOutputWithCrossAttentions(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
+        return (
+            CausalLMOutputWithCrossAttentions(
+                loss=loss,
+                logits=logits,
+                past_key_values=transformer_outputs.past_key_values,
+                hidden_states=transformer_outputs.hidden_states,
+                attentions=transformer_outputs.attentions,
+            ),
+            speculative_logits,
         )
diff --git a/server/text_generation_server/models/custom_modeling/clip.py b/server/text_generation_server/models/custom_modeling/clip.py
new file mode 100644
index 00000000..27b9ff1c
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/clip.py
@@ -0,0 +1,827 @@
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from transformers.activations import ACT2FN
+from transformers.modeling_attn_mask_utils import (
+    _create_4d_causal_attention_mask,
+    _prepare_4d_attention_mask,
+)
+from transformers.modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    ImageClassifierOutput,
+)
+from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
+
+from text_generation_server.layers import (
+    TensorParallelEmbedding,
+    TensorParallelColumnLinear,
+    TensorParallelRowLinear,
+)
+
+
+class CLIPVisionEmbeddings(nn.Module):
+    """Class-token, patch and position embeddings for the CLIP vision tower."""
+
+    def __init__(self, prefix, config: CLIPVisionConfig, weights):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        # TODO Should we TP this ?
+        self.class_embedding = weights.get_tensor(f"{prefix}.class_embedding")
+
+        # Bias-free convolution whose kernel and stride both equal the patch size;
+        # its weight is replaced by the checkpoint tensor and excluded from autograd.
+        patch_conv = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+        patch_weight = weights.get_tensor(f"{prefix}.patch_embedding.weight")
+        patch_conv.weight = nn.Parameter(patch_weight, requires_grad=False)
+        self.patch_embedding = patch_conv
+
+        patches_per_side = self.image_size // self.patch_size
+        self.num_patches = patches_per_side**2
+        self.num_positions = self.num_patches + 1  # +1 for the class embedding
+        self.position_embedding = TensorParallelEmbedding(
+            prefix=f"{prefix}.position_embedding", weights=weights
+        )
+        ids = torch.arange(self.num_positions, device=weights.device).expand((1, -1))
+        self.register_buffer("position_ids", ids, persistent=False)
+
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        """Return class + patch embeddings of `pixel_values` with positions added."""
+        conv_dtype = self.patch_embedding.weight.dtype
+        # shape = [*, width, grid, grid] after the conv, then flattened and transposed
+        patches = self.patch_embedding(pixel_values.to(dtype=conv_dtype))
+        patches = patches.flatten(2).transpose(1, 2)
+
+        cls_tokens = self.class_embedding.expand(pixel_values.shape[0], 1, -1)
+        tokens = torch.cat([cls_tokens, patches], dim=1)
+        return tokens + self.position_embedding(self.position_ids)
+
+
+class CLIPTextEmbeddings(nn.Module):
+    """Token plus learned absolute position embeddings for the CLIP text tower."""
+
+    def __init__(self, config: CLIPTextConfig):
+        super().__init__()
+        hidden = config.hidden_size
+        max_positions = config.max_position_embeddings
+
+        self.token_embedding = nn.Embedding(config.vocab_size, hidden)
+        self.position_embedding = nn.Embedding(max_positions, hidden)
+
+        # Non-persistent (1, max_position_embeddings) buffer of consecutive indices.
+        all_positions = torch.arange(max_positions).expand((1, -1))
+        self.register_buffer("position_ids", all_positions, persistent=False)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        """
+        Sum token embeddings (looked up from `input_ids` unless `inputs_embeds` is
+        given) with position embeddings (the first `seq_length` default positions
+        unless `position_ids` is given).
+        """
+        if input_ids is not None:
+            seq_length = input_ids.shape[-1]
+        else:
+            seq_length = inputs_embeds.shape[-2]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+        return inputs_embeds + self.position_embedding(position_ids)
+
+
+class CLIPAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_size = self.embed_dim // self.num_heads
+        if self.head_size * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        # From here on `num_heads` and `embed_dim` are divided by the process-group size.
+        self.num_heads = self.num_heads // weights.process_group.size()
+        self.embed_dim = self.embed_dim // weights.process_group.size()
+        self.scale = self.head_size**-0.5
+        self.dropout = config.attention_dropout
+
+        # q_proj, k_proj and v_proj are loaded into a single fused projection.
+        self.qkv = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            dim=0,
+            weights=weights,
+            bias=True,
+        )
+        self.out_proj = TensorParallelRowLinear.load(
+            config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
+        )
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """View `tensor` as (bsz, seq_len, heads, head_size) and move heads to dim 1."""
+        return (
+            tensor.view(bsz, seq_len, self.num_heads, self.head_size)
+            .transpose(1, 2)
+            .contiguous()
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, None]:
+        """
+        Input shape: Batch x Time x Channel
+
+        Args:
+            hidden_states: input of shape `(bsz, tgt_len, channels)`.
+            attention_mask: optional additive mask of shape `(bsz, 1, tgt_len, src_len)`.
+            causal_attention_mask: optional additive mask of the same shape; it is
+                applied before `attention_mask`.
+
+        Returns:
+            `(attn_output, None)`; the attention weights are never returned.
+
+        Raises:
+            ValueError: if a mask or an intermediate tensor has an unexpected shape.
+        """
+        bsz, tgt_len, _ = hidden_states.size()
+        qkv = self.qkv(hidden_states)
+        query_states, key_states, value_states = qkv.split(
+            [self.head_size * self.num_heads] * 3, dim=2
+        )
+        query_states = query_states * self.scale
+        key_states = self._shape(key_states, -1, bsz)
+        value_states = self._shape(value_states, -1, bsz)
+
+        # Fold the head dimension into the batch dimension for `torch.bmm`.
+        proj_shape = (bsz * self.num_heads, -1, self.head_size)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        # apply the causal_attention_mask first
+        if causal_attention_mask is not None:
+            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {causal_attention_mask.size()}"
+                )
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + causal_attention_mask
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + attention_mask
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_probs = nn.functional.dropout(
+            attn_weights, p=self.dropout, training=self.training
+        )
+        attn_output = torch.bmm(attn_probs, value_states)
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_size)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size)
+        attn_output = attn_output.transpose(1, 2).reshape(bsz, tgt_len, self.embed_dim)
+        return self.out_proj(attn_output), None
+
+
+class CLIPMLP(nn.Module):
+    """Feed-forward block: `fc1` -> `config.hidden_act` activation -> `fc2`."""
+
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        shared = dict(config=config, weights=weights, bias=True)
+        # fc1 is loaded column-parallel and fc2 row-parallel, both with a bias.
+        self.fc1 = TensorParallelColumnLinear.load(prefix=f"{prefix}.fc1", **shared)
+        self.fc2 = TensorParallelRowLinear.load(prefix=f"{prefix}.fc2", **shared)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply `fc1`, the activation and `fc2` in sequence to `hidden_states`."""
+        projected = self.fc1(hidden_states)
+        activated = self.activation_fn(projected)
+        return self.fc2(activated)
+
+
+class CLIPEncoderLayer(nn.Module):
+    """Pre-norm transformer layer: self-attention then MLP, each with a residual."""
+
+    def __init__(self, prefix, config: CLIPConfig, weights):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        eps = config.layer_norm_eps
+        self.self_attn = CLIPAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+        self.layer_norm1 = nn.LayerNorm.load(
+            prefix=f"{prefix}.layer_norm1", weights=weights, eps=eps
+        )
+        self.mlp = CLIPMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+        self.layer_norm2 = nn.LayerNorm.load(
+            prefix=f"{prefix}.layer_norm2", weights=weights, eps=eps
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): mask of size `(batch, 1, tgt_len, src_len)`,
+                forwarded unchanged to the self-attention module.
+            causal_attention_mask (`torch.FloatTensor`): mask of the same size, forwarded
+                unchanged to the self-attention module.
+
+        Returns:
+            The transformed hidden states; the attention module's second output is dropped.
+        """
+        # Attention sub-block.
+        attn_out, _ = self.self_attn(
+            hidden_states=self.layer_norm1(hidden_states),
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+        )
+        hidden_states = hidden_states + attn_out
+        # Feed-forward sub-block.
+        return hidden_states + self.mlp(self.layer_norm2(hidden_states))
+
+
+class CLIPPreTrainedModel(nn.Module):
+    """
+    Common base class of `CLIPTextModel` and `CLIPVisionModel`. It is a plain `nn.Module` that only carries the
+    class-level metadata below; it defines no `__init__`, `post_init` or loading logic of its own.
+    """
+
+    config_class = CLIPConfig
+    base_model_prefix = "clip"
+    supports_gradient_checkpointing = True
+
+
+CLIP_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CLIP_TEXT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+"""
+
+CLIP_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+"""
+
+CLIP_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+"""
+
+
+class CLIPEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`CLIPEncoderLayer`].
+
+    Layers are applied one after another, each receiving the previous layer's output.
+
+    Args:
+        prefix: weight-name prefix; layer `i` is built with prefix `{prefix}.layers.{i}`.
+        config: CLIPConfig
+        weights: weight source handed to every layer.
+    """
+
+    def __init__(self, prefix, config: CLIPConfig, weights):
+        super().__init__()
+        self.config = config
+        # Build the layers in index order so the module list matches the weight names.
+        layers = []
+        for layer_id in range(config.num_hidden_layers):
+            layer_prefix = f"{prefix}.layers.{layer_id}"
+            layers.append(
+                CLIPEncoderLayer(prefix=layer_prefix, config=config, weights=weights)
+            )
+        self.layers = nn.ModuleList(layers)
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Run `inputs_embeds` through every layer in order and return the final hidden states.
+
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Already-embedded inputs; this module performs no embedding lookup itself and
+                uses them as the initial hidden states.
+            attention_mask (`torch.Tensor`, *optional*):
+                Mask forwarded unchanged to every layer. The attention module adds it to the attention
+                scores and checks that its shape is `(batch_size, 1, tgt_len, src_len)`.
+            causal_attention_mask (`torch.Tensor`, *optional*):
+                Second mask forwarded unchanged to every layer, with the same shape requirement; the
+                attention module applies it before `attention_mask`.
+
+        Returns:
+            The output of the last layer as a single tensor (not a tuple or an output object);
+            `inputs_embeds` itself when there are no layers.
+        """
+
+        hidden_states = inputs_embeds
+        # Both masks are passed positionally, in the order the layer's forward declares them.
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states, attention_mask, causal_attention_mask
+            )
+
+        return hidden_states
+
+
+class CLIPTextTransformer(nn.Module):
+    """Text tower: embeddings, encoder and final layer norm, with EOS-token pooling."""
+
+    def __init__(self, prefix: str, config: CLIPTextConfig, weights=None):
+        super().__init__()
+        if weights is None:
+            raise ValueError("`weights` is required to build the CLIP text encoder")
+        self.config = config
+        embed_dim = config.hidden_size
+        # NOTE(review): embeddings and final layer norm are freshly initialised here,
+        # not read from `weights` — confirm whether they must be loaded as well.
+        self.embeddings = CLIPTextEmbeddings(config)
+        self.encoder = CLIPEncoder(
+            prefix=f"{prefix}.encoder", config=config, weights=weights
+        )
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        # For `pooled_output` computation
+        self.eos_token_id = config.eos_token_id
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Encode `input_ids` and pool the hidden state at the end-of-text position.
+
+        Args:
+            input_ids: token ids; all leading dimensions are flattened into one batch dimension.
+            attention_mask: optional `[bsz, seq_len]` mask, expanded to 4D before the encoder.
+            position_ids: optional position indices forwarded to the embeddings.
+
+        Returns:
+            `BaseModelOutputWithPooling` with `last_hidden_state` (after the final layer norm)
+            and `pooler_output` (one row of `last_hidden_state` per sequence).
+
+        Raises:
+            ValueError: if `input_ids` is None.
+        """
+        if input_ids is None:
+            raise ValueError("You have to specify input_ids")
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+
+        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+        # CLIP's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = _create_4d_causal_attention_mask(
+            input_shape, hidden_states.dtype, device=hidden_states.device
+        )
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(
+                attention_mask, hidden_states.dtype
+            )
+
+        # `CLIPEncoder.forward` returns the hidden-state tensor itself, so it must not be
+        # indexed with `[0]` (that would select the first batch element).
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+        )
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+        token_ids = input_ids.to(dtype=torch.int, device=last_hidden_state.device)
+        if self.eos_token_id == 2:
+            # Legacy configs (incorrect `eos_token_id` before PR #24773): the eot token is
+            # the highest id in each sequence, so its position is the argmax of the ids.
+            eos_positions = token_ids.argmax(dim=-1)
+        else:
+            # Updated configs: take the first position equal to `eos_token_id`
+            # (`pad_token_id` might equal `eos_token_id`).
+            eos_positions = (token_ids == self.eos_token_id).int().argmax(dim=-1)
+        batch_index = torch.arange(
+            last_hidden_state.shape[0], device=last_hidden_state.device
+        )
+        pooled_output = last_hidden_state[batch_index, eos_positions]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state, pooler_output=pooled_output
+        )
+
+
+class CLIPTextModel(CLIPPreTrainedModel):
+    """Thin wrapper exposing a `CLIPTextTransformer` as `text_model`."""
+
+    config_class = CLIPTextConfig
+
+    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
+
+    def __init__(self, prefix, config: CLIPTextConfig, weights=None):
+        # `CLIPPreTrainedModel` is a plain `nn.Module`: its `__init__` takes no config
+        # and it has no `post_init`, so neither is used here.
+        super().__init__()
+        self.config = config
+        self.text_model = CLIPTextTransformer(prefix, config, weights)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Delegate to the wrapped `CLIPTextTransformer` and return its output unchanged.
+
+        Args:
+            input_ids: token ids, forwarded as-is.
+            attention_mask: optional `[bsz, seq_len]` mask, forwarded as-is.
+            position_ids: optional position indices, forwarded as-is.
+
+        Returns:
+            The `BaseModelOutputWithPooling` produced by `text_model`, exposing
+            `last_hidden_state` and `pooler_output` (pooled EOS-token states).
+
+        Raises:
+            ValueError: if `input_ids` is None (raised by `text_model`).
+        """
+        return self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+
+
+class CLIPVisionTransformer(nn.Module):
+    """Vision tower: patch embeddings, pre-layernorm and encoder (no pooling head)."""
+
+    def __init__(self, prefix, config: CLIPVisionConfig, weights):
+        super().__init__()
+        self.config = config
+        shared = dict(config=config, weights=weights)
+
+        self.embeddings = CLIPVisionEmbeddings(prefix=f"{prefix}.embeddings", **shared)
+        # "pre_layrnorm" (sic) is the spelling of both the attribute and the weight name.
+        self.pre_layrnorm = nn.LayerNorm.load(
+            prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps
+        )
+        self.encoder = CLIPEncoder(prefix=f"{prefix}.encoder", **shared)
+        # self.post_layernorm = nn.LayerNorm.load(prefix=f"{prefix}.post_layernorm", weights=weights, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+    ):
+        r"""
+        Encode an image batch.
+
+        Args:
+            pixel_values: image tensor handed to the embeddings; required despite the
+                `None` default.
+
+        Returns:
+            `BaseModelOutputWithPooling` whose only populated field is `last_hidden_state`
+            (the encoder output); `pooler_output` is left unset.
+
+        Raises:
+            ValueError: if `pixel_values` is None.
+        """
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedded = self.embeddings(pixel_values)
+        normed = self.pre_layrnorm(embedded)
+        encoded = self.encoder(inputs_embeds=normed)
+        # pooled_output = last_hidden_state[:, 0, :]
+        # pooled_output = self.post_layernorm(pooled_output)
+        return BaseModelOutputWithPooling(last_hidden_state=encoded)
+
+
+class CLIPVisionModel(CLIPPreTrainedModel):
+    """Thin wrapper exposing a `CLIPVisionTransformer` as `vision_model`."""
+
+    config_class = CLIPVisionConfig
+    main_input_name = "pixel_values"
+    _no_split_modules = ["CLIPEncoderLayer"]
+
+    def __init__(self, config: CLIPVisionConfig, prefix="vision_model", weights=None):
+        """
+        Args:
+            config: vision configuration handed to the transformer.
+            prefix: weight-name prefix of the transformer. NOTE(review): the default mirrors
+                the attribute name `vision_model` — confirm against the checkpoint key names.
+            weights: weight source the transformer loads its tensors from; required.
+
+        Raises:
+            ValueError: if `weights` is None.
+        """
+        # `CLIPPreTrainedModel` is a plain `nn.Module`: its `__init__` takes no config
+        # and it has no `post_init`, so neither is used here.
+        super().__init__()
+        if weights is None:
+            raise ValueError("`weights` is required to build the CLIP vision model")
+        self.config = config
+        self.vision_model = CLIPVisionTransformer(prefix, config, weights)
+
+    def get_input_embeddings(self) -> nn.Module:
+        """Return the `patch_embedding` module of the vision tower's embeddings."""
+        return self.vision_model.embeddings.patch_embedding
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+    ):
+        r"""
+        Delegate to the wrapped `CLIPVisionTransformer` and return its output unchanged.
+
+        Args:
+            pixel_values: image tensor, forwarded as-is; `vision_model` raises if it is None.
+
+        Returns:
+            The `BaseModelOutputWithPooling` produced by `vision_model`; only its
+            `last_hidden_state` field is populated.
+        """
+        return self.vision_model(
+            pixel_values=pixel_values,
+        )
+
+
+class CLIPModel(nn.Module):
+    """
+    Joint CLIP model: a text tower and a vision tower whose pooled outputs are projected to a common
+    dimension `config.projection_dim` and compared through a scaled dot product of normalised vectors.
+
+    NOTE(review): `visual_projection`, `text_projection` and `logit_scale` are freshly initialised
+    here rather than read from `weights` — confirm whether they should be loaded too.
+    """
+
+    def __init__(self, prefix, config: CLIPConfig, weights):
+        """
+        Args:
+            prefix: weight-name prefix; the towers are built under `{prefix}.text_model` and
+                `{prefix}.vision_model` (presumed checkpoint layout — verify against the caller).
+            config: `CLIPConfig` providing `text_config`, `vision_config`, `projection_dim` and
+                `logit_scale_init_value`.
+            weights: weight source forwarded to both towers.
+        """
+        super().__init__()
+        # The logit-scale initialisation below reads `self.config`.
+        self.config = config
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        # `weights` is forwarded so that each tower's encoder layers can load their tensors.
+        # NOTE(review): requires `CLIPTextTransformer.__init__` to accept `weights` — confirm.
+        self.text_model = CLIPTextTransformer(
+            prefix=f"{prefix}.text_model", config=text_config, weights=weights
+        )
+        self.vision_model = CLIPVisionTransformer(
+            prefix=f"{prefix}.vision_model", config=vision_config, weights=weights
+        )
+
+        self.visual_projection = nn.Linear(
+            self.vision_embed_dim, self.projection_dim, bias=False
+        )
+        self.text_projection = nn.Linear(
+            self.text_embed_dim, self.projection_dim, bias=False
+        )
+        self.logit_scale = nn.Parameter(
+            torch.tensor(self.config.logit_scale_init_value)
+        )
+        # No `post_init()` call: this class derives from `nn.Module`, which has no such method.
+
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Project the pooled output of the text tower with `text_projection`.
+
+        Args:
+            input_ids: token ids, forwarded to the text tower.
+            attention_mask: optional mask, forwarded to the text tower.
+            position_ids: optional position indices, forwarded to the text tower.
+
+        Returns:
+            text_features (`torch.FloatTensor` whose last dimension is `projection_dim`): the
+            projected pooled text output.
+        """
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+
+        # NOTE(review): index 1 is expected to be the pooled output of the text tower —
+        # verify against what `CLIPTextTransformer.forward` returns.
+        pooled_output = text_outputs[1]
+        text_features = self.text_projection(pooled_output)
+        return text_features
+
+    def get_image_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Project the pooled output of the vision tower with `visual_projection`.
+
+        Args:
+            pixel_values: image tensor, forwarded to the vision tower.
+
+        Returns:
+            image_features (`torch.FloatTensor` whose last dimension is `projection_dim`): the
+            projected pooled image output.
+        """
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+        )
+
+        # NOTE(review): `CLIPVisionTransformer.forward` only populates `last_hidden_state`
+        # (its pooling code is commented out), so presumably there is no pooled entry at
+        # index 1 yet — TODO confirm and restore pooling before relying on this method.
+        pooled_output = vision_outputs[1]  # pooled_output
+        return self.visual_projection(pooled_output)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+    ):
+        r"""
+        Compute image-text similarity logits from both towers.
+
+        Args:
+            input_ids: token ids for the text tower.
+            pixel_values: image tensor for the vision tower.
+            attention_mask: optional mask for the text tower.
+            position_ids: optional position indices for the text tower.
+
+        Returns:
+            A tuple `(logits_per_image, logits_per_text)`; `logits_per_image` is the transpose
+            of `logits_per_text`.
+
+        The logits are the product of the L2-normalised text and image projections, multiplied
+        by `exp(logit_scale)`.
+        """
+        # The tower forwards accept no `return_dict` argument, so none is passed
+        # (the name was never defined in this method either).
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+        )
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+
+        # NOTE(review): index 1 is expected to be each tower's pooled output; the vision tower
+        # currently populates only `last_hidden_state` — confirm before relying on this.
+        image_embeds = vision_outputs[1]
+        image_embeds = self.visual_projection(image_embeds)
+
+        text_embeds = text_outputs[1]
+        text_embeds = self.text_projection(text_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+        logits_per_image = logits_per_text.t()
+
+        return logits_per_image, logits_per_text
diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
new file mode 100644
index 00000000..f993fe72
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py
@@ -0,0 +1,543 @@
+# coding=utf-8
+# Copyright 2024 Cohere team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.models.globals import FLASH_DECODING
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+
+if SYSTEM == "cuda":
+    import dropout_layer_norm
+else:
+    dropout_layer_norm = None
+
+
+class CohereRotary(PositionRotaryEmbedding):
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ):
+        # Rotates `query` and `key` in place (nothing is returned); the per-call SYSTEM dispatch below may add some overhead.
+        if SYSTEM == "cuda":
+            import rotary_emb
+
+            q1 = query[..., ::2]  # even-indexed elements of the last dim (a view, not a copy)
+            q2 = query[..., 1::2]  # odd-indexed elements of the last dim
+
+            rotary_emb.apply_rotary(q1, q2, cos, sin, q1, q2, False)  # same views passed twice — presumably (inputs, outputs), i.e. in place; TODO confirm
+
+            k1 = key[..., ::2]  # same even/odd interleaved split for the keys
+            k2 = key[..., 1::2]
+
+            rotary_emb.apply_rotary(k1, k2, cos, sin, k1, k2, False)
+        elif SYSTEM == "rocm":
+            from vllm._C import ops
+
+            # NOTE: On ROCm systems, we use a RoPE implementation adapted from vLLM which launches a single kernel for both query/key, contrary to the flash-attn implementation used on NVIDIA systems.
+            # When compiling flash-attn rotary on ROCm, it appears hipcc is unable to unroll loops, resulting in even slower inference compared to eager: https://github.com/pytorch/pytorch/issues/113773
+
+            head_size = query.shape[-1]
+
+            # In-place operation, updating query and key.
+            ops.rotary_embedding(query, key, head_size, cos, sin, False)
+        else:
+            raise ValueError(
+                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+            )
+
+
+class CohereLayerNorm(nn.Module):
+    def __init__(self, prefix, weights, eps):
+        super().__init__()
+        weight = weights.get_sharded(f"{prefix}.weight", dim=0)
+        self.weight = nn.Parameter(weight)
+        # Fake weights
+        self.ones = weight.new_ones(weight.shape[1])
+        self.eps = eps
+
+    def forward(self, hidden_states):
+        if hidden_states.shape[-1] > 8192 or SYSTEM == "rocm":
+            hidden_states = hidden_states.reshape(
+                -1, self.weight.shape[0], self.weight.shape[1]
+            )
+            input_dtype = hidden_states.dtype
+            hidden_states = hidden_states.to(torch.float32)
+            mean = hidden_states.mean(-1, keepdim=True)
+            hidden_states_minus_mean = hidden_states - mean
+            variance = hidden_states_minus_mean.pow(2).mean(-1, keepdim=True)
+            hidden_states = hidden_states_minus_mean * torch.rsqrt(variance + self.eps)
+            hidden_states = self.weight.to(torch.float32) * hidden_states
+            hidden_states = hidden_states.view(-1, self.weight.shape[1])
+            return hidden_states.to(input_dtype)
+
+        (
+            hidden_states,
+            *rest,
+        ) = dropout_layer_norm.dropout_add_ln_fwd(
+            hidden_states,
+            None,
+            self.ones,
+            None,
+            None,
+            None,
+            None,
+            None,
+            0.0,
+            self.eps,
+            1.0,
+            0,
+            None,
+            False,
+            False,
+        )
+
+        # Required to apply one weight matrix per head
+        hidden_states = hidden_states.view(
+            -1, self.weight.shape[0], self.weight.shape[1]
+        )
+        hidden_states = self.weight * hidden_states
+        hidden_states = hidden_states.view(-1, self.weight.shape[1])
+
+        return hidden_states
+
+
+def load_attention(config, prefix, weights):
+    if config.num_attention_heads != config.num_key_value_heads:  # fewer KV heads than query heads: use the GQA loader
+        return _load_gqa(config, prefix, weights)
+    else:
+        return TensorParallelColumnLinear.load_multi(  # equal head counts: load q/k/v as one fused column-parallel linear
+            config,
+            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            dim=0,
+            weights=weights,
+            bias=config.attention_bias,
+        )
+
+
+def _load_gqa(config, prefix: str, weights):
+    assert config.hidden_size % config.num_attention_heads == 0
+    assert config.num_attention_heads % weights.process_group.size() == 0
+
+    weight = weights.get_multi_weights_col(
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+        quantize=config.quantize,
+        dim=0,
+    )
+
+    if config.quantize not in ["gptq", "awq", "marlin"]:
+        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
+
+        head_size = config.hidden_size // config.num_attention_heads
+        num_heads = config.num_attention_heads // weights.process_group.size()
+        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
+        assert list(weight.shape) == [
+            (num_heads + 2 * num_key_value_heads) * head_size,
+            config.hidden_size,
+        ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
+
+    if config.attention_bias:
+        w = [
+            weights.get_sharded(f"{p}.bias", dim=0)
+            for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
+        ]
+        bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device)
+    else:
+        bias = None
+
+    return TensorParallelColumnLinear(
+        get_linear(weight, bias=bias, quantize=config.quantize)
+    )
+
+
+class FlashCohereAttention(torch.nn.Module):
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.num_heads
+
+        self.rotary_emb = CohereRotary.static(
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+
+        self.softmax_scale = self.head_size**-0.5
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()}"
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()
+        self.num_key_value_heads = (
+            config.num_key_value_heads // weights.process_group.size()
+        )
+
+        self.query_key_value = load_attention(config, prefix, weights)
+
+        self.use_qk_norm = config.use_qk_norm
+        if self.use_qk_norm:
+            self.q_norm = CohereLayerNorm(
+                prefix=f"{prefix}.q_norm",
+                weights=weights,
+                eps=config.layer_norm_eps,
+            )
+            self.k_norm = CohereLayerNorm(
+                prefix=f"{prefix}.k_norm",
+                weights=weights,
+                eps=config.layer_norm_eps,
+            )
+        else:
+            self.q_norm = None
+            self.k_norm = None
+
+        self.o_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.o_proj",
+            weights=weights,
+            bias=config.attention_bias,
+        )
+        self.num_groups = self.num_heads // self.num_key_value_heads
+        self.kv_head_mapping = torch.arange(
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        input_lengths,
+        slots,
+        max_s,
+    ):
+        qkv = self.query_key_value(hidden_states)
+        query, key, value = qkv.split(
+            [
+                self.head_size * self.num_heads,
+                self.head_size * self.num_key_value_heads,
+                self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+
+        if self.use_qk_norm:
+            query = query.reshape(-1, self.head_size)
+            key = key.reshape(-1, self.head_size)
+            query = self.q_norm(query.contiguous())
+            key = self.k_norm(key.contiguous())
+
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_key_value_heads, self.head_size)
+        value = value.view(-1, self.num_key_value_heads, self.head_size)
+
+        self.rotary_emb(query, key, cos, sin)
+
+        reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attention(
+                query,
+                key,
+                value,
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(
+            attn_output.view(-1, self.num_heads * self.head_size), reduce=False
+        )
+
+
+class CohereMLP(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        act = config.hidden_act
+        self.act = (  # any "gelu*" name goes through torch's functional gelu; everything else is looked up in ACT2FN
+            ACT2FN[act]
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        )
+        # Fuse gate and up proj into a single column-parallel linear (gate first, then up)
+        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+            weights=weights,
+            dim=0,
+            bias=False,
+        )
+        self.down_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.down_proj",
+            weights=weights,
+            bias=False,
+        )
+        self.intermediate_size = (  # per-shard width of each of the gate/up halves
+            config.intermediate_size // weights.process_group.size()
+        )
+
+    def forward(self, hidden_states):
+        gate_up_states = self.gate_up_proj(hidden_states)
+        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)  # [:, 0] = gate half, [:, 1] = up half
+        return self.down_proj(  # reduce=False: the all_reduce is done once by FlashCohereLayer.forward
+            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], reduce=False
+        )
+
+
+class FlashCohereLayer(nn.Module):
+    def __init__(self, prefix: str, layer_id, config, weights):
+        super().__init__()
+        prefix = f"{prefix}.layers.{layer_id}"  # weight-name prefix of this decoder layer
+        self.self_attn = FlashCohereAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+        self.mlp = CohereMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+
+        self.input_layernorm = FastLayerNorm.load_no_bias(  # the only norm in the layer; its output feeds both branches
+            prefix=f"{prefix}.input_layernorm",
+            weights=weights,
+            eps=config.layer_norm_eps,
+        )
+        self.process_group = weights.process_group
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)  # returns (normed input, updated residual)
+
+        # Self Attention (reads the same normed input as the MLP below)
+        attn_output = self.self_attn(
+            normed_hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+
+        mlp_output = self.mlp(normed_hidden_states)  # parallel branch: takes the normed input, not the attention output
+        output = attn_output + mlp_output
+
+        if self.process_group.size() > 1:  # both branches used reduce=False, so one all_reduce covers their sum
+            torch.distributed.all_reduce(output, group=self.process_group)
+
+        return output, res
+
+
+class FlashCohereModel(torch.nn.Module):
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        process_group = weights.process_group
+        self.tp_rank = process_group.rank()
+        self.tp_world_size = process_group.size()
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.embed_tokens", weights=weights
+        )
+        self.layers = nn.ModuleList(  # one FlashCohereLayer per hidden layer, all sharing the same prefix
+            [
+                FlashCohereLayer(
+                    prefix,
+                    layer_id,
+                    config,
+                    weights,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = FastLayerNorm.load_no_bias(  # final norm applied after the last layer
+            prefix=f"{prefix}.norm", weights=weights, eps=config.layer_norm_eps
+        )
+
+        self.gradient_checkpointing = False
+
+        self.head_size = self.layers[0].self_attn.head_size  # attention geometry mirrored from layer 0
+        self.num_heads = self.layers[0].self_attn.num_heads  # per-shard value (already divided in the attention module)
+        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+
+        # Get rotary cos and sin for this forward
+        # Computed once from layer 0's rotary module to avoid indexing in each layer
+        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
+            position_ids, max_s, hidden_states.dtype
+        )
+
+        residual = None  # no residual exists before the first layer
+
+        for i, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[i],  # each layer gets its own (key cache, value cache) pair
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)  # the residual returned by the final norm is discarded
+
+        return hidden_states
+
+
+class FlashCohereForCausalLM(torch.nn.Module):
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        if not prefix:  # weights live under "model" or "<prefix>.model"
+            prefix = "model"
+        else:
+            prefix = f"{prefix}.model"
+
+        self.model = FlashCohereModel(prefix, config, weights)
+        try:
+            self.lm_head = SpeculativeHead.load(
+                config,
+                prefix="lm_head",
+                weights=weights,
+            )
+        except RuntimeError:  # fall back to the embedding matrix — presumably the checkpoint ties lm_head to embed_tokens; TODO confirm
+            self.lm_head = SpeculativeHead.load(
+                config,
+                prefix=f"{prefix}.embed_tokens",
+                weights=weights,
+            )
+        self.logit_scale = config.logit_scale  # multiplier applied to every logit in forward
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],  # accepted but not read in this method
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,  # accepted but not read in this method
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        hidden_states = self.model(
+            input_ids,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+        if lm_head_indices is not None:  # keep only the rows that need logits
+            hidden_states = hidden_states[lm_head_indices]
+        logits, speculative_logits = self.lm_head(hidden_states)
+        logits *= self.logit_scale  # in-place scaling
+        if speculative_logits is not None:
+            speculative_logits *= self.logit_scale
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
new file mode 100644
index 00000000..e469495f
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py
@@ -0,0 +1,748 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple, Any
+from text_generation_server.utils.import_utils import SYSTEM
+
+if SYSTEM != "ipex":
+    from vllm.model_executor.layers.fused_moe import fused_moe
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    FastLinear,
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
+from text_generation_server.utils.log import log_once
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+    def __init__(
+        self,
+        attn_pdrop: float = 0,
+        clip_qkv: Optional[float] = None,  # when set, DbrxAttention clamps the fused QKV output to [-clip_qkv, clip_qkv]
+        kv_n_heads: int = 1,  # number of key/value heads before sharding
+        rope_theta: float = 10000.0,  # passed as `base` to the rotary embedding
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.attn_pdrop = attn_pdrop
+        self.clip_qkv = clip_qkv
+        self.kv_n_heads = kv_n_heads
+        self.rope_theta = rope_theta
+
+        for k in ["model_type"]:  # the only extra key tolerated in kwargs
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:  # anything left over is rejected, even though it was already forwarded to super().__init__
+            raise ValueError(f"Found unknown {kwargs=}")
+
+
+class DbrxFFNConfig(PretrainedConfig):
+    def __init__(
+        self,
+        ffn_act_fn: Optional[dict] = None,  # dict with a "name" key; defaults to {"name": "silu"}
+        ffn_hidden_size: int = 3584,
+        moe_num_experts: int = 4,
+        moe_top_k: int = 1,
+        moe_jitter_eps: Optional[float] = None,
+        moe_loss_weight: float = 0.01,
+        moe_normalize_expert_weights: Optional[float] = 1,
+        uniform_expert_assignment: bool = False,  # True is rejected below
+        **kwargs: Any,
+    ):
+        super().__init__()  # NOTE(review): kwargs are not forwarded here, unlike DbrxAttentionConfig
+        if ffn_act_fn is None:
+            ffn_act_fn = {"name": "silu"}
+        self.ffn_act_fn = ffn_act_fn
+        self.ffn_hidden_size = ffn_hidden_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.moe_jitter_eps = moe_jitter_eps
+        self.moe_loss_weight = moe_loss_weight
+        self.moe_normalize_expert_weights = moe_normalize_expert_weights
+        self.uniform_expert_assignment = uniform_expert_assignment
+
+        if uniform_expert_assignment:
+            raise ValueError("`uniform_expert_assignment = True` is not supported")
+
+        for k in ["model_type"]:  # the only extra key tolerated in kwargs
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+
+class DbrxConfig(PretrainedConfig):
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        max_seq_len: int = 2048,
+        vocab_size: int = 32000,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        attn_config: Optional[DbrxAttentionConfig] = None,  # a plain dict is also accepted and converted below
+        ffn_config: Optional[DbrxFFNConfig] = None,  # a plain dict is also accepted and converted below
+        use_cache: bool = True,
+        initializer_range: float = 0.02,
+        output_router_logits: bool = False,
+        router_aux_loss_coef: float = 0.05,
+        **kwargs: Any,
+    ):
+        if attn_config is None:  # sub-config: default instance, built from a dict, or used as given
+            self.attn_config = DbrxAttentionConfig()
+        elif isinstance(attn_config, dict):
+            self.attn_config = DbrxAttentionConfig(**attn_config)
+        else:
+            self.attn_config = attn_config
+
+        if ffn_config is None:  # same three-way handling as attn_config
+            self.ffn_config = DbrxFFNConfig()
+        elif isinstance(ffn_config, dict):
+            self.ffn_config = DbrxFFNConfig(**ffn_config)
+        else:
+            self.ffn_config = ffn_config
+
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)  # removed from kwargs so it is passed exactly once below
+        if tie_word_embeddings:
+            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")
+
+        super().__init__(  # base initializer runs last, after all fields above are set
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+def promote_scalar(x: torch.Tensor) -> torch.Tensor:
+    return x.view(1) if len(x.size()) == 0 else x  # 0-dim tensor -> shape (1,); any other tensor is returned unchanged
+
+
+def load_attention(config, prefix, weights):
+    return TensorParallelColumnLinear.load_qkv(  # loads the single fused `Wqkv` tensor, without bias
+        config,
+        prefix=f"{prefix}.Wqkv",
+        weights=weights,
+        bias=False,
+        num_heads=config.n_heads,
+        num_key_value_heads=config.attn_config.kv_n_heads,
+    )
+
+
+def _load_experts(config, prefix, weights):
+    world_size = weights.process_group.size()
+    rank = weights.process_group.rank()
+
+    assert (
+        config.ffn_config.ffn_hidden_size % world_size == 0
+    ), f"The chosen size {config.ffn_config.ffn_hidden_size} is not compatible with sharding on {world_size} shards"
+
+    expert_size = config.ffn_config.ffn_hidden_size  # rows occupied by one expert in the stored tensor
+    block_size = expert_size // world_size  # rows of each expert owned by this rank
+    start = rank * block_size
+    stop = (rank + 1) * block_size
+
+    tensor = torch.empty(  # output buffer: this rank's block of every expert, stacked along dim 0
+        (config.ffn_config.moe_num_experts * block_size, config.d_model),
+        dtype=weights.dtype,
+        device=weights.device,
+    )
+
+    slice_ = weights._get_slice(f"{prefix}")  # presumably a lazily indexable view of the checkpoint tensor — TODO confirm
+
+    for i in range(config.ffn_config.moe_num_experts):
+        offset = i * expert_size  # first row of expert i in the stored tensor
+        expert_slice = slice_[start + offset : stop + offset]
+
+        tensor[i * block_size : (i + 1) * block_size] = expert_slice.to(
+            dtype=weights.dtype
+        ).to(device=weights.device)
+    return tensor
+
+
+def _load_experts_quantized(config, prefix, weights, cls):
+    world_size = weights.process_group.size()
+    rank = weights.process_group.rank()
+
+    assert (
+        config.ffn_config.ffn_hidden_size % world_size == 0
+    ), f"The chosen size {config.ffn_config.ffn_hidden_size} is not compatible with sharding on {world_size} shards"
+
+    expert_size = config.ffn_config.ffn_hidden_size  # rows occupied by one expert in the stored tensor
+    block_size = expert_size // world_size  # rows of each expert owned by this rank
+    start = rank * block_size
+    stop = (rank + 1) * block_size
+
+    slice_ = weights._get_slice(f"{prefix}")
+
+    experts = []  # one `cls`-wrapped linear per expert, in expert order
+    for i in range(config.ffn_config.moe_num_experts):
+        if config.quantize in ["gptq", "awq"]:  # checked inside the loop, so it raises on the first expert
+            raise NotImplementedError(
+                "Dbrx does not support gptq/awq quantization yet."
+            )
+        else:
+            offset = i * expert_size  # first row of expert i in the stored tensor
+            expert_slice = (
+                slice_[start + offset : stop + offset]
+                .to(dtype=weights.dtype)
+                .to(device=weights.device)
+            )
+
+        if cls == TensorParallelRowLinear:  # row-parallel: transpose the slice and pass the process group
+            expert_slice = expert_slice.t().contiguous()
+            linear = get_linear(expert_slice, None, config.quantize)
+            experts.append(cls(linear, weights.process_group))
+        else:
+            linear = get_linear(expert_slice, None, config.quantize)
+            experts.append(cls(linear))
+
+    return experts
+
+
+class DbrxAttention(torch.nn.Module):
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.clip_qkv = config.attn_config.clip_qkv  # optional clamp bound for the fused QKV output
+        self.num_heads = config.n_heads
+        self.hidden_size = config.d_model
+        self.head_size = self.hidden_size // self.num_heads
+
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.head_size,
+            base=config.attn_config.rope_theta,
+            device=weights.device,
+        )
+
+        self.softmax_scale = self.head_size**-0.5  # 1 / sqrt(head_size)
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()}"
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()  # from here on: heads owned by this shard
+        self.num_key_value_heads = (
+            config.attn_config.kv_n_heads // weights.process_group.size()
+        )
+
+        self.query_key_value = load_attention(config, prefix, weights)
+
+        self.o_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.out_proj",
+            weights=weights,
+            bias=False,
+        )
+        self.num_groups = self.num_heads // self.num_key_value_heads  # query heads sharing each KV head
+        self.kv_head_mapping = torch.arange(
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)  # entry i = index of the KV head used by query head i
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        qkv = self.query_key_value(hidden_states)
+        if self.clip_qkv is not None:
+            qkv = qkv.clamp(min=-self.clip_qkv, max=self.clip_qkv)  # symmetric clamp on q, k and v together
+
+        query, kv = qkv.split(  # queries first, then keys and values kept as one chunk
+            [
+                self.head_size * self.num_heads,
+                2 * self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)  # dim 1: index 0 = keys, index 1 = values
+
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)  # rotary applied to queries and keys only — presumably in place; TODO confirm
+
+        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)  # store this step's keys/values into the cache at `slots`
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attention(
+                query,
+                torch.select(kv, dim=1, index=0),
+                torch.select(kv, dim=1, index=1),
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))  # heads flattened back into one feature dim
+
+
+class DbrxNormAttentionNorm(nn.Module):
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.norm_1 = FastLayerNorm.load_no_bias(  # pre-attention norm; eps is hard-coded rather than read from config
+            prefix=f"{prefix}.norm_1", weights=weights, eps=1e-5
+        )
+        self.self_attn = DbrxAttention(
+            prefix=f"{prefix}.attn", config=config, weights=weights
+        )
+        self.norm_2 = FastLayerNorm.load_no_bias(  # post-attention norm, same hard-coded eps
+            prefix=f"{prefix}.norm_2",
+            weights=weights,
+            eps=1e-5,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        normed_hidden_states, res = self.norm_1(hidden_states, residual)  # returns (normed input, updated residual)
+
+        # Self Attention
+        attn_output = self.self_attn(
+            normed_hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+
+        # post-attention layer norm (norm_2 is a FastLayerNorm, not an RMS norm); it also returns the updated residual
+        normed_attn_res_output, attn_res = self.norm_2(attn_output, res)
+
+        return normed_attn_res_output, attn_res
+
+
+@torch.jit.script
+def select_experts(
+    gate_logits: torch.Tensor, top_k: int, moe_normalize_expert_weights: int  # NOTE(review): annotated int, but DbrxFFNConfig declares it Optional[float] — confirm
+):
+    # all_probs: (sequence_length, n_experts); softmax over dim 1, computed in float32
+    all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)
+    # weights, selected_experts: (sequence_length, top-k) — the top_k highest probabilities and their expert indices
+    weights, selected_experts = torch.topk(all_probs, top_k, dim=-1)
+    if moe_normalize_expert_weights:  # a value of 0 skips normalization
+        weights = weights / torch.norm(  # divide by the p-norm of each row's top-k weights, p = moe_normalize_expert_weights
+            weights, p=moe_normalize_expert_weights, dim=-1, keepdim=True
+        )
+    weights = weights.view(-1)  # both outputs are flattened to 1-D, row by row
+    selected_experts = selected_experts.view(-1)
+
+    return selected_experts, weights
+
+
+@torch.jit.script
+def round_up(x: torch.Tensor, value: int):
+    """Add `value - 1` to `x`, truncate-divide by `value`, multiply back.
+
+    For non-negative integers this yields the smallest multiple of `value`
+    that is >= `x`.
+    """
+    shifted = x + (value - 1)
+    return torch.div(shifted, value, rounding_mode="trunc") * value
+
+
+class BlockSparseMoE(nn.Module):
+    """DBRX mixture-of-experts feed-forward evaluated through `fused_moe`.
+
+    Router weights come from `{prefix}.router.layer`; expert weights come
+    from `{prefix}.experts.mlp.{w1,v1,w2}`.
+    """
+
+    def __init__(self, prefix, config: DbrxConfig, weights):
+        super().__init__()
+        ffn_config = config.ffn_config
+
+        self.moe_normalize_expert_weights = ffn_config.moe_normalize_expert_weights
+        self.hidden_dim = config.d_model
+        # Per-shard FFN width.
+        self.ffn_dim = ffn_config.ffn_hidden_size // weights.process_group.size()
+        self.num_experts = ffn_config.moe_num_experts
+        self.top_k = ffn_config.moe_top_k
+
+        act = ffn_config.ffn_act_fn["name"]
+        if "gelu" in act:
+            # Only these two names select the tanh approximation.
+            approximate = (
+                "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+            )
+            self.act = lambda x: torch.nn.functional.gelu(x, approximate=approximate)
+        elif "silu" in act:
+            self.act = torch.nn.functional.silu
+        else:
+            self.act = ACT2FN[act]
+
+        # Router / gating projection.
+        self.gate = FastLinear.load(
+            config, f"{prefix}.router.layer", weights, bias=False
+        )
+
+        # Each merged expert tensor is viewed as
+        # (num_experts, ffn_dim, hidden_dim).
+        expert_shape = (self.num_experts, self.ffn_dim, self.hidden_dim)
+        w1 = _load_experts(config, f"{prefix}.experts.mlp.w1", weights)
+        w1 = w1.view(*expert_shape)
+        v1 = _load_experts(config, f"{prefix}.experts.mlp.v1", weights)
+        v1 = v1.view(*expert_shape)
+        # w1 and v1 are stacked along the ffn dimension.
+        self.wv1 = torch.cat([w1, v1], dim=1)
+
+        w2 = _load_experts(config, f"{prefix}.experts.mlp.w2", weights)
+        w2 = w2.view(*expert_shape)
+        # Stored as (num_experts, hidden_dim, ffn_dim).
+        self.w2 = w2.transpose(1, 2).contiguous()
+
+        self.process_group = weights.process_group
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Route `x` through the experts and return a tensor shaped like `x`."""
+        gate_logits = self.gate(x)
+        moe_out = fused_moe(
+            x,
+            self.wv1,
+            self.w2,
+            gate_logits,
+            self.top_k,
+            renormalize=self.moe_normalize_expert_weights,
+            inplace=True,
+        )
+
+        # Sum the partial results of every shard.
+        if self.process_group.size() > 1:
+            torch.distributed.all_reduce(moe_out, group=self.process_group)
+
+        return moe_out.view(*x.shape)
+
+
+class DenseMoE(nn.Module):
+    """DBRX mixture-of-experts feed-forward that evaluates every expert.
+
+    All experts are run on all rows; the output of experts outside the
+    top-k is multiplied by a zero routing weight.
+    """
+
+    def __init__(self, prefix, config: DbrxConfig, weights):
+        super().__init__()
+        ffn_config = config.ffn_config
+
+        self.moe_normalize_expert_weights = ffn_config.moe_normalize_expert_weights
+        self.hidden_dim = config.d_model
+        # Per-shard FFN width.
+        self.ffn_dim = ffn_config.ffn_hidden_size // weights.process_group.size()
+        self.num_experts = ffn_config.moe_num_experts
+        self.top_k = ffn_config.moe_top_k
+
+        act = ffn_config.ffn_act_fn["name"]
+        if "gelu" in act:
+            # Only these two names select the tanh approximation.
+            approximate = (
+                "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+            )
+            self.act = lambda x: torch.nn.functional.gelu(x, approximate=approximate)
+        elif "silu" in act:
+            self.act = torch.nn.functional.silu
+        else:
+            self.act = ACT2FN[act]
+
+        # Router / gating projection.
+        self.gate = FastLinear.load(
+            config, f"{prefix}.router.layer", weights, bias=False
+        )
+
+        experts_prefix = f"{prefix}.experts.mlp"
+        self.w1 = _load_experts_quantized(
+            config,
+            prefix=f"{experts_prefix}.w1",
+            weights=weights,
+            cls=TensorParallelColumnLinear,
+        )
+        self.w2 = _load_experts_quantized(
+            config,
+            prefix=f"{experts_prefix}.w2",
+            weights=weights,
+            cls=TensorParallelRowLinear,
+        )
+        self.v1 = _load_experts_quantized(
+            config,
+            prefix=f"{experts_prefix}.v1",
+            weights=weights,
+            cls=TensorParallelColumnLinear,
+        )
+
+        self.process_group = weights.process_group
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the weighted sum of all experts to `x`.
+
+        `x` is flattened to 2-D on its last dimension; the result has shape
+        `(rows, hidden_dim)` and is not reshaped back.
+        """
+        tokens = x.view(-1, x.shape[-1])
+
+        # Routing probabilities over dim 1, computed in float32.
+        routing = torch.nn.functional.softmax(
+            self.gate(tokens), dim=1, dtype=torch.float
+        )
+
+        num_dropped = self.num_experts - self.top_k
+        if num_dropped > 0:
+            # Zero the weights of the lowest-probability experts in place.
+            _, dropped_experts = torch.topk(
+                routing,
+                num_dropped,
+                largest=False,
+                sorted=False,
+                dim=1,
+            )
+            routing.scatter_(1, dropped_experts, 0)
+
+        if self.moe_normalize_expert_weights:
+            # p-norm renormalization with p = moe_normalize_expert_weights.
+            routing_norm = torch.norm(
+                routing, p=self.moe_normalize_expert_weights, dim=-1, keepdim=True
+            )
+            routing = routing / routing_norm
+        routing = routing.to(tokens.dtype)
+
+        out = tokens.new_zeros(tokens.shape[0], self.hidden_dim)
+        for expert_idx in range(self.num_experts):
+            gated = self.act(self.w1[expert_idx](tokens)) * self.v1[expert_idx](tokens)
+            # reduce=False: the cross-shard sum is done once, after the loop.
+            expert_out = self.w2[expert_idx](gated, reduce=False)
+            out += expert_out * routing[:, expert_idx].view(-1, 1)
+
+        # Sum the partial results of every shard.
+        if self.process_group.size() > 1:
+            torch.distributed.all_reduce(out, group=self.process_group)
+
+        return out
+
+
+class DbrxLayer(nn.Module):
+    """One DBRX block: norm/attention/norm followed by a mixture of experts.
+
+    Weights are read from `{prefix}.blocks.{layer_id}`.
+    """
+
+    def __init__(self, prefix: str, layer_id, config, weights):
+        super().__init__()
+        block_prefix = f"{prefix}.blocks.{layer_id}"
+
+        self.attn = DbrxNormAttentionNorm(
+            prefix=f"{block_prefix}.norm_attn_norm",
+            config=config,
+            weights=weights,
+        )
+
+        # Unquantized checkpoints use BlockSparseMoE; any quantization uses
+        # DenseMoE.
+        if config.quantize is None:
+            moe_cls = BlockSparseMoE
+        else:
+            moe_cls = DenseMoE
+        self.moe = moe_cls(f"{block_prefix}.ffn", config, weights)
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        """Return `(moe_output, residual)` for this block."""
+        normed_attn, next_residual = self.attn(
+            hidden_states,
+            residual,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+        return self.moe(normed_attn), next_residual
+
+
+class DbrxModel(torch.nn.Module):
+    """DBRX decoder stack: token embedding, `n_layers` blocks, final norm.
+
+    Embeddings are read from `{prefix}.wte` and the final layer norm from
+    `{prefix}.norm_f`.
+    """
+
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.wte", weights=weights
+        )
+
+        blocks = [
+            DbrxLayer(prefix, layer_id, config, weights)
+            for layer_id in range(config.n_layers)
+        ]
+        self.layers = nn.ModuleList(blocks)
+
+        self.norm = FastLayerNorm.load_no_bias(
+            prefix=f"{prefix}.norm_f", weights=weights, eps=1e-5
+        )
+
+        # Attention geometry is taken from the first block.
+        first_attn = self.layers[0].attn.self_attn
+        self.head_size = first_attn.head_size
+        self.num_heads = first_attn.num_heads
+        self.num_key_value_heads = first_attn.num_key_value_heads
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+    ) -> torch.Tensor:
+        """Embed `input_ids`, run every block and apply the final norm."""
+        hidden_states = self.embed_tokens(input_ids)
+
+        # cos/sin are computed once from the first block's rotary embedding
+        # and passed to every block.
+        rotary_emb = self.layers[0].attn.self_attn.rotary_emb
+        cos, sin = rotary_emb.get_cos_sin(position_ids, max_s, hidden_states.dtype)
+
+        residual = None
+        for layer_idx, block in enumerate(self.layers):
+            hidden_states, residual = block(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[layer_idx],
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+            )
+
+        normed_states, _ = self.norm(hidden_states, residual)
+        return normed_states
+
+
+class FlashDbrxForCausalLM(torch.nn.Module):
+    """DBRX decoder stack followed by a (speculative) LM head.
+
+    The decoder weights live under `transformer` (or `{prefix}.transformer`
+    when a non-empty `prefix` is given); the LM head is loaded from the
+    top-level `lm_head` prefix.
+    """
+
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        if not prefix:
+            prefix = "transformer"
+        else:
+            prefix = f"{prefix}.transformer"
+
+        # DbrxModel's signature is (prefix, config, weights); the prefix must
+        # be forwarded, otherwise construction fails with a TypeError.
+        self.model = DbrxModel(prefix, config, weights)
+        self.lm_head = SpeculativeHead.load(
+            config,
+            prefix="lm_head",
+            weights=weights,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Return `(logits, speculative_logits)`.
+
+        `prefill_cache_indices` and `adapter_data` are accepted but not used
+        by this model. When `lm_head_indices` is given, only those rows of
+        the hidden states are passed to the LM head.
+        """
+        hidden_states = self.model(
+            input_ids,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]
+        logits, speculative_logits = self.lm_head(hidden_states)
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
new file mode 100644
index 00000000..beff08b3
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
@@ -0,0 +1,500 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
+    FastRMSNorm,
+)
+
+
+class Gemma2Config(PretrainedConfig):
+    """Configuration holder for Gemma2 models.
+
+    Stores the architecture hyper-parameters as attributes and forwards the
+    token ids, `tie_word_embeddings` and any extra keyword arguments to
+    `PretrainedConfig`.
+    """
+
+    def __init__(
+        self,
+        vocab_size=256128,
+        hidden_size=3072,
+        intermediate_size=24576,
+        num_hidden_layers=28,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        head_dim=256,
+        hidden_act="gelu_pytorch_tanh",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        # Model dimensions.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.max_position_embeddings = max_position_embeddings
+
+        # Attention geometry. A missing KV head count falls back to the
+        # number of attention heads (backward compatibility).
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = (
+            num_attention_heads if num_key_value_heads is None else num_key_value_heads
+        )
+        self.head_dim = head_dim
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        # Rotary embedding settings.
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        # Remaining hyper-parameters.
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class Gemma2FastRMSNorm(FastRMSNorm):
+    """RMS norm whose multiplication runs in float32.
+
+    The scale is loaded in float32 with 1 added to the stored tensor; the
+    output is cast to the loader's original dtype.
+    """
+
+    @classmethod
+    def load(cls, prefix: str, weights, eps=1e-6):
+        """Build the norm from `{prefix}.weight`.
+
+        `weights.dtype` is switched to float32 only for the duration of the
+        read and is restored even if `get_tensor` raises, so a failed load
+        cannot leave the shared loader in float32.
+        """
+        dtype = weights.dtype
+        weights.dtype = torch.float32
+        try:
+            weight = weights.get_tensor(f"{prefix}.weight") + 1
+        finally:
+            weights.dtype = dtype
+        new = cls(weight, eps)
+        # Output dtype used by forward().
+        new.dtype = dtype
+        return new
+
+    # perform the multiplication in full precision and downcast after
+    def forward(self, hidden_states, residual=None):
+        """Return `(normalized, residual)`.
+
+        When `residual` is given it is added to `hidden_states` IN PLACE
+        before normalizing; the returned residual is that pre-norm sum.
+        """
+        if residual is not None:
+            hidden_states += residual
+        residual = hidden_states
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        hidden_states = hidden_states * self.weight
+        return hidden_states.to(self.dtype), residual
+
+
+def load_attention(config, prefix: str, weights):
+    """Load the fused q/k/v projection found under `prefix`.
+
+    When the query and key/value head counts are equal, the three
+    projections are loaded with `load_multi` (concatenated on dim 0, no
+    bias); otherwise loading is delegated to `_load_gqa`.
+    """
+    if config.num_attention_heads == config.num_key_value_heads:
+        qkv_prefixes = [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
+        return TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=qkv_prefixes,
+            dim=0,
+            weights=weights,
+            bias=False,
+        )
+
+    return _load_gqa(config, prefix, weights)
+
+
+def _load_gqa(config, prefix: str, weights):
+    """Load the fused q/k/v projection when query and KV head counts differ.
+
+    For non-gptq/awq/marlin weights, the tensor is moved to the loader's
+    dtype and device and its shape is checked against the per-shard
+    expectation `[(num_heads + 2 * num_key_value_heads) * head_dim,
+    hidden_size]`.
+
+    Raises:
+        AssertionError: if the attention heads are not divisible by the
+            number of shards, or the loaded shape is unexpected.
+    """
+    world_size = weights.process_group.size()
+    assert config.num_attention_heads % world_size == 0
+
+    weight = weights.get_multi_weights_col(
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+        quantize=config.quantize,
+        dim=0,
+    )
+
+    if config.quantize not in ["gptq", "awq", "marlin"]:
+        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
+
+        head_size = config.head_dim
+        num_heads = config.num_attention_heads // world_size
+        num_key_value_heads = config.num_key_value_heads // world_size
+        # The same per-shard shape feeds both the check and its message, so
+        # the reported expectation always matches what was compared.
+        expected_shape = [
+            (num_heads + 2 * num_key_value_heads) * head_size,
+            config.hidden_size,
+        ]
+        assert (
+            list(weight.shape) == expected_shape
+        ), f"{list(weight.shape)} != {expected_shape}"
+
+    return TensorParallelColumnLinear(
+        get_linear(weight, bias=None, quantize=config.quantize)
+    )
+
+
+class FlashGemma2Attention(torch.nn.Module):
+    """Gemma2 self-attention with rotary embeddings and a KV cache.
+
+    The q/k/v projections are loaded fused from `{prefix}.{q,k,v}_proj` and
+    the output projection from `{prefix}.o_proj`. Head counts stored on the
+    instance are per shard.
+    """
+
+    def __init__(self, prefix: str, config, weights, causal: bool, is_sliding: bool):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.head_size = config.head_dim
+        self.causal = causal
+        # -1 is passed as `window_size_left` when the layer is not sliding.
+        if is_sliding:
+            self.window_size = config.sliding_window
+        else:
+            self.window_size = -1
+
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+
+        # The scale is derived from `query_pre_attn_scalar`, not from the
+        # head size (previous formula kept below for reference).
+        # self.softmax_scale = self.head_size**-0.5
+        self.softmax_scale = config.query_pre_attn_scalar**-0.5
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()}"
+            )
+        # From here on, head counts are per shard.
+        self.num_heads = self.num_heads // weights.process_group.size()
+        self.num_key_value_heads = (
+            config.num_key_value_heads // weights.process_group.size()
+        )
+
+        self.query_key_value = load_attention(config, prefix, weights)
+
+        self.o_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.o_proj",
+            weights=weights,
+            bias=False,
+        )
+        # Each KV head index is repeated `num_groups` times, giving one entry
+        # per query head.
+        self.num_groups = self.num_heads // self.num_key_value_heads
+        self.kv_head_mapping = torch.arange(
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        """Compute attention and return the output projection.
+
+        Uses `attention` when `cu_seqlen_prefill` is not None (prefill) and
+        `paged_attention` over the cache otherwise (decode).
+        """
+        qkv = self.query_key_value(hidden_states)
+        # Split the fused projection into queries and the packed key/value part.
+        query, kv = qkv.split(
+            [
+                self.head_size * self.num_heads,
+                2 * self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        # Index 0 on dim 1 is the key, index 1 the value.
+        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
+
+        # The return value is discarded; presumably query and key are rotated
+        # in place -- TODO confirm against PositionRotaryEmbedding.
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
+
+        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attention(
+                query,
+                torch.select(kv, dim=1, index=0),
+                torch.select(kv, dim=1, index=1),
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+                causal=self.causal,
+                window_size_left=self.window_size,
+            )
+        # Decode
+        else:
+            # NOTE(review): `self.window_size` is not passed on this path, so
+            # the sliding window only affects prefill here -- confirm intended.
+            attn_output = paged_attention(
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
+
+
+class Gemma2MLP(nn.Module):
+    """Gated feed-forward block with fused gate/up projections.
+
+    Weights are read from `{prefix}.gate_proj`, `{prefix}.up_proj` and
+    `{prefix}.down_proj`, all without bias.
+    """
+
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        act = config.hidden_act
+        if "gelu" in act:
+            # Only these two names select the tanh approximation.
+            approximate = (
+                "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+            )
+            self.act = lambda x: torch.nn.functional.gelu(x, approximate=approximate)
+        else:
+            self.act = ACT2FN[act]
+
+        # gate_proj and up_proj are loaded as one projection,
+        # concatenated on dim 0.
+        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+            weights=weights,
+            dim=0,
+            bias=False,
+        )
+        self.down_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.down_proj",
+            weights=weights,
+            bias=False,
+        )
+        # Per-shard intermediate width.
+        self.intermediate_size = (
+            config.intermediate_size // weights.process_group.size()
+        )
+
+    def forward(self, hidden_states):
+        """Return `down_proj(act(gate) * up)`."""
+        fused = self.gate_up_proj(hidden_states)
+        fused = fused.view(-1, 2, self.intermediate_size)
+        gate = fused[:, 0]
+        up = fused[:, 1]
+        return self.down_proj(self.act(gate) * up)
+
+
+class FlashGemma2Layer(nn.Module):
+    """One Gemma2 decoder layer.
+
+    Both the attention and the MLP sub-blocks are wrapped by a norm before
+    and a norm after: `input_layernorm` / `post_attention_layernorm` and
+    `pre_feedforward_layernorm` / `post_feedforward_layernorm`.
+    """
+
+    def __init__(self, prefix: str, config, weights, causal: bool, is_sliding: bool):
+        super().__init__()
+        self.self_attn = FlashGemma2Attention(
+            prefix=f"{prefix}.self_attn",
+            config=config,
+            weights=weights,
+            causal=causal,
+            is_sliding=is_sliding,
+        )
+        self.mlp = Gemma2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+
+        def load_norm(name):
+            # All four norms share the loader and epsilon.
+            return Gemma2FastRMSNorm.load(
+                prefix=f"{prefix}.{name}",
+                weights=weights,
+                eps=config.rms_norm_eps,
+            )
+
+        self.input_layernorm = load_norm("input_layernorm")
+        self.post_attention_layernorm = load_norm("post_attention_layernorm")
+        self.pre_feedforward_layernorm = load_norm("pre_feedforward_layernorm")
+        self.post_feedforward_layernorm = load_norm("post_feedforward_layernorm")
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        """Return `(normed_mlp_output, residual)`.
+
+        The returned residual is the normed attention output plus the
+        residual produced by `input_layernorm`.
+        """
+        attn_input, skip = self.input_layernorm(hidden_states, residual)
+
+        attn_output = self.self_attn(
+            attn_input,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+
+        # Norm the attention output first, then add the residual.
+        attn_normed, _ = self.post_attention_layernorm(attn_output)
+        attn_with_skip = attn_normed + skip
+
+        mlp_input, _ = self.pre_feedforward_layernorm(attn_with_skip)
+        mlp_normed, _ = self.post_feedforward_layernorm(self.mlp(mlp_input))
+
+        return mlp_normed, attn_with_skip
+
+
+class FlashGemma2Model(torch.nn.Module):
+    """Stack of Gemma2 decoder layers followed by a final RMS norm.
+
+    Takes already-embedded inputs. Layers with an even index are built with
+    `is_sliding=True`.
+    """
+
+    def __init__(self, prefix: str, config, weights, causal: bool):
+        super().__init__()
+
+        group = weights.process_group
+        self.tp_rank = group.rank()
+        self.tp_world_size = group.size()
+
+        decoder_layers = [
+            FlashGemma2Layer(
+                prefix=f"{prefix}.layers.{layer_id}",
+                config=config,
+                weights=weights,
+                causal=causal,
+                is_sliding=layer_id % 2 == 0,
+            )
+            for layer_id in range(config.num_hidden_layers)
+        ]
+        self.layers = nn.ModuleList(decoder_layers)
+
+        self.norm = Gemma2FastRMSNorm.load(
+            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
+        )
+
+        # Attention geometry is taken from the first layer.
+        first_attn = self.layers[0].self_attn
+        self.head_size = first_attn.head_size
+        self.num_heads = first_attn.num_heads
+        self.num_key_value_heads = first_attn.num_key_value_heads
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+    ) -> torch.Tensor:
+        """Run every layer on `inputs_embeds` and apply the final norm."""
+        hidden_states = inputs_embeds
+
+        # cos/sin are computed once from the first layer's rotary embedding
+        # and passed to every layer.
+        rotary_emb = self.layers[0].self_attn.rotary_emb
+        cos, sin = rotary_emb.get_cos_sin(position_ids, max_s, hidden_states.dtype)
+
+        residual = None
+        for layer_idx, decoder_layer in enumerate(self.layers):
+            hidden_states, residual = decoder_layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[layer_idx],
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+            )
+
+        normed_states, _ = self.norm(hidden_states, residual)
+        return normed_states
+
+
+class FlashGemma2ForCausalLM(torch.nn.Module):
+    """Gemma2 token embedding, decoder stack and (speculative) LM head.
+
+    Weights live under `model` (or `{prefix}.model` when a non-empty prefix
+    is given). The embedding table is multiplied by `sqrt(hidden_size)` once
+    at construction time.
+    """
+
+    def __init__(self, prefix: str, config, weights, *, causal: bool = True):
+        super().__init__()
+
+        prefix = f"{prefix}.model" if prefix else "model"
+
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.embed_tokens", weights=weights
+        )
+        # Scale the embedding table in place.
+        self.embed_tokens.weight *= config.hidden_size**0.5
+
+        self.model = FlashGemma2Model(
+            prefix=prefix, config=config, weights=weights, causal=causal
+        )
+
+        # With tied embeddings the head is loaded from the embedding prefix.
+        if config.tie_word_embeddings:
+            head_prefix = f"{prefix}.embed_tokens"
+        else:
+            head_prefix = f"{prefix}.lm_head"
+        self.lm_head = SpeculativeHead.load(
+            prefix=head_prefix,
+            config=config,
+            weights=weights,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Return `(logits, speculative_logits)`.
+
+        `prefill_cache_indices` and `adapter_data` are accepted but not used
+        by this model. When `lm_head_indices` is given, only those rows of
+        the hidden states are passed to the LM head.
+        """
+        embedded = self.embed_tokens(input_ids)
+        hidden_states = self.model(
+            embedded,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]
+
+        logits, speculative_logits = self.lm_head(hidden_states)
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
new file mode 100644
index 00000000..14b62b00
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@@ -0,0 +1,477 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
+    FastRMSNorm,
+)
+
+
+class GemmaConfig(PretrainedConfig):  # configuration holder for the Gemma model; defaults are the __init__ keyword values
+    def __init__(
+        self,
+        vocab_size=256128,
+        hidden_size=3072,
+        intermediate_size=24576,
+        num_hidden_layers=28,
+        num_attention_heads=16,
+        num_key_value_heads=16,  # None is also accepted and falls back to num_attention_heads below
+        head_dim=256,
+        hidden_act="gelu_pytorch_tanh",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,  # forwarded unchanged to PretrainedConfig.__init__
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.head_dim = head_dim
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility: a missing KV head count means one KV head per attention head
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(  # called last, after the model-specific attributes above are set
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class GemmaFastRMSNorm(FastRMSNorm):  # RMS norm whose scale is (checkpoint weight + 1) and whose math runs in float32
+    @classmethod
+    def load(cls, prefix: str, weights, eps=1e-6):  # builds the norm from the tensor stored at f"{prefix}.weight"
+        dtype = weights.dtype  # remember the loader dtype so it can be restored below
+        weights.dtype = torch.float32  # temporary override; presumably makes get_tensor return float32 — confirm
+        weight = weights.get_tensor(f"{prefix}.weight") + 1  # the scale used in forward is the stored value plus one
+        weights.dtype = dtype  # restore the loader dtype for subsequent loads
+        new = cls(weight, eps)
+        new.dtype = dtype  # forward() casts its output back to this dtype
+        return new
+
+    # perform the multiplication in full precision and downcast after
+    def forward(self, hidden_states, residual=None):  # returns (normalised output, pre-normalisation sum)
+        if residual is not None:
+            hidden_states += residual  # in-place add: the caller's hidden_states object is modified
+        residual = hidden_states  # captured before the float32 upcast and the normalisation
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares over the last dimension
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        hidden_states = hidden_states * self.weight
+        return hidden_states.to(self.dtype), residual
+
+
+def load_attention(config, prefix: str, weights):  # returns the layer that projects hidden states to fused q/k/v
+    if config.num_attention_heads != config.num_key_value_heads:  # differing head counts go through _load_gqa
+        return _load_gqa(config, prefix, weights)
+    else:
+        return TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],  # q, k, v in this order
+            dim=0,  # NOTE(review): presumably the axis the three projections are concatenated on — confirm
+            weights=weights,
+            bias=False,
+        )
+
+
+def _load_gqa(config, prefix: str, weights):
+    assert config.num_attention_heads % weights.process_group.size() == 0
+
+    weight = weights.get_multi_weights_col(
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+        quantize=config.quantize,
+        dim=0,
+    )
+
+    if config.quantize not in ["gptq", "awq", "marlin"]:
+        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
+
+        head_size = config.head_dim
+        num_heads = config.num_attention_heads // weights.process_group.size()
+        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
+        assert list(weight.shape) == [
+            (num_heads + 2 * num_key_value_heads) * head_size,
+            config.hidden_size,
+        ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
+
+    return TensorParallelColumnLinear(
+        get_linear(weight, bias=None, quantize=config.quantize)
+    )
+
+
+class FlashGemmaAttention(torch.nn.Module):  # attention block: fused q/k/v projection, rotary embedding, KV cache write, output projection
+    def __init__(self, prefix: str, config, weights, causal: bool):
+        super().__init__()
+        self.num_heads = config.num_attention_heads  # global count here; divided by the shard count below
+        self.head_size = config.head_dim
+        self.causal = causal  # forwarded to the prefill attention call only
+
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+
+        self.softmax_scale = self.head_size**-0.5  # 1 / sqrt(head_size)
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()}"
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()  # per-shard query heads from here on
+        self.num_key_value_heads = (
+            config.num_key_value_heads // weights.process_group.size()  # NOTE(review): floors to 0 with fewer KV heads than shards, which would break the num_groups division below — confirm such setups are rejected earlier
+        )
+
+        self.query_key_value = load_attention(config, prefix, weights)
+
+        self.o_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.o_proj",
+            weights=weights,
+            bias=False,
+        )
+        self.num_groups = self.num_heads // self.num_key_value_heads  # query heads per KV head on this shard
+        self.kv_head_mapping = torch.arange(
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)  # each KV head index repeated num_groups times
+
+    def forward(  # returns the output projection of the attention result
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,  # None selects the decode branch below
+        kv_cache,  # indexed as kv_cache[0] / kv_cache[1] for the key / value stores
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        qkv = self.query_key_value(hidden_states)
+        query, kv = qkv.split(  # query first, then key and value packed together
+            [
+                self.head_size * self.num_heads,
+                2 * self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)  # index 0 on dim 1 is used as key, index 1 as value
+
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)  # return value unused; presumably rotates query/key in place — confirm
+
+        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)  # hands key/value and the cache pair to the cache helper
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attention(  # return value unused; attn_output is passed in as an argument
+                query,
+                torch.select(kv, dim=1, index=0),
+                torch.select(kv, dim=1, index=1),
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+                causal=self.causal,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(  # reads keys/values from the cache rather than from `kv`
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))  # flatten the heads before the output projection
+
+
+class GemmaMLP(nn.Module):  # gated MLP: act(gate) * up, followed by the down projection
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        act = config.hidden_act
+        self.act = (
+            ACT2FN[act]  # non-gelu names are looked up in the transformers activation table
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(  # every name containing "gelu" uses torch's gelu directly
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"  # tanh approximation only for these two names
+                ),
+            )
+        )
+        # Fuse gate and up proj
+        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],  # gate first, up second
+            weights=weights,
+            dim=0,
+            bias=False,
+        )
+        self.down_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.down_proj",
+            weights=weights,
+            bias=False,
+        )
+        self.intermediate_size = (
+            config.intermediate_size // weights.process_group.size()  # per-shard width, used by the view in forward
+        )
+
+    def forward(self, hidden_states):
+        gate_up_states = self.gate_up_proj(hidden_states)
+        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)  # [:, 0] is treated as gate, [:, 1] as up (assumes load_multi keeps the prefix order)
+        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+
+
+class FlashGemmaLayer(nn.Module):  # one decoder layer: input norm, self-attention, post-attention norm, MLP
+    def __init__(self, prefix: str, config, weights, causal: bool):
+        super().__init__()
+        self.self_attn = FlashGemmaAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights, causal=causal
+        )
+        self.mlp = GemmaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+
+        self.input_layernorm = GemmaFastRMSNorm.load(
+            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = GemmaFastRMSNorm.load(
+            prefix=f"{prefix}.post_attention_layernorm",
+            weights=weights,
+            eps=config.rms_norm_eps,
+        )
+
+    def forward(  # returns (mlp_output, residual); the two are NOT summed here
+        self,
+        hidden_states,
+        residual,  # may be None (the model passes None for the first layer)
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)  # the norm adds `residual` first; `res` is that pre-norm sum
+
+        # Self Attention
+        attn_output = self.self_attn(
+            normed_hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+
+        # faster post attention rms norm
+        normed_attn_res_output, attn_res = self.post_attention_layernorm(
+            attn_output, res  # the residual add of attention output and `res` happens inside the norm
+        )
+
+        mlp_output = self.mlp(normed_attn_res_output)
+
+        return mlp_output, attn_res  # the caller's next norm call adds these two together
+
+
+class FlashGemmaModel(torch.nn.Module):  # stack of FlashGemmaLayer followed by a final norm; consumes embeddings, not token ids
+    def __init__(self, prefix: str, config, weights, causal: bool):
+        super().__init__()
+
+        process_group = weights.process_group
+        self.tp_rank = process_group.rank()
+        self.tp_world_size = process_group.size()
+        self.layers = nn.ModuleList(
+            [
+                FlashGemmaLayer(
+                    prefix=f"{prefix}.layers.{layer_id}",
+                    config=config,
+                    weights=weights,
+                    causal=causal,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = GemmaFastRMSNorm.load(
+            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
+        )
+
+        self.head_size = self.layers[0].self_attn.head_size  # the three values below are copied from the first layer's attention
+        self.num_heads = self.layers[0].self_attn.num_heads
+        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],  # indexed per layer below
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+    ) -> torch.Tensor:
+        hidden_states = inputs_embeds
+
+        # Get rotary cos and sin for this forward
+        # Computed once from the first layer's rotary module and shared by every layer
+        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
+            position_ids, max_s, hidden_states.dtype
+        )
+
+        residual = None  # the first layer starts without a residual
+        for i, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[i],  # each layer receives its own cache entry
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)  # the final norm also adds in the last layer's residual
+
+        return hidden_states
+
+
+class FlashGemmaForCausalLM(torch.nn.Module):  # token embedding + FlashGemmaModel + speculative LM head
+    def __init__(self, prefix: str, config, weights, *, causal: bool = True):
+        super().__init__()
+
+        embed_norm = config.hidden_size**0.5  # sqrt(hidden_size); multiplied into the embedding weights below
+        if not prefix:
+            prefix = "model"
+        else:
+            prefix = f"{prefix}.model"
+
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.embed_tokens", weights=weights
+        )
+        self.embed_tokens.weight *= embed_norm  # in-place scaling of the loaded embedding table
+
+        self.model = FlashGemmaModel(
+            prefix=prefix, config=config, weights=weights, causal=causal
+        )
+        self.lm_head = SpeculativeHead.load(
+            prefix=(
+                f"{prefix}.embed_tokens"  # tied embeddings: the head is loaded from the embedding tensor name
+                if config.tie_word_embeddings
+                else f"{prefix}.lm_head"  # NOTE(review): resolves to "<...>model.lm_head" — confirm untied checkpoints use that name
+            ),
+            config=config,
+            weights=weights,
+        )
+
+    def forward(  # embeds the ids, runs the inner model, then applies the LM head
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],  # accepted but never read in this method
+        lm_head_indices: Optional[torch.Tensor] = None,  # optional row selection applied before the head
+        adapter_data: Optional[torch.Tensor] = None,  # accepted but never read in this method
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_embeds = self.embed_tokens(input_ids)  # the inner model is fed embeddings, not token ids
+        hidden_states = self.model(
+            input_embeds,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]  # keep only the selected rows for the head
+        logits, speculative_logits = self.lm_head(hidden_states)  # the head returns a (logits, speculative logits) pair
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py
new file mode 100644
index 00000000..d5dc25cf
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py
@@ -0,0 +1,468 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+
+
+def load_qkv(config, prefix: str, weights, head_size, num_heads):  # dispatches the fused c_attn loader on config.quantize
+    if config.quantize == "gptq":
+        return _load_qkv_gptq(  # head_size / num_heads are not needed on this path
+            config,
+            prefix,
+            weights,
+        )
+    elif config.quantize == "marlin":
+        raise RuntimeError(  # marlin is rejected explicitly instead of falling through to the default loader
+            "GPT-2 models with marlin quantization are not yet supported"
+        )
+    else:
+        return _load_qkv(config, prefix, weights, head_size, num_heads)  # every other quantize value, including none
+
+
+def _load_qkv_gptq(config, prefix: str, weights):  # GPTQ path: packed c_attn weight from the loader plus a manually sharded bias
+    world_size = weights.process_group.size()
+    rank = weights.process_group.rank()
+
+    # Weights
+    weight = weights.get_weights_col_packed_qkv(
+        f"{prefix}.c_attn",
+        config.quantize,
+        config.num_attention_heads,  # NOTE(review): the same head count is passed twice —
+        config.num_attention_heads,  # presumably query and key/value head counts; confirm against the loader
+    )
+
+    # Bias
+    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
+    shape = slice_.get_shape()
+    total_size = shape[0]
+    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
+    single_size = total_size // 3  # length of one of the three packed parts
+    assert single_size % world_size == 0
+    block_size = single_size // world_size  # this rank's share of each part
+    start = rank * block_size
+    stop = (rank + 1) * block_size
+    tensors = []
+    for i in range(3):
+        tensor = slice_[start + i * single_size : stop + i * single_size]  # this rank's block inside the i-th part
+        tensors.append(tensor)
+    bias = torch.cat(tensors, dim=0)
+    bias = bias.to(device=weights.device)  # NOTE(review): no cast to weights.dtype here, unlike _load_qkv — confirm intentional
+
+    return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize))
+
+
+def _load_qkv(config, prefix: str, weights, head_size, num_heads):
+    """Load QKV from a single, transposed matrix."""
+
+    slice_ = weights._get_slice(f"{prefix}.c_attn.weight")
+    shape = slice_.get_shape()
+    total_size = shape[1]
+    assert total_size % 3 == 0, f"Prepacked is not divisible by {3}"
+    world_size = weights.process_group.size()
+    single_size = total_size // 3
+    assert single_size % world_size == 0
+    rank = weights.process_group.rank()
+
+    # Weights
+    block_size = single_size // world_size
+    start = rank * block_size
+    stop = (rank + 1) * block_size
+    tensors = []
+    for i in range(3):
+        tensor = slice_[:, start + i * single_size : stop + i * single_size]
+        tensors.append(tensor)
+    weight = torch.cat(tensors, dim=1).T
+    weight = weight.to(dtype=weights.dtype)
+    weight = weight.to(device=weights.device)
+
+    # Bias
+    slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
+    shape = slice_.get_shape()
+    total_size = shape[0]
+    single_size = total_size // 3
+    block_size = single_size // world_size
+    assert single_size % world_size == 0
+    start = rank * block_size
+    stop = (rank + 1) * block_size
+    b = []
+    for i in range(3):
+        tensor = slice_[start + i * single_size : stop + i * single_size]
+        b.append(tensor)
+    bias = torch.cat(b, dim=0)
+    bias = bias.to(dtype=weights.dtype)
+    bias = bias.to(device=weights.device)
+    assert list(bias.shape) == [
+        3 * num_heads * head_size
+    ], f"{weight.shape} != {[3 * num_heads * head_size]}"
+
+    return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize))
+
+
+def load_row(config, prefix: str, weights, bias: bool):  # `bias` is a flag here; the name is rebound to the tensor (or None) below
+    """load_row, but with transposed weight matrices."""
+
+    if config.quantize == "gptq":
+        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)  # no transpose on the GPTQ path
+    else:
+        weight = weights.get_sharded(f"{prefix}.weight", dim=0).T  # shard along dim 0 of the stored matrix, then transpose
+
+    if bias and weights.process_group.rank() == 0:
+        # Bias is only loaded on the first rank process; every other rank passes None
+        bias = weights.get_tensor(f"{prefix}.bias")  # full tensor via get_tensor, not get_sharded
+    else:
+        bias = None
+
+    return TensorParallelRowLinear(
+        get_linear(weight, bias, config.quantize), process_group=weights.process_group
+    )
+
+
+def load_col(config, prefix: str, weights, bias: bool):  # `bias` is a flag here; the name is rebound to the tensor (or None) below
+    """load_col, but with transposed weight matrices."""
+    if config.quantize == "gptq":
+        weight = weights.get_multi_weights_col(  # no transpose on the GPTQ path
+            [prefix], quantize=config.quantize, dim=1
+        )
+    else:
+        weight = weights.get_sharded(f"{prefix}.weight", dim=1).T  # shard along dim 1 of the stored matrix, then transpose
+
+    if bias:
+        bias = weights.get_sharded(f"{prefix}.bias", dim=0)  # unlike load_row, every rank loads its own bias shard
+    else:
+        bias = None
+
+    return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize))
+
+
+class FlashGPT2Attention(torch.nn.Module):  # attention block: fused c_attn projection, KV cache write, c_proj output projection
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.num_heads = config.num_attention_heads  # global count here; divided by the shard count below
+        self.hidden_size = config.hidden_size
+
+        self.head_size = self.hidden_size // self.num_heads  # computed from the global head count, before sharding
+        self.softmax_scale = self.head_size**-0.5  # 1 / sqrt(head_size)
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()}"
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()  # per-shard heads from here on
+
+        self.query_key_value = load_qkv(
+            config,
+            prefix=prefix,
+            weights=weights,
+            head_size=self.head_size,
+            num_heads=self.num_heads,  # the per-shard count is what the loader's shape check expects
+        )
+
+        self.o_proj = load_row(
+            config,
+            prefix=f"{prefix}.c_proj",
+            weights=weights,
+            bias=True,
+        )
+
+        self.kv_head_mapping = torch.arange(  # 0..num_heads-1: one KV head per query head
+            0, self.num_heads, dtype=torch.int32, device=weights.device
+        )
+
+    def forward(  # returns the output projection of the attention result
+        self,
+        hidden_states,
+        cu_seqlen_prefill,  # None selects the decode branch below
+        kv_cache,  # indexed as kv_cache[0] / kv_cache[1] for the key / value stores
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        query, key, value = self.query_key_value(hidden_states).split(  # three equal chunks along dim 1
+            self.head_size * self.num_heads, dim=1
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_heads, self.head_size)
+        value = value.view(-1, self.num_heads, self.head_size)
+
+        reshape_and_cache(key, value, kv_cache[0], kv_cache[1], slots)  # hands key/value and the cache pair to the cache helper
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attention(  # return value unused; attn_output is passed in as an argument
+                query,
+                key,
+                value,
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(  # reads keys/values from the cache rather than from `key`/`value`
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))  # flatten the heads before the output projection
+
+
+class GPT2MLP(nn.Module):  # feed-forward block: c_fc, activation, c_proj (both projections carry a bias)
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        act = config.activation_function
+        self.act = (
+            ACT2FN[act]  # non-gelu names are looked up in the transformers activation table
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(  # every name containing "gelu" uses torch's gelu directly
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"  # tanh approximation only for these two names
+                ),
+            )
+        )
+
+        self.c_fc = load_col(
+            config, prefix=f"{prefix}.c_fc", weights=weights, bias=True
+        )
+        self.c_proj = load_row(
+            config,
+            prefix=f"{prefix}.c_proj",
+            weights=weights,
+            bias=True,
+        )
+
+        intermediate_size = (
+            config.n_inner if config.n_inner is not None else 4 * config.hidden_size  # falls back to 4x hidden size
+        )
+
+        self.intermediate_size = intermediate_size // weights.process_group.size()  # per-shard width; not read by forward below
+
+    def forward(self, hidden_states):
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        return self.c_proj(hidden_states)
+
+
+class FlashGPT2Layer(nn.Module):  # one GPT-2 block: ln_1, attention, residual add, ln_2, MLP, residual add
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        self.self_attn = FlashGPT2Attention(
+            prefix=f"{prefix}.attn", config=config, weights=weights
+        )
+        self.mlp = GPT2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+
+        self.input_layernorm = nn.LayerNorm.load(  # NOTE(review): relies on a `load` classmethod being attached to nn.LayerNorm elsewhere in the project
+            prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
+        )
+        self.post_attention_layernorm = nn.LayerNorm.load(
+            prefix=f"{prefix}.ln_2",
+            weights=weights,
+            eps=config.layer_norm_epsilon,
+        )
+
+    def forward(  # returns (hidden states with both residual adds applied, post-attention residual)
+        self,
+        hidden_states,
+        residual,  # overwritten on the first line of the body; the incoming value is never read
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        residual = hidden_states  # the residual is taken before the input layer norm
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        attn_output = self.self_attn(
+            hidden_states,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+
+        hidden_states = attn_output + residual  # first residual add
+        residual = hidden_states
+
+        hidden_states = self.post_attention_layernorm(hidden_states)
+
+        mlp_output = self.mlp(hidden_states)
+
+        return residual + mlp_output, residual  # second residual add is already applied to the first element
+
+
+class FlashGPT2Model(torch.nn.Module):  # stack of FlashGPT2Layer followed by the final layer norm; consumes embeddings
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        process_group = weights.process_group
+        self.tp_rank = process_group.rank()
+        self.tp_world_size = process_group.size()
+        self.layers = nn.ModuleList(
+            [
+                FlashGPT2Layer(
+                    prefix=(
+                        f"h.{layer_id}" if not prefix else f"{prefix}.h.{layer_id}"  # an empty prefix yields bare "h.<i>" tensor names
+                    ),
+                    config=config,
+                    weights=weights,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.norm = nn.LayerNorm.load(
+            prefix="ln_f" if not prefix else f"{prefix}.ln_f",
+            weights=weights,
+            eps=config.layer_norm_epsilon,
+        )
+
+        self.gradient_checkpointing = False  # set here but not read anywhere in this class
+
+        self.head_size = self.layers[0].self_attn.head_size  # copied from the first layer's attention
+        self.num_heads = self.layers[0].self_attn.num_heads
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        position_ids: torch.Tensor,  # not read in this method
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],  # indexed per layer below
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        true_max_s: int,  # not read in this method
+        prefill_cache_indices: Optional[torch.Tensor],  # not read in this method
+    ) -> torch.Tensor:
+        hidden_states = inputs_embeds
+
+        residual = None  # each layer overwrites the residual it is given, so the initial value is unused
+        for i, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cu_seqlen_prefill,
+                kv_cache[i],  # each layer receives its own cache entry
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+            )
+
+        hidden_states = self.norm(hidden_states)  # layers already return the summed hidden states, so no residual is passed
+
+        return hidden_states
+
+
+class FlashGPT2ForCausalLM(torch.nn.Module):  # token + position embeddings, FlashGPT2Model, speculative LM head
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=("wte" if not prefix else f"{prefix}.wte"),
+            weights=weights,
+        )
+        self.embed_positions = TensorParallelEmbedding(
+            prefix=("wpe" if not prefix else f"{prefix}.wpe"),
+            weights=weights,
+        )
+
+        self.model = FlashGPT2Model(prefix, config, weights)
+        self.lm_head = SpeculativeHead.load(
+            config,
+            prefix="wte" if not prefix else f"{prefix}.wte",  # same tensor prefix as embed_tokens: the head reuses the token embedding name
+            weights=weights,
+        )
+
+    def forward(  # sums token and position embeddings, runs the model, then applies the LM head
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor] = None,  # only forwarded to self.model
+        lm_head_indices: Optional[torch.Tensor] = None,  # optional row selection applied before the head
+        adapter_data: Optional[torch.Tensor] = None,  # accepted but never read in this method
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        token_embeds = self.embed_tokens(input_ids)
+        position_embeds = self.embed_positions(position_ids)  # position table lookup (the "wpe" tensor)
+        inputs_embeds = token_embeds + position_embeds
+        hidden_states = self.model(
+            inputs_embeds,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            true_max_s=max_s,  # the same value is passed for max_s and true_max_s
+            prefill_cache_indices=prefill_cache_indices,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]  # keep only the selected rows for the head
+        logits, speculative_logits = self.lm_head(hidden_states)  # the head returns a (logits, speculative logits) pair
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index f0e1236d..78832341 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -18,166 +18,97 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import List, Optional, Tuple
+
 import torch
 import torch.distributed
 
 from torch import nn
 from transformers.activations import ACT2FN
-from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
-# Flash attention imports
-import dropout_layer_norm
-
-# vllm imports
-import vllm_cache_ops
-import vllm_attention_ops
-
-from text_generation_server.utils.flash_attn import attention
-from text_generation_server.utils.layers import (
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.models.globals import FLASH_DECODING
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    PositionRotaryEmbedding,
-    TensorParallelHead,
-    get_linear,
+    SpeculativeHead,
+    TensorParallelMultiAdapterLinear,
+    TensorParallelAdapterRowLinear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
+    FastRMSNorm,
 )
 
+if SYSTEM == "rocm":
+    try:
+        from vllm import _custom_C
+    except Exception as e:
+        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
 
-class LlamaConfig(PretrainedConfig):
-    def __init__(
-        self,
-        vocab_size=32000,
-        hidden_size=4096,
-        intermediate_size=11008,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=None,
-        hidden_act="silu",
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=0,
-        bos_token_id=1,
-        eos_token_id=2,
-        pretraining_tp=1,
-        tie_word_embeddings=False,
-        rope_scaling=None,
-        rope_theta=10000.0,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
 
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
+def load_attention(config, prefix: str, weights, layer_id):
+    # Only defined in granite.
+    bias = getattr(config, "attention_bias", False)
+    head_size = config.hidden_size // config.num_attention_heads
+    sizes = None
+    prefixes = None
 
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.pretraining_tp = pretraining_tp
-        self.use_cache = use_cache
-        self.rope_scaling = rope_scaling
-        self.rope_theta = rope_theta
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
+    if config.model_type == "phi3":
+        prefix = f"{prefix}.qkv_proj"
+        base_layer = TensorParallelColumnLinear.load_qkv(
+            config,
+            prefix=prefix,
+            weights=weights,
+            bias=bias,
+            num_heads=config.num_attention_heads,
+            num_key_value_heads=config.num_key_value_heads,
+        )
+    elif config.model_type == "baichuan":
+        prefix = f"{prefix}.W_pack"
+        base_layer = TensorParallelColumnLinear.load_qkv(
+            config,
+            prefix=prefix,
+            weights=weights,
+            bias=bias,
+            num_heads=config.num_attention_heads,
+            num_key_value_heads=config.num_key_value_heads,
+        )
+    else:
+        prefixes = ["q_proj", "k_proj", "v_proj"]
+        sizes = [
+            head_size * config.num_attention_heads,
+            head_size * config.num_key_value_heads,
+            head_size * config.num_key_value_heads,
+        ]
+        base_layer = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            dim=0,
+            weights=weights,
+            bias=bias,
         )
 
-
-class LlamaRMSNorm(nn.Module):
-    def __init__(self, prefix, weights, eps=1e-6):
-        """
-        LlamaRMSNorm is equivalent to T5LayerNorm
-        """
-        super().__init__()
-
-        weight = weights.get_tensor(f"{prefix}.weight")
-        self.weight = nn.Parameter(weight)
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states, residual=None):
-        if hidden_states.shape[-1] > 8192:
-            if residual is not None:
-                hidden_states += residual
-            residual = hidden_states
-
-            hidden_states = hidden_states.to(torch.float32)
-            variance = hidden_states.pow(2).mean(-1, keepdim=True)
-            hidden_states = hidden_states * torch.rsqrt(
-                variance + self.variance_epsilon
-            )
-
-            # convert into half-precision if necessary
-            if self.weight.dtype in [torch.float16, torch.bfloat16]:
-                hidden_states = hidden_states.to(self.weight.dtype)
-
-            return self.weight * hidden_states, residual
-        else:
-            # faster post attention rms norm
-            normed_hidden_states, res, *rest = dropout_layer_norm.dropout_add_ln_fwd(
-                hidden_states,
-                residual,
-                self.weight,
-                None,
-                None,
-                None,
-                None,
-                None,
-                0.0,
-                self.variance_epsilon,
-                1.0,
-                0,
-                None,
-                False,
-                True,  # Activate RMSNorm
-            )
-            if res is None:
-                res = hidden_states
-
-            return normed_hidden_states, res
-
-
-def _load_gqa(config, prefix: str, weights):
-    assert config.hidden_size % config.num_attention_heads == 0
-    assert config.num_attention_heads % weights.process_group.size() == 0
-
-    weight = weights.get_multi_weights_col(
-        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-        quantize=config.quantize,
-        dim=0,
-    )
-
-    if config.quantize != "gptq":
-        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
-
-        head_size = config.hidden_size // config.num_attention_heads
-        num_heads = config.num_attention_heads // weights.process_group.size()
-        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
-        assert list(weight.shape) == [
-            (num_heads + 2 * num_key_value_heads) * head_size,
-            config.hidden_size,
-        ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"
-
-    return TensorParallelColumnLinear(
-        get_linear(weight, bias=None, quantize=config.quantize)
+    return TensorParallelMultiAdapterLinear.load(
+        base_layer=base_layer,
+        layer_id=layer_id,
+        layer_names=prefixes,
+        sizes=sizes,
+        process_group=weights.process_group,
     )
 
 
 class FlashLlamaAttention(torch.nn.Module):
     def __init__(
         self,
+        index: int,
         prefix: str,
         config,
         weights,
@@ -187,11 +118,16 @@ class FlashLlamaAttention(torch.nn.Module):
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.num_heads
 
-        # self.rotary_emb = PositionRotaryEmbedding.load(
-        #     config=config, prefix=f"{prefix}.rotary_emb", weights=weights
-        # )
+        # Setting defaults for baichuan custom config which doesn't apply them.
+        config.rope_theta = getattr(config, "rope_theta", 10000)
+        config.num_key_value_heads = getattr(
+            config, "num_key_value_heads", config.num_attention_heads
+        )
         self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config, dim=self.head_size, base=config.rope_theta, device=weights.device
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
         )
 
         self.softmax_scale = self.head_size**-0.5
@@ -201,26 +137,33 @@ class FlashLlamaAttention(torch.nn.Module):
                 f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
                 f"and `num_shards`: {weights.process_group.size()}"
             )
+        if config.num_key_value_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_key_value_heads` must be divisible by `num_shards` (got `num_key_value_heads`: {config.num_key_value_heads} "
+                f"and `num_shards`: {weights.process_group.size()}"
+            )
         self.num_heads = self.num_heads // weights.process_group.size()
         self.num_key_value_heads = (
             config.num_key_value_heads // weights.process_group.size()
         )
-        if config.num_attention_heads != config.num_key_value_heads:
-            self.query_key_value = _load_gqa(config, prefix, weights)
-        else:
-            self.query_key_value = TensorParallelColumnLinear.load_multi(
-                config,
-                prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-                dim=0,
-                weights=weights,
-                bias=False,
-            )
-        self.o_proj = TensorParallelRowLinear.load(
+
+        self.query_key_value = load_attention(config, prefix, weights, index)
+        self.index = index
+
+        o_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.o_proj",
             weights=weights,
             bias=False,
         )
+
+        self.o_proj = TensorParallelAdapterRowLinear.load(
+            o_proj,
+            index,
+            "o_proj",
+            process_group=weights.process_group,
+        )
+
         self.num_groups = self.num_heads // self.num_key_value_heads
         self.kv_head_mapping = torch.arange(
             0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
@@ -237,8 +180,9 @@ class FlashLlamaAttention(torch.nn.Module):
         slots,
         input_lengths,
         max_s,
+        adapter_data,
     ):
-        qkv = self.query_key_value(hidden_states)
+        qkv = self.query_key_value(hidden_states, adapter_data)
         query, kv = qkv.split(
             [
                 self.head_size * self.num_heads,
@@ -249,12 +193,9 @@ class FlashLlamaAttention(torch.nn.Module):
         query = query.view(-1, self.num_heads, self.head_size)
         kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
 
-        self.rotary_emb(query, cos, sin)
-        self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin)
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
 
-        vllm_cache_ops.reshape_and_cache(
-            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
-        )
+        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
 
         # output tensor
         attn_output = torch.empty_like(query)
@@ -273,9 +214,7 @@ class FlashLlamaAttention(torch.nn.Module):
             )
         # Decode
         else:
-            # kv_cache[1] => [num_blocks, num_heads, head_size, block_size]
-            block_size = kv_cache[1].shape[3]
-            vllm_attention_ops.single_query_cached_kv_attention(
+            attn_output = paged_attention(
                 attn_output,
                 query,
                 kv_cache[0],
@@ -284,64 +223,127 @@ class FlashLlamaAttention(torch.nn.Module):
                 self.softmax_scale,
                 block_tables,
                 input_lengths,
-                block_size,
                 max_s,
             )
 
-        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
+        return self.o_proj(
+            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
+        )
 
 
 class LlamaMLP(nn.Module):
-    def __init__(self, prefix, config, weights):
+    def __init__(self, prefix, config, weights, index):
         super().__init__()
-        act = config.hidden_act
+        self.hidden_act = config.hidden_act
         self.act = (
-            ACT2FN[act]
-            if "gelu" not in act
+            ACT2FN[self.hidden_act]
+            if "gelu" not in self.hidden_act
             else lambda x: torch.nn.functional.gelu(
                 x,
-                approximate="tanh"
-                if act in ["gelu_fast", "gelu_pytorch_tanh"]
-                else "none",
+                approximate=(
+                    "tanh"
+                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
+                    else "none"
+                ),
             )
         )
+        prefixes = None
+        sizes = None
+
         # Fuse gate and up proj
-        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
-            config,
-            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
-            weights=weights,
-            dim=0,
-            bias=False,
+        bias = getattr(config, "mlp_bias", False)
+        if config.model_type == "phi3":
+            gate_up_proj = TensorParallelColumnLinear.load_gate_up(
+                config,
+                prefix=f"{prefix}.gate_up_proj",
+                weights=weights,
+                bias=bias,
+            )
+        else:
+            prefixes = [f"gate_proj", f"up_proj"]
+            sizes = [
+                config.intermediate_size,
+                config.intermediate_size,
+            ]
+            gate_up_proj = TensorParallelColumnLinear.load_multi(
+                config,
+                prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+                weights=weights,
+                dim=0,
+                bias=bias,
+            )
+
+        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
+            gate_up_proj,
+            index,
+            layer_names=prefixes,
+            sizes=sizes,
+            process_group=weights.process_group,
         )
-        self.down_proj = TensorParallelRowLinear.load(
+
+        down_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.down_proj",
             weights=weights,
-            bias=False,
+            bias=bias,
         )
+
+        self.down_proj = TensorParallelAdapterRowLinear.load(
+            down_proj,
+            index,
+            "down_proj",
+            process_group=weights.process_group,
+        )
+
         self.intermediate_size = (
             config.intermediate_size // weights.process_group.size()
         )
 
-    def forward(self, hidden_states):
-        gate_up_states = self.gate_up_proj(hidden_states)
-        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
-        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+        # TODO: This is a hotfix to be removed & properly refactored.
+        self.quantize = config.quantize
+
+    def forward(self, hidden_states, adapter_data):
+        """Run the gated MLP; a fused kernel handles one-row SiLU inputs on ROCm."""
+        # NOTE(review): the fused path reads only the base-layer weight, so adapter_data
+        # is not applied to the gate/up projection there — confirm this is intended.
+        if (
+            SYSTEM == "rocm"
+            and self.hidden_act == "silu"
+            and hidden_states.shape[0] == 1
+            and not self.quantize
+        ):
+            out = torch.empty(
+                (hidden_states.shape[0], self.intermediate_size),
+                dtype=hidden_states.dtype,
+                device=hidden_states.device,  # follow the input, not the default GPU
+            )
+            weight = self.gate_up_proj.base_layer.linear.weight
+            _custom_C.LLMM_Silu(weight, hidden_states, out, 8)
+            return self.down_proj(out, adapter_data)
+
+        # Generic path: fused gate/up projection, then act(gate) * up.
+        gate_up = self.gate_up_proj(hidden_states, adapter_data)
+        gate_up = gate_up.view(-1, 2, self.intermediate_size)
+        return self.down_proj(self.act(gate_up[:, 0]) * gate_up[:, 1], adapter_data)
 
 
 class FlashLlamaLayer(nn.Module):
-    def __init__(self, layer_id, config, weights):
+    def __init__(self, index, prefix, config, weights):
         super().__init__()
-        prefix = f"model.layers.{layer_id}"
         self.self_attn = FlashLlamaAttention(
-            prefix=f"{prefix}.self_attn", config=config, weights=weights
+            index=index,
+            prefix=f"{prefix}.self_attn",
+            config=config,
+            weights=weights,
+        )
+        self.mlp = LlamaMLP(
+            prefix=f"{prefix}.mlp", config=config, weights=weights, index=index
         )
-        self.mlp = LlamaMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
 
-        self.input_layernorm = LlamaRMSNorm(
+        self.input_layernorm = FastRMSNorm.load(
             prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
         )
-        self.post_attention_layernorm = LlamaRMSNorm(
+        self.post_attention_layernorm = FastRMSNorm.load(
             prefix=f"{prefix}.post_attention_layernorm",
             weights=weights,
             eps=config.rms_norm_eps,
@@ -359,6 +361,7 @@ class FlashLlamaLayer(nn.Module):
         slots,
         input_lengths,
         max_s,
+        adapter_data,
     ):
         normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
 
@@ -373,6 +376,7 @@ class FlashLlamaLayer(nn.Module):
             slots,
             input_lengths,
             max_s,
+            adapter_data,
         )
 
         # faster post attention rms norm
@@ -380,33 +384,37 @@ class FlashLlamaLayer(nn.Module):
             attn_output, res
         )
 
-        mlp_output = self.mlp(normed_attn_res_output)
+        mlp_output = self.mlp(normed_attn_res_output, adapter_data)
 
         return mlp_output, attn_res
 
 
 class FlashLlamaModel(torch.nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights):
         super().__init__()
 
         process_group = weights.process_group
         self.tp_rank = process_group.rank()
         self.tp_world_size = process_group.size()
-        self.embed_tokens = TensorParallelEmbedding(
-            prefix="model.embed_tokens", weights=weights
-        )
         self.layers = nn.ModuleList(
             [
                 FlashLlamaLayer(
-                    layer_id,
-                    config,
-                    weights,
+                    index=layer_id,
+                    prefix=(
+                        f"model.layers.{layer_id}"
+                        if not prefix
+                        else f"{prefix}.model.layers.{layer_id}"
+                    ),
+                    config=config,
+                    weights=weights,
                 )
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
-        self.norm = LlamaRMSNorm(
-            prefix="model.norm", weights=weights, eps=config.rms_norm_eps
+        self.norm = FastRMSNorm.load(
+            prefix="model.norm" if not prefix else f"{prefix}.model.norm",
+            weights=weights,
+            eps=config.rms_norm_eps,
         )
 
         self.gradient_checkpointing = False
@@ -417,7 +425,7 @@ class FlashLlamaModel(torch.nn.Module):
 
     def forward(
         self,
-        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
         position_ids: torch.Tensor,
         cu_seqlen_prefill: Optional[torch.Tensor],
         kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
@@ -425,8 +433,11 @@ class FlashLlamaModel(torch.nn.Module):
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        true_max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        adapter_data,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        hidden_states = inputs_embeds
 
         # Get rotary cos and sin for this forward
         # Avoid to index in each layer
@@ -447,6 +458,7 @@ class FlashLlamaModel(torch.nn.Module):
                 slots,
                 input_lengths,
                 max_s,
+                adapter_data,
             )
 
         hidden_states, _ = self.norm(hidden_states, residual)
@@ -455,13 +467,24 @@ class FlashLlamaModel(torch.nn.Module):
 
 
 class FlashLlamaForCausalLM(torch.nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__()
 
-        self.model = FlashLlamaModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=(
+                "model.embed_tokens" if not prefix else f"{prefix}.model.embed_tokens"
+            ),
+            weights=weights,
+        )
+        self.model = FlashLlamaModel(prefix, config, weights)
+        if config.tie_word_embeddings:
+            suffix = "model.embed_tokens"
+        else:
+            suffix = "lm_head"
+
+        self.lm_head = SpeculativeHead.load(
             config,
-            prefix="lm_head",
+            prefix=suffix if not prefix else f"{prefix}.{suffix}",
             weights=weights,
         )
 
@@ -475,10 +498,13 @@ class FlashLlamaForCausalLM(torch.nn.Module):
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor] = None,
         lm_head_indices: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+        adapter_data: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        inputs_embeds = self.embed_tokens(input_ids)
         hidden_states = self.model(
-            input_ids,
+            inputs_embeds,
             position_ids,
             cu_seqlen_prefill,
             kv_cache,
@@ -486,8 +512,11 @@ class FlashLlamaForCausalLM(torch.nn.Module):
             slots,
             input_lengths,
             max_s,
+            true_max_s=max_s,
+            prefill_cache_indices=prefill_cache_indices,
+            adapter_data=adapter_data,
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
-        logits = self.lm_head(hidden_states)
-        return logits
+        logits, speculative_logits = self.lm_head(hidden_states)
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
new file mode 100644
index 00000000..8028dbe8
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -0,0 +1,537 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.layers.attention import (
+    Seqlen,
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+    TensorParallelMultiAdapterLinear,
+    TensorParallelAdapterRowLinear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
+    FastRMSNorm,
+)
+
+
+if SYSTEM == "rocm":
+    try:
+        from vllm import _custom_C
+    except Exception as e:
+        raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")
+
+
+class MistralConfig(PretrainedConfig):
+    model_type = "mistral"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        sliding_window=None,
+        **kwargs,
+    ):
+        """Store Mistral hyperparameters; remaining options go to the base class."""
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        # Backward compatibility: an unset KV-head count means one per attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class MistralAttention(torch.nn.Module):
+    def __init__(self, prefix: str, config, weights, layer_id):
+        """Build rotary, the fused QKV projection, and the output projection."""
+        super().__init__()
+        self.max_past = (
+            config.sliding_window if config.sliding_window is not None else -1
+        )
+        self.num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.num_heads
+
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+        self.softmax_scale = self.head_size**-0.5
+
+        num_shards = weights.process_group.size()
+        if self.num_heads % num_shards != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {num_shards}"
+            )
+        # Fail early: the floor division below would otherwise mis-size the KV split.
+        if config.num_key_value_heads % num_shards != 0:
+            raise ValueError(
+                f"`num_key_value_heads` must be divisible by `num_shards` (got `num_key_value_heads`: {config.num_key_value_heads} "
+                f"and `num_shards`: {num_shards}"
+            )
+        self.num_heads = self.num_heads // num_shards
+        self.num_key_value_heads = config.num_key_value_heads // num_shards
+
+        query_key_value = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            dim=0,
+            weights=weights,
+            bias=False,
+        )
+
+        self.query_key_value = TensorParallelMultiAdapterLinear.load(
+            query_key_value,
+            layer_id,
+            ["q_proj", "k_proj", "v_proj"],
+            sizes=[
+                self.head_size * config.num_attention_heads,
+                self.head_size * config.num_key_value_heads,
+                self.head_size * config.num_key_value_heads,
+            ],
+            process_group=weights.process_group,
+        )
+
+        o_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.o_proj",
+            weights=weights,
+            bias=False,
+        )
+        self.o_proj = TensorParallelAdapterRowLinear.load(
+            o_proj,
+            layer_id,
+            "o_proj",
+            process_group=weights.process_group,
+        )
+        self.num_groups = self.num_heads // self.num_key_value_heads
+        self.kv_head_mapping = torch.arange(
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+        prefill_cache_indices,
+        adapter_data,
+    ):
+        """Apply rotary to Q/K, cache K/V, attend, and run the output projection."""
+        qkv = self.query_key_value(hidden_states, adapter_data)
+        query, kv = qkv.split(
+            [
+                self.head_size * self.num_heads,
+                2 * self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
+
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
+
+        kv_to_cache = kv if prefill_cache_indices is None else kv[prefill_cache_indices]
+        reshape_and_cache(
+            kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
+        )
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill: flash attention over the keys/values computed in this call
+        if cu_seqlen_prefill is not None:
+            attention(
+                query,
+                torch.select(kv, dim=1, index=0),
+                torch.select(kv, dim=1, index=1),
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+                window_size_left=self.max_past,
+            )
+        # Decode: paged attention over the KV cache
+        else:
+            attn_output = paged_attention(
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(
+            attn_output.view(-1, self.num_heads * self.head_size), adapter_data
+        )
+
+
+class MistralMLP(nn.Module):
+    """Gated MLP with a fused gate/up projection and adapter-aware linear layers."""
+
+    def __init__(self, prefix: str, config, weights, layer_id):
+        super().__init__()
+        self.hidden_act = config.hidden_act
+        # Any "gelu*" activation goes through torch's gelu so tanh can be selected.
+        self.act = (
+            ACT2FN[self.hidden_act]
+            if "gelu" not in self.hidden_act
+            else lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh"
+                    if self.hidden_act in ["gelu_fast", "gelu_pytorch_tanh"]
+                    else "none"
+                ),
+            )
+        )
+        # Fuse gate and up proj into one linear, then wrap it for per-projection adapters
+        gate_up_proj = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+            weights=weights,
+            dim=0,
+            bias=False,
+        )
+        self.gate_up_proj = TensorParallelMultiAdapterLinear.load(
+            gate_up_proj,
+            layer_id,
+            ["gate_proj", "up_proj"],
+            sizes=[config.intermediate_size, config.intermediate_size],
+            process_group=weights.process_group,
+        )
+
+        down_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.down_proj",
+            weights=weights,
+            bias=False,
+        )
+
+        self.down_proj = TensorParallelAdapterRowLinear.load(
+            down_proj,
+            layer_id,
+            "down_proj",
+            process_group=weights.process_group,
+        )
+        self.intermediate_size = (
+            config.intermediate_size // weights.process_group.size()
+        )
+
+        # TODO: This is a hotfix to be removed & properly refactored.
+        self.quantize = config.quantize
+
+    def forward(self, hidden_states, adapter_data):
+        if (
+            SYSTEM == "rocm"
+            and self.hidden_act == "silu"
+            and hidden_states.shape[0] == 1
+            and not self.quantize
+        ):
+            # ROCm single-row path: custom kernel on the base gate/up weight (no adapter).
+            out = torch.empty(
+                hidden_states.shape[0],
+                self.intermediate_size,
+                dtype=hidden_states.dtype,
+                device=hidden_states.device,
+            )
+            _custom_C.LLMM_Silu(
+                self.gate_up_proj.base_layer.linear.weight, hidden_states, out, 8
+            )
+            return self.down_proj(out, adapter_data)
+
+        gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
+        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
+        gate, up = gate_up_states[:, 0], gate_up_states[:, 1]
+        return self.down_proj(self.act(gate) * up, adapter_data)
+
+
+class MistralLayer(nn.Module):  # one decoder layer: self-attention, then the MLP
+    def __init__(self, prefix: str, config, weights, layer_id):
+        super().__init__()
+        self.self_attn = MistralAttention(
+            prefix=f"{prefix}.self_attn",
+            config=config,
+            weights=weights,
+            layer_id=layer_id,
+        )
+        self.mlp = MistralMLP(
+            prefix=f"{prefix}.mlp", config=config, weights=weights, layer_id=layer_id
+        )
+        # One RMS norm in front of each sub-block; both use config.rms_norm_eps.
+        self.input_layernorm = FastRMSNorm.load(
+            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = FastRMSNorm.load(
+            prefix=f"{prefix}.post_attention_layernorm",
+            weights=weights,
+            eps=config.rms_norm_eps,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+        prefill_cache_indices,
+        adapter_data,
+    ):
+        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
+        # `res` (second output of the norm) is fed to the post-attention norm below.
+        # Self Attention
+        attn_output = self.self_attn(
+            normed_hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            prefill_cache_indices,
+            adapter_data,
+        )
+
+        # faster post attention rms norm
+        normed_attn_res_output, attn_res = self.post_attention_layernorm(
+            attn_output, res
+        )
+
+        mlp_output = self.mlp(normed_attn_res_output, adapter_data)
+        # The caller receives the MLP output together with the residual `attn_res`.
+        return mlp_output, attn_res
+
+
+class MistralModel(torch.nn.Module):  # decoder layers followed by a final RMS norm
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        # Tensor-parallel rank and world size are read from the weights' process group.
+        process_group = weights.process_group
+        self.tp_rank = process_group.rank()
+        self.tp_world_size = process_group.size()
+        self.layers = nn.ModuleList(
+            [
+                MistralLayer(
+                    prefix=f"{prefix}.layers.{layer_id}",
+                    config=config,
+                    weights=weights,
+                    layer_id=layer_id,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = FastRMSNorm.load(
+            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
+        )
+
+        self.gradient_checkpointing = False  # NOTE(review): not read anywhere in this class
+        # Re-export the attention geometry of the first layer on the model itself.
+        self.head_size = self.layers[0].self_attn.head_size
+        self.num_heads = self.layers[0].self_attn.num_heads
+        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        true_max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        adapter_data: Optional[torch.Tensor] = None,
+    ):
+        hidden_states = inputs_embeds
+        # Compute the rotary cos and sin once for this forward pass,
+        # so that the layers do not each have to index them again
+        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
+            position_ids, true_max_s, hidden_states.dtype
+        )
+        # The first layer gets residual=None; later layers get their predecessor's residual.
+        residual = None
+        for i, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[i],
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+                prefill_cache_indices,
+                adapter_data,
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class FlashMistralForCausalLM(torch.nn.Module):  # embeddings + MistralModel + LM head
+    def __init__(self, prefix: str, config, weights, name=None):
+        if name is None:
+            name = "model"  # default segment used in the weight prefixes below
+        super().__init__()
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=(
+                f"{name}.embed_tokens"
+                if not prefix
+                else f"{prefix}.{name}.embed_tokens"
+            ),
+            weights=weights,
+        )
+        self.model = MistralModel(
+            prefix=name if not prefix else f"{prefix}.{name}",
+            config=config,
+            weights=weights,
+        )
+        self.lm_head = SpeculativeHead.load(
+            config,
+            # TODO dirty hack for idefics2: bare "lm_head" unless prefix is set and name == "model".
+            prefix=(
+                "lm_head" if not prefix or name != "model" else f"{prefix}.lm_head"
+            ),
+            weights=weights,
+        )
+        self.max_past = config.sliding_window  # None skips the decode-time clamp in forward
+        self.max_past_tensor = (
+            torch.tensor(config.sliding_window, device=weights.device)
+            if self.max_past is not None
+            else None
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        true_max_s = max_s  # max_s is not changed below, so both values stay equal here
+        if prefill_cache_indices is not None:
+            # Slots also need to be sliced as it has the same size as the whole kv tensor
+            slots = slots[prefill_cache_indices]
+        elif self.max_past is not None:
+            # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
+            # kernel requires the true values
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
+        # Embed the token ids here; the inner model takes embeddings, not ids.
+        inputs_embeds = self.embed_tokens(input_ids)
+        hidden_states = self.model(
+            inputs_embeds,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            true_max_s,
+            prefill_cache_indices,
+            adapter_data,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]  # only these rows reach lm_head
+        logits = self.lm_head(hidden_states)
+        return logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
new file mode 100644
index 00000000..429793ea
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -0,0 +1,667 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+import numpy as np
+
+from torch import nn
+from text_generation_server.utils.import_utils import SYSTEM
+
+if SYSTEM != "ipex":
+    from vllm.model_executor.layers.fused_moe import fused_moe
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+from loguru import logger
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    FastLinear,
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.layernorm import (
+    FastRMSNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+
+
+class MixtralConfig(PretrainedConfig):  # hyper-parameters of a "mixtral" checkpoint
+    model_type = "mixtral"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,  # 131072
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        sliding_window=None,
+        num_experts_per_tok=2,
+        num_local_experts=8,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility: a missing KV head count falls back to the attention head count
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        # The token ids, the weight-tying flag and any extra kwargs go to PretrainedConfig.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+def promote_scalar(x: torch.Tensor) -> torch.Tensor:  # 0-d tensor -> shape (1,)
+    return x if len(x.shape) > 0 else x.view(1)
+
+
+def load_attention(config, prefix: str, weights):
+    """Load the fused q/k/v projection as one column-parallel linear (no bias)."""
+    # Differing query and key/value head counts are handled by the dedicated loader.
+    if config.num_attention_heads != config.num_key_value_heads:
+        return _load_gqa(config, prefix, weights)
+
+    # Equal head counts: fuse q, k and v along dim 0.
+    qkv_prefixes = [f"{prefix}.{proj}" for proj in ("q_proj", "k_proj", "v_proj")]
+    return TensorParallelColumnLinear.load_multi(
+        config, prefixes=qkv_prefixes, dim=0, weights=weights, bias=False
+    )
+
+
+def _load_gqa(config, prefix: str, weights):
+    # Fused q/k/v loader used when query and key/value head counts differ.
+    assert config.hidden_size % config.num_attention_heads == 0
+    assert config.num_attention_heads % weights.process_group.size() == 0
+
+    weight = weights.get_multi_weights_col(
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+        quantize=config.quantize,
+        dim=0,
+    )
+
+    if config.quantize not in ["gptq", "awq", "marlin"]:
+        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
+
+        head_size = config.hidden_size // config.num_attention_heads
+        num_heads = config.num_attention_heads // weights.process_group.size()
+        num_kv_heads = config.num_key_value_heads // weights.process_group.size()
+        # Per-shard shape check; the message reports the same sharded expectation.
+        expected = [(num_heads + 2 * num_kv_heads) * head_size, config.hidden_size]
+        assert list(weight.shape) == expected, f"{list(weight.shape)} != {expected}"
+
+    return TensorParallelColumnLinear(
+        get_linear(weight, bias=None, quantize=config.quantize)
+    )
+
+
+def _load_experts(config, prefix: str, mat, weights):
+    # Stack this rank's shard of every expert's `mat` matrix into one 2-D tensor.
+    if config.quantize is not None:
+        raise NotImplementedError("Mixtral does not support weight quantization yet.")
+    assert mat in ["w1", "w2", "w3"]
+
+    group = weights.process_group
+    world_size, rank = group.size(), group.rank()
+    assert (
+        config.intermediate_size % world_size == 0
+    ), f"The chosen size {config.intermediate_size} is not compatible with sharding on {world_size} shards"
+
+    # Each rank owns one contiguous block of the intermediate dimension.
+    block_size = config.intermediate_size // world_size
+    start, stop = rank * block_size, (rank + 1) * block_size
+
+    stacked = torch.empty(
+        (config.num_local_experts * block_size, config.hidden_size),
+        dtype=weights.dtype,
+        device=weights.device,
+    )
+    for expert_id in range(config.num_local_experts):
+        full = weights._get_slice(f"{prefix}.{expert_id}.{mat}.weight")
+        # w2 is cut along its columns and transposed; w1 and w3 are cut along
+        # their rows, so every shard is written as block_size rows of `stacked`.
+        if mat == "w2":
+            shard = full[:, start:stop].t().contiguous()
+        else:
+            shard = full[start:stop]
+        shard = shard.to(dtype=weights.dtype).to(device=weights.device)
+        row = expert_id * block_size
+        stacked[row : row + block_size] = shard
+
+    return stacked
+
+
+class MixtralAttention(torch.nn.Module):
+    """Tensor-parallel attention with rotary embeddings and a paged KV cache."""
+
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        # -1 is what gets passed as `window_size_left` when no sliding window is set.
+        self.max_past = (
+            config.sliding_window if config.sliding_window is not None else -1
+        )
+        self.num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.num_heads
+
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+
+        self.softmax_scale = self.head_size**-0.5
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()})"
+            )
+        # From here on both head counts are per shard.
+        self.num_heads = self.num_heads // weights.process_group.size()
+        self.num_key_value_heads = (
+            config.num_key_value_heads // weights.process_group.size()
+        )
+
+        self.query_key_value = load_attention(config, prefix, weights)
+
+        self.o_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.o_proj",
+            weights=weights,
+            bias=False,
+        )
+        # kv_head_mapping repeats each KV head index num_groups times in a row.
+        self.num_groups = self.num_heads // self.num_key_value_heads
+        self.kv_head_mapping = torch.arange(
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+        prefill_cache_indices,
+    ):
+        qkv = self.query_key_value(hidden_states)
+        query, kv = qkv.split(
+            [
+                self.head_size * self.num_heads,
+                2 * self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)
+        # kv[:, 0] holds the keys and kv[:, 1] the values; only query and keys get rotary.
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
+
+        if prefill_cache_indices is not None:
+            kv_to_cache = kv[prefill_cache_indices]
+        else:
+            kv_to_cache = kv
+
+        reshape_and_cache(
+            kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
+        )
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill: the call's return value is not used, attn_output is passed in as an argument
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attention(
+                query,
+                torch.select(kv, dim=1, index=0),
+                torch.select(kv, dim=1, index=1),
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+                window_size_left=self.max_past,
+            )
+        # Decode: keys and values come from kv_cache, and the call's result is kept
+        else:
+            attn_output = paged_attention(
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
+
+
+@torch.jit.script
+def select_experts(gate_logits: torch.Tensor, top_k: int):
+    # Softmax over dim 1, upcast to float: (sequence_length, n_experts)
+    probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)
+    # Largest top_k probabilities and their expert ids: (sequence_length, top-k)
+    topk_probs, topk_ids = torch.topk(probs, top_k, dim=-1)
+    topk_probs /= topk_probs.sum(dim=-1, keepdim=True)
+    flat_weights = topk_probs.view(-1)
+    flat_ids = topk_ids.view(-1)
+
+    return flat_ids, flat_weights
+
+
+@torch.jit.script
+def round_up(x: torch.Tensor, value: int):  # add value - 1, truncating-divide, rescale
+    return value * torch.div(x + (value - 1), value, rounding_mode="trunc")
+
+
+class BlockSparseMoE(nn.Module):  # MoE feed-forward evaluated through vllm's fused_moe
+    def __init__(self, prefix, config: MixtralConfig, weights):
+        super().__init__()
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size // weights.process_group.size()
+        self.num_experts = config.num_local_experts
+        self.top_k = config.num_experts_per_tok
+        # NOTE(review): self.act is built here but forward() below never reads it.
+        act = config.hidden_act
+        if "gelu" in act:
+            self.act = lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        elif "silu" in act:
+            self.act = torch.nn.functional.silu
+        else:
+            self.act = ACT2FN[act]
+
+        # gating
+        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)
+
+        # merged expert weights, each loaded as (n_experts * ffn_dim, hidden_dim) and viewed per expert
+        w1 = _load_experts(config, f"{prefix}.experts", "w1", weights).view(
+            self.num_experts, self.ffn_dim, self.hidden_dim
+        )
+        w3 = _load_experts(config, f"{prefix}.experts", "w3", weights).view(
+            self.num_experts, self.ffn_dim, self.hidden_dim
+        )
+        self.w13 = torch.cat([w1, w3], dim=1)  # (n_experts, 2 * ffn_dim, hidden_dim)
+        self.w2 = (
+            _load_experts(config, f"{prefix}.experts", "w2", weights)
+            .view(self.num_experts, self.ffn_dim, self.hidden_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
+        # w13 and w2 are plain tensor attributes; w2 is (n_experts, hidden_dim, ffn_dim).
+        self.process_group = weights.process_group
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(x)
+        out = fused_moe(  # NOTE(review): fused_moe is only imported when SYSTEM != "ipex"
+            x,
+            self.w13,
+            self.w2,
+            router_logits,
+            self.top_k,
+            renormalize=True,
+            inplace=True,
+        )
+
+        # Reduce sum
+        if self.process_group.size() > 1:
+            torch.distributed.all_reduce(out, group=self.process_group)
+        # The result is returned viewed with the same shape as the input.
+        return out.view(*x.shape)
+
+
+class DenseMoE(nn.Module):  # MoE feed-forward that runs every expert on every row
+    def __init__(self, prefix, config: MixtralConfig, weights):
+        super().__init__()
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size // weights.process_group.size()
+        self.num_experts = config.num_local_experts
+        self.top_k = config.num_experts_per_tok
+
+        act = config.hidden_act
+        if "gelu" in act:
+            self.act = lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        elif "silu" in act:
+            self.act = torch.nn.functional.silu
+        else:
+            self.act = ACT2FN[act]
+
+        # gating
+        self.gate = FastLinear.load(config, f"{prefix}.gate", weights, bias=False)
+        # NOTE(review): w1/w3/w2 are plain Python lists of layers, not an nn.ModuleList.
+        self.w1 = [
+            TensorParallelColumnLinear.load(
+                config, prefix=f"{prefix}.experts.{i}.w1", weights=weights, bias=False
+            )
+            for i in range(self.num_experts)
+        ]
+        self.w3 = [
+            TensorParallelColumnLinear.load(
+                config, prefix=f"{prefix}.experts.{i}.w3", weights=weights, bias=False
+            )
+            for i in range(self.num_experts)
+        ]
+        self.w2 = [
+            TensorParallelRowLinear.load(
+                config, prefix=f"{prefix}.experts.{i}.w2", weights=weights, bias=False
+            )
+            for i in range(self.num_experts)
+        ]
+
+        self.process_group = weights.process_group
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        x: (sequence_length, model_dim)
+        returns: (sequence_length, hidden_dim), all-reduced when the process group has > 1 rank
+        """
+        # optional reshape: flatten every leading dimension into one
+        input_shape = x.shape
+        x = x.view(-1, input_shape[-1])
+
+        # gate_logits: (sequence_length, n_experts)
+        gate_logits = self.gate(x)
+        # all_probs: (sequence_length, n_experts) and upcast for softmax
+        all_probs = torch.nn.functional.softmax(gate_logits, dim=1, dtype=torch.float)
+        # Per row, find the (num_experts - top_k) experts with the smallest probability.
+        if self.top_k < self.num_experts:
+            _, not_selected_experts = torch.topk(
+                all_probs,
+                self.num_experts - self.top_k,
+                largest=False,
+                sorted=False,
+                dim=1,
+            )
+            # Mask not selected experts by zeroing their probability in place
+            all_probs.scatter_(1, not_selected_experts, 0)
+
+        # Re-normalize
+        weights = all_probs / all_probs.sum(dim=1, keepdim=True)
+        weights = weights.to(x.dtype)
+
+        # Final output tensor
+        out = x.new_zeros(x.shape[0], self.hidden_dim)
+        for i in range(self.num_experts):
+            h = self.act(self.w1[i](x)) * self.w3[i](x)
+            h = self.w2[i](h, reduce=False)
+            # Add expert output to out with masking (masked experts have weight 0 for the row)
+            out += h * weights[:, i].view(-1, 1)
+
+        # Reduce sum once for all experts (w2 above is called with reduce=False)
+        if self.process_group.size() > 1:
+            torch.distributed.all_reduce(out, group=self.process_group)
+
+        return out
+
+
+class MixtralLayer(nn.Module):  # one decoder layer: self-attention, then the MoE block
+    def __init__(self, prefix: str, layer_id, config, weights):
+        super().__init__()
+        prefix = f"{prefix}.layers.{layer_id}"
+
+        self.self_attn = MixtralAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+        # Unquantized configs get BlockSparseMoE; any quantize setting selects DenseMoE.
+        moe_cls = BlockSparseMoE if config.quantize is None else DenseMoE
+        self.moe = moe_cls(f"{prefix}.block_sparse_moe", config, weights)
+
+        self.input_layernorm = FastRMSNorm.load(
+            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = FastRMSNorm.load(
+            prefix=f"{prefix}.post_attention_layernorm",
+            weights=weights,
+            eps=config.rms_norm_eps,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+        prefill_cache_indices,
+    ):
+        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
+        # `res` (second output of the norm) is fed to the post-attention norm below.
+        # Self Attention
+        attn_output = self.self_attn(
+            normed_hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            prefill_cache_indices,
+        )
+
+        # faster post attention rms norm
+        normed_attn_res_output, attn_res = self.post_attention_layernorm(
+            attn_output, res
+        )
+
+        moe_output = self.moe(normed_attn_res_output)
+        # The caller receives the MoE output together with the residual `attn_res`.
+        return moe_output, attn_res
+
+
+class MixtralModel(torch.nn.Module):  # token embeddings, decoder layers, final RMS norm
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        # Every weight name is rooted at "model", or "<prefix>.model" when a prefix is given.
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=(
+                "model.embed_tokens" if not prefix else f"{prefix}.model.embed_tokens"
+            ),
+            weights=weights,
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                MixtralLayer(
+                    "model" if not prefix else f"{prefix}.model",
+                    layer_id,
+                    config,
+                    weights,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = FastRMSNorm.load(
+            prefix="model.norm" if not prefix else f"{prefix}.model.norm",
+            weights=weights,
+            eps=config.rms_norm_eps,
+        )
+        # Re-export the attention geometry of the first layer on the model itself.
+        self.head_size = self.layers[0].self_attn.head_size
+        self.num_heads = self.layers[0].self_attn.num_heads
+        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        true_max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+
+        # Compute the rotary cos and sin once for this forward pass,
+        # so that the layers do not each have to index them again
+        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
+            position_ids, true_max_s, hidden_states.dtype
+        )
+        # The first layer gets residual=None; later layers get their predecessor's residual.
+        residual = None
+        for i, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[i],
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+                prefill_cache_indices,
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class FlashMixtralForCausalLM(torch.nn.Module):  # MixtralModel followed by the LM head
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        self.model = MixtralModel(prefix, config, weights)
+        self.lm_head = SpeculativeHead.load(
+            config,
+            prefix="lm_head" if not prefix else f"{prefix}.lm_head",
+            weights=weights,
+        )
+        self.max_past = config.sliding_window  # None skips the decode-time clamp in forward
+        self.max_past_tensor = (
+            torch.tensor(config.sliding_window, device=weights.device)
+            if self.max_past is not None
+            else None
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,  # accepted but not used in this method
+    ) -> torch.Tensor:
+        true_max_s = max_s  # max_s is not changed below, so both values stay equal here
+        if prefill_cache_indices is not None:
+            # Slots also need to be sliced as it has the same size as the whole kv tensor
+            slots = slots[prefill_cache_indices]
+        elif self.max_past is not None:
+            # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
+            # kernel requires the true values
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
+        # The inner model embeds input_ids itself, so the raw ids are passed through.
+        hidden_states = self.model(
+            input_ids,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            true_max_s,
+            prefill_cache_indices,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]  # only these rows reach lm_head
+        logits = self.lm_head(hidden_states)
+        return logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
index 9dc374df..0eca181b 100644
--- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
@@ -27,20 +27,24 @@ from transformers.modeling_utils import PreTrainedModel
 from transformers.models.gpt_neox import GPTNeoXConfig
 from typing import Optional, List, Tuple
 
-# vllm imports
-import vllm_cache_ops
-import vllm_attention_ops
-
-from text_generation_server.utils.flash_attn import attention
-from text_generation_server.utils.layers import (
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    TensorParallelHead,
-    FastLayerNorm,
-    PositionRotaryEmbedding,
+    SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
 
 
 def load_row(config, prefix: str, weights, bias: bool):
@@ -94,6 +98,8 @@ class FlashNeoxAttention(torch.nn.Module):
         self.hidden_size = hidden_size
         self.head_size = hidden_size // num_heads
 
+        self.rotary_dim = int(config.rotary_pct * self.head_size)
+
         if self.num_heads % weights.process_group.size() != 0:
             raise ValueError(
                 f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
@@ -101,8 +107,11 @@ class FlashNeoxAttention(torch.nn.Module):
             )
         self.num_heads = self.num_heads // weights.process_group.size()
 
-        self.rotary_emb = PositionRotaryEmbedding.load(
-            config=config, prefix=f"{prefix}.rotary_emb", weights=weights
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.rotary_dim,
+            base=config.rotary_emb_base,
+            device=weights.device,
         )
 
         self.softmax_scale = self.head_size ** (-0.5)
@@ -138,12 +147,9 @@ class FlashNeoxAttention(torch.nn.Module):
         qkv = qkv.view(-1, 3, self.num_heads, self.head_size)
 
         # Inplace rotary
-        self.rotary_emb(qkv[:, 0], cos, sin)
-        self.rotary_emb(qkv[:, 1], cos, sin)
+        self.rotary_emb(qkv[:, 0], qkv[:, 1], cos, sin)
 
-        vllm_cache_ops.reshape_and_cache(
-            qkv[:, 1], qkv[:, 2], kv_cache[0], kv_cache[1], slots
-        )
+        reshape_and_cache(qkv[:, 1], qkv[:, 2], kv_cache[0], kv_cache[1], slots)
 
         # output tensor
         attn_output = torch.empty_like(qkv[:, 0])
@@ -162,9 +168,7 @@ class FlashNeoxAttention(torch.nn.Module):
             )
         # Decode
         else:
-            # kv_cache[1] => [num_blocks, num_heads, head_size, block_size]
-            block_size = kv_cache[1].shape[3]
-            vllm_attention_ops.single_query_cached_kv_attention(
+            attn_output = paged_attention(
                 attn_output,
                 qkv[:, 0],
                 kv_cache[0],
@@ -173,7 +177,6 @@ class FlashNeoxAttention(torch.nn.Module):
                 self.softmax_scale,
                 block_tables,
                 input_lengths,
-                block_size,
                 max_s,
             )
 
@@ -189,9 +192,9 @@ class FlashMLP(nn.Module):
             if "gelu" not in act
             else lambda x: torch.nn.functional.gelu(
                 x,
-                approximate="tanh"
-                if act in ["gelu_fast", "gelu_pytorch_tanh"]
-                else "none",
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
             )
         )
 
@@ -302,12 +305,12 @@ class FlashGPTNeoXPreTrainedModel(PreTrainedModel):
 
 
 class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__(config)
         self.config = config
 
         self.embed_in = TensorParallelEmbedding(
-            prefix="gpt_neox.embed_in", weights=weights
+            prefix=f"{prefix}.embed_in", weights=weights
         )
 
         self.layers = nn.ModuleList(
@@ -317,7 +320,7 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
             ]
         )
         self.final_layer_norm = FastLayerNorm.load(
-            prefix="gpt_neox.final_layer_norm",
+            prefix=f"{prefix}.final_layer_norm",
             weights=weights,
             eps=config.layer_norm_eps,
         )
@@ -367,11 +370,17 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
 
 
 class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights):
         super().__init__(config)
-        self.gpt_neox = FlashGPTNeoXModel(config, weights)
 
-        self.embed_out = TensorParallelHead.load(
+        if not prefix:
+            prefix = "gpt_neox"
+        else:
+            prefix = f"{prefix}.gpt_neox"
+
+        self.gpt_neox = FlashGPTNeoXModel(prefix, config, weights)
+
+        self.embed_out = SpeculativeHead.load(
             config, prefix="embed_out", weights=weights
         )
 
@@ -385,7 +394,9 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel):
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
         lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         hidden_states = self.gpt_neox(
             input_ids,
diff --git a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
new file mode 100644
index 00000000..1f998e5a
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.tensor_parallel import TensorParallelColumnLinear
+from text_generation_server.models.custom_modeling.vlm import (
+    load_text_model,
+    load_vision_model,
+)
+
+
+class PaliGemmaForConditionalGeneration(nn.Module):  # vision tower -> linear projector -> text model; forward returns (logits, speculative_logits)
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        config.vision_config.quantize = config.quantize  # copy the quantization setting onto the vision sub-config
+        self.vision_tower = load_vision_model(
+            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
+            config=config.vision_config,
+            weights=weights,
+        )
+
+        self.multi_modal_projector = TensorParallelColumnLinear.load(
+            config,
+            prefix="multi_modal_projector.linear" if not prefix else f"{prefix}.multi_modal_projector.linear",  # honor `prefix` like the vision tower and text model do
+            weights=weights,
+            bias=True,
+        )
+
+        self.vocab_size = config.vocab_size
+        self.config = config
+
+        text_config = config.text_config
+        text_config.speculator = config.speculator  # the text sub-config inherits the speculator and quantization settings
+        text_config.quantize = config.quantize
+        self.text_model = load_text_model(
+            prefix="language_model" if not prefix else f"{prefix}.language_model",
+            config=config.text_config,
+            weights=weights,
+        )
+        self.pad_token_id = (  # -1 when the config defines no pad token; not read anywhere else in this class
+            config.pad_token_id if config.pad_token_id is not None else -1
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor] = None,  # not read in this method
+        lm_head_indices: Optional[torch.Tensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        # Unused here
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_sizes: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,  # not read in this method either
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        inputs_embeds = self.text_model.embed_tokens(input_ids)
+        # TODO This is odd but apparently pali gemma position ids start at 1.
+        if cu_seqlen_prefill is not None:
+            max_s += 1
+            position_ids += 1  # in-place: the caller's tensor is shifted too. NOTE(review): callers may rely on this, confirm before changing
+
+        if pixel_values is not None:
+            pixel_values = pixel_values.to(dtype=inputs_embeds.dtype)
+            image_outputs = self.vision_tower(pixel_values)
+            image_features = self.multi_modal_projector(image_outputs.last_hidden_state)
+
+            # mask of the positions that hold the image placeholder token
+            mask = input_ids == self.config.image_token_index
+
+            # insert image features into input embeddings (features flattened to one row per masked position)
+            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+
+        hidden_states = self.text_model.model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            cu_seqlen_prefill=cu_seqlen_prefill,
+            kv_cache=kv_cache,
+            block_tables=block_tables,
+            slots=slots,
+            input_lengths=input_lengths,
+            max_s=max_s,
+        )
+
+        if lm_head_indices is not None:  # keep only the rows selected for the LM head
+            hidden_states = hidden_states[lm_head_indices]
+        logits, speculative_logits = self.text_model.lm_head(hidden_states)
+
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
new file mode 100644
index 00000000..7401bc27
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
@@ -0,0 +1,424 @@
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+
+
+class PhiConfig(PretrainedConfig):  # hyper-parameter container for Phi; every argument below is stored as an attribute or forwarded to PretrainedConfig
+    def __init__(
+        self,
+        vocab_size=51200,
+        hidden_size=2560,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="gelu_fast",  # llama uses silu
+        layer_norm_eps=1e-05,  # rms in llama
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        resid_pdrop=0.1,  # llama doesn't have this
+        partial_rotary_factor=0.5,  # important difference between llama and phi
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.rope_theta = rope_theta
+        self.resid_pdrop = resid_pdrop
+        self.partial_rotary_factor = partial_rotary_factor
+
+        super().__init__(  # special-token ids, embedding tying and any extra kwargs are handed to PretrainedConfig
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+# this is the same as llama except for Phi uses bias=True
+def load_attention(config, prefix, weights):
+    # Differing query / key-value head counts go through the dedicated GQA loader.
+    if config.num_attention_heads != config.num_key_value_heads:
+        return _load_gqa(config, prefix, weights)
+
+    # Equal head counts: load the q/k/v projections as one fused column-parallel
+    # layer, concatenated along dim 0 and with their biases.
+    qkv_prefixes = [f"{prefix}.{proj}" for proj in ("q_proj", "k_proj", "v_proj")]
+    return TensorParallelColumnLinear.load_multi(
+        config, prefixes=qkv_prefixes, dim=0, weights=weights, bias=True
+    )
+
+
+def _load_gqa(config, prefix: str, weights):  # fused QKV loader for configs whose query and key/value head counts differ
+    assert config.hidden_size % config.num_attention_heads == 0
+    assert config.num_attention_heads % weights.process_group.size() == 0
+
+    weight = weights.get_multi_weights_col(
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+        quantize=config.quantize,
+        dim=0,
+    )
+
+    if config.quantize not in ["gptq", "awq", "marlin"]:  # dtype/device move and shape check are skipped for these quantization modes
+        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
+
+        head_size = config.hidden_size // config.num_attention_heads
+        num_heads = config.num_attention_heads // weights.process_group.size()
+        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
+        assert list(weight.shape) == [
+            (num_heads + 2 * num_key_value_heads) * head_size,
+            config.hidden_size,
+        ], f"{list(weight.shape)} != {[(num_heads + 2 * num_key_value_heads) * head_size, config.hidden_size]}"
+
+    # Same as llama except that Phi uses a bias. NOTE(review): the literal `True` (not a loaded bias tensor) is passed as `bias`; confirm get_linear accepts that.
+    return TensorParallelColumnLinear(
+        get_linear(weight, bias=True, quantize=config.quantize)
+    )
+
+
+class FlashPhiAttention(torch.nn.Module):  # Phi self-attention: fused QKV projection, partial rotary embedding, prefill/decode attention kernels
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.num_heads
+
+        self.softmax_scale = self.head_size**-0.5
+        self.rotary_dim = int(config.partial_rotary_factor * self.head_size)  # number of leading dims of each head that get rotated
+
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.rotary_dim,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()})"
+            )
+
+        self.num_heads = self.num_heads // weights.process_group.size()  # from here on: heads owned by this shard
+        self.num_key_value_heads = (
+            config.num_key_value_heads // weights.process_group.size()
+        )
+
+        self.query_key_value = load_attention(config, prefix, weights)
+
+        # in llama the dense layer is called "o_proj" and has bias=False
+        self.dense = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.dense",
+            weights=weights,
+            bias=True,
+        )
+        self.num_groups = self.num_heads // self.num_key_value_heads  # query heads per key/value head
+        self.kv_head_mapping = torch.arange(
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)  # each KV head index repeated num_groups times, i.e. one entry per query head
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        # Compute query, key, value and split
+        qkv = self.query_key_value(hidden_states)
+        query, kv = qkv.split(
+            [
+                self.head_size * self.num_heads,
+                2 * self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+
+        # Reshape query and key for rotary embeddings
+        query = query.view(-1, self.num_heads, self.head_size)
+        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)  # index 0 on dim 1 is the key, index 1 the value
+
+        # NOTE: this is the main difference between Llama and Phi
+        # in llama the rotary embeddings are applied to the whole query and key.
+        # Phi uses PARTIAL rotary embeddings, which are applied to the first `self.rotary_dim` dimensions of each head
+        #
+        # Apply partial positional embeddings in place
+        self.rotary_emb(
+            query[:, :, : self.rotary_dim], kv[:, 0, :, : self.rotary_dim], cos, sin
+        )
+
+        # Reshape key and value and cache
+        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            attention(  # return value unused; attn_output is passed in as the output buffer
+                query,
+                torch.select(kv, dim=1, index=0),
+                torch.select(kv, dim=1, index=1),
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))  # flatten heads before the output projection
+
+
+class PhiMLP(nn.Module):  # two-layer feed-forward block: fc1 -> activation -> fc2, both with bias
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        act = config.hidden_act
+        self.act = (  # any activation whose name contains "gelu" goes through torch's gelu; others are looked up in ACT2FN
+            ACT2FN[act]
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        )
+
+        # llama weights are up_proj and down_proj and bias=False
+        self.up_proj = TensorParallelColumnLinear.load(  # loaded from the checkpoint's "fc1" weights
+            config,
+            prefix=f"{prefix}.fc1",
+            weights=weights,
+            bias=True,
+        )
+        self.down_proj = TensorParallelRowLinear.load(  # loaded from the checkpoint's "fc2" weights
+            config,
+            prefix=f"{prefix}.fc2",
+            weights=weights,
+            bias=True,
+        )
+
+    def forward(self, hidden_states):
+        # NOTE: Llama requires the gate up states to an intermediate size
+        # Phi does not and we can avoid the `view` operation
+        return self.down_proj(self.act(self.up_proj(hidden_states)))
+
+
+class FlashPhiLayer(nn.Module):  # one Phi decoder layer: a single layer norm feeding attention and MLP side by side
+    def __init__(self, prefix: str, layer_id, config, weights):
+        super().__init__()
+        prefix = f"{prefix}.layers.{layer_id}"  # all weights of this layer live under "<prefix>.layers.<layer_id>"
+        self.self_attn = FlashPhiAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+        self.mlp = PhiMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+        self.input_layernorm = FastLayerNorm.load(
+            prefix=f"{prefix}.input_layernorm",
+            weights=weights,
+            eps=config.layer_norm_eps,
+        )
+        self.resid_dropout = torch.nn.Dropout(config.resid_pdrop)  # torch.nn.Dropout is a no-op when the module is in eval mode
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+    ):
+        hidden_states, res = self.input_layernorm(hidden_states, residual)  # presumably folds `residual` in before normalizing; confirm in FastLayerNorm
+        # Self Attention
+        attn_output = self.self_attn(
+            hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+
+        hidden_states = self.resid_dropout(attn_output).add(  # attention and MLP both read the same normalized input; their outputs are summed
+            self.resid_dropout(self.mlp(hidden_states))
+        )
+
+        return hidden_states, res  # `res` is handed to the next layer as its `residual`
+
+
+class FlashPhiModel(torch.nn.Module):  # Phi backbone: token embedding, stack of FlashPhiLayer, final layer norm
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        process_group = weights.process_group
+        self.tp_rank = process_group.rank()
+        self.tp_world_size = process_group.size()
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.embed_tokens", weights=weights
+        )
+        self.layers = nn.ModuleList(
+            [
+                FlashPhiLayer(
+                    prefix,
+                    layer_id,
+                    config,
+                    weights,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.gradient_checkpointing = False
+
+        self.head_size = self.layers[0].self_attn.head_size  # attention geometry mirrored from the first layer
+        self.num_heads = self.layers[0].self_attn.num_heads
+        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
+
+        self.norm = FastLayerNorm.load(
+            prefix=f"{prefix}.final_layernorm",  # follow `prefix` like every other weight (equals "model.final_layernorm" when prefix == "model")
+            weights=weights,
+            eps=config.layer_norm_eps,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+
+        # Get rotary cos and sin for this forward
+        # Avoid to index in each layer
+        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
+            position_ids, max_s, hidden_states.dtype
+        )
+
+        residual = None  # the first layer starts without a residual
+        for i, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[i],  # each layer gets its own (key, value) cache pair
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)  # the residual returned by the final norm is discarded
+
+        return hidden_states
+
+
+class FlashPhiForCausalLM(torch.nn.Module):  # FlashPhiModel backbone followed by an LM head
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        if not prefix:  # backbone weights live under "model", or "<prefix>.model" when a prefix is given
+            prefix = "model"
+        else:
+            prefix = f"{prefix}.model"
+
+        self.model = FlashPhiModel(prefix, config, weights)
+        self.lm_head = SpeculativeHead.load(
+            config,
+            prefix="lm_head",  # NOTE(review): always loaded from the top-level "lm_head", regardless of `prefix`; confirm this is intended
+            weights=weights,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],  # accepted but not read in this method
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,  # accepted but not read in this method
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+        )
+        if lm_head_indices is not None:  # keep only the rows selected for the LM head
+            hidden_states = hidden_states[lm_head_indices]
+
+        return self.lm_head(hidden_states)  # NOTE(review): returns whatever SpeculativeHead returns; verify it matches the annotated torch.Tensor
diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
new file mode 100644
index 00000000..a98709c5
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@@ -0,0 +1,394 @@
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.layernorm import (
+    FastRMSNorm,
+)
+
+
+def load_attention(config, prefix, weights):  # builds the fused q/k/v projection (with bias) for one attention layer
+    if config.num_attention_heads != config.num_key_value_heads:
+        return _load_gqa(config, prefix, weights)  # same load_multi call as below, preceded by divisibility assertions
+    else:
+        return TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            dim=0,
+            weights=weights,
+            bias=True,
+        )
+
+
+def _load_gqa(config, prefix: str, weights):
+    # Head layout must divide evenly, both per head and across shards.
+    assert config.hidden_size % config.num_attention_heads == 0
+    assert config.num_attention_heads % weights.process_group.size() == 0
+
+    # Load q/k/v as one fused column-parallel layer, concatenated on dim 0,
+    # including the projection biases.
+    qkv_prefixes = [f"{prefix}.{proj}" for proj in ("q_proj", "k_proj", "v_proj")]
+    return TensorParallelColumnLinear.load_multi(
+        config, prefixes=qkv_prefixes, dim=0, weights=weights, bias=True
+    )
+
+
+class Qwen2Attention(torch.nn.Module):  # Qwen2 self-attention: fused QKV with bias, full rotary embedding, optional sliding window
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.max_past = (  # sliding-window size, or -1 when the config has none; passed as window_size_left in prefill
+            config.sliding_window if config.sliding_window is not None else -1
+        )
+        self.num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.num_heads
+
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+
+        self.softmax_scale = self.head_size**-0.5
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()})"
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()  # from here on: heads owned by this shard
+        self.num_key_value_heads = (
+            config.num_key_value_heads // weights.process_group.size()
+        )
+
+        self.query_key_value = load_attention(config, prefix, weights)
+
+        self.o_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.o_proj",
+            weights=weights,
+            bias=False,
+        )
+        self.num_groups = self.num_heads // self.num_key_value_heads  # query heads per key/value head
+        self.kv_head_mapping = torch.arange(
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)  # each KV head index repeated num_groups times, i.e. one entry per query head
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+        prefill_cache_indices,
+    ):
+        qkv = self.query_key_value(hidden_states)
+        query, kv = qkv.split(  # first chunk is the query, second the fused key/value
+            [
+                self.head_size * self.num_heads,
+                2 * self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)  # index 0 on dim 1 is the key, index 1 the value
+
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)  # rotary is applied to query and key; result not reassigned (presumably in place)
+
+        if prefill_cache_indices is not None:  # only the selected rows are written to the cache; attention below still sees the full kv
+            kv_to_cache = kv[prefill_cache_indices]
+        else:
+            kv_to_cache = kv
+
+        reshape_and_cache(
+            kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
+        )
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attention(  # return value unused; attn_output is passed in as the output buffer
+                query,
+                torch.select(kv, dim=1, index=0),
+                torch.select(kv, dim=1, index=1),
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+                window_size_left=self.max_past,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))  # flatten heads before the output projection
+
+
+class Qwen2MLP(nn.Module):  # gated feed-forward block: down_proj(act(gate) * up), all projections without bias
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        act = config.hidden_act
+        self.act = (  # any activation whose name contains "gelu" goes through torch's gelu; others are looked up in ACT2FN
+            ACT2FN[act]
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        )
+        # Fuse gate and up proj
+        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+            weights=weights,
+            dim=0,
+            bias=False,
+        )
+        self.down_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.down_proj",
+            weights=weights,
+            bias=False,
+        )
+        self.intermediate_size = (  # per-shard intermediate width
+            config.intermediate_size // weights.process_group.size()
+        )
+
+    def forward(self, hidden_states):
+        gate_up_states = self.gate_up_proj(hidden_states)
+        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)  # index 0 on dim 1 is the gate half, index 1 the up half
+        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+
+
+class Qwen2Layer(nn.Module):  # one Qwen2 decoder layer: RMS norm -> attention -> RMS norm -> MLP
+    def __init__(self, prefix, layer_id, config, weights):
+        super().__init__()
+        prefix = f"{prefix}.layers.{layer_id}"  # all weights of this layer live under "<prefix>.layers.<layer_id>"
+        self.self_attn = Qwen2Attention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+        self.mlp = Qwen2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+        self.input_layernorm = FastRMSNorm.load(
+            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = FastRMSNorm.load(
+            prefix=f"{prefix}.post_attention_layernorm",
+            weights=weights,
+            eps=config.rms_norm_eps,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+        prefill_cache_indices,
+    ):
+        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)  # presumably folds `residual` in before normalizing; confirm in FastRMSNorm
+
+        # Self Attention
+        attn_output = self.self_attn(
+            normed_hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            prefill_cache_indices,
+        )
+
+        # faster post attention rms norm
+        normed_attn_res_output, attn_res = self.post_attention_layernorm(
+            attn_output, res
+        )
+
+        mlp_output = self.mlp(normed_attn_res_output)
+
+        return mlp_output, attn_res  # the MLP output is returned without adding attn_res; the residual is passed along separately
+
+
+class Qwen2Model(torch.nn.Module):
+    """Qwen2 decoder stack: token embedding, the layer list and a final RMSNorm."""
+
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        group = weights.process_group
+        self.tp_rank = group.rank()
+        self.tp_world_size = group.size()
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.embed_tokens", weights=weights
+        )
+        self.layers = nn.ModuleList(
+            [
+                Qwen2Layer(prefix, idx, config, weights)
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = FastRMSNorm.load(
+            prefix=f"{prefix}.norm", weights=weights, eps=config.rms_norm_eps
+        )
+
+        self.gradient_checkpointing = False
+
+        # Attention geometry is copied from the first layer.
+        first_attn = self.layers[0].self_attn
+        self.head_size = first_attn.head_size
+        self.num_heads = first_attn.num_heads
+        self.num_key_value_heads = first_attn.num_key_value_heads
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        true_max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+
+        # Fetch the rotary cos/sin once for this forward pass so that the
+        # layers do not each have to index them.
+        rotary = self.layers[0].self_attn.rotary_emb
+        cos, sin = rotary.get_cos_sin(position_ids, true_max_s, hidden_states.dtype)
+
+        residual = None  # the first layer is called without a residual
+        for layer_idx, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[layer_idx],
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+                prefill_cache_indices,
+            )
+
+        # Final norm over the last layer's output and residual; only the
+        # first of its two outputs is returned.
+        normed, _ = self.norm(hidden_states, residual)
+
+        return normed
+
+
+class Qwen2ForCausalLM(torch.nn.Module):
+    """Qwen2 decoder followed by a `SpeculativeHead` that produces the logits."""
+
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        # Nest "model" below the caller's prefix when one is given.
+        prefix = f"{prefix}.model" if prefix else "model"
+
+        self.model = Qwen2Model(prefix, config, weights)
+        # NOTE(review): the head is loaded from the bare "lm_head" prefix even
+        # when a non-empty prefix was passed in -- confirm this is intended.
+        self.lm_head = SpeculativeHead.load(config, prefix="lm_head", weights=weights)
+        self.max_past = config.sliding_window
+        if self.max_past is None:
+            self.max_past_tensor = None
+        else:
+            self.max_past_tensor = torch.tensor(
+                config.sliding_window, device=weights.device
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor] = None,
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # `adapter_data` is accepted but not used in this method.
+        if prefill_cache_indices is not None:
+            # Prefill: `slots` has the same size as the whole kv tensor, so it
+            # is sliced down to the cached indices.
+            slots = slots[prefill_cache_indices]
+        elif self.max_past is not None:
+            # Decode with a sliding window: paged attention requires clamped
+            # lengths, whereas the flash attention kernel requires the true values.
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
+
+        hidden_states = self.model(
+            input_ids,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            max_s,  # true_max_s: `max_s` is never modified in this method
+            prefill_cache_indices,
+        )
+
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]
+        return self.lm_head(hidden_states)
diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
index 14caa23d..d12ed567 100644
--- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
@@ -1,25 +1,25 @@
+from typing import List, Optional, Tuple
+
 import torch
 import torch.distributed
-
 from torch import nn
-from transformers.modeling_utils import PreTrainedModel
 from transformers.configuration_utils import PretrainedConfig
-from typing import Optional, List, Tuple
+from transformers.modeling_utils import PreTrainedModel
 
-# vllm imports
-import vllm_cache_ops
-import vllm_attention_ops
-
-from text_generation_server.utils.flash_attn import attention
-from text_generation_server.utils.layers import (
-    TensorParallelRowLinear,
+from text_generation_server.layers import (
+    SpeculativeHead,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    TensorParallelHead,
-    FastLayerNorm,
-    PositionRotaryEmbedding,
+    TensorParallelRowLinear,
     get_linear,
 )
+from text_generation_server.layers.layernorm import FastLayerNorm
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.layers.attention import (
+    attention,
+    paged_attention,
+    reshape_and_cache,
+)
 
 
 def load_row(config, prefix: str, weights, bias: bool):
@@ -51,6 +51,7 @@ class RWConfig(PretrainedConfig):
         hidden_size=64,
         num_hidden_layers=None,
         num_attention_heads=None,
+        num_ln_in_prallel_attention=None,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         use_cache=True,
@@ -64,6 +65,7 @@ class RWConfig(PretrainedConfig):
         new_decoder_architecture=None,
         bias=False,
         parallel_attn=False,
+        rope_theta=10_000.0,
         **kwargs,
     ):
         if alibi:
@@ -74,6 +76,7 @@ class RWConfig(PretrainedConfig):
         self.model_type = model_type
         self.alibi = False
         self.rotary = True
+        self.rope_theta = rope_theta
 
         self.vocab_size = vocab_size
         # Backward compatibility with n_embed kwarg
@@ -90,6 +93,7 @@ class RWConfig(PretrainedConfig):
             else kwargs.pop("n_head", 8)
         )
         self.layer_norm_epsilon = layer_norm_epsilon
+        self.num_ln_in_parallel_attention = num_ln_in_prallel_attention
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.hidden_dropout = hidden_dropout
@@ -123,7 +127,7 @@ class FlashRWAttention(torch.nn.Module):
     def __init__(
         self,
         config,
-        prefix,
+        prefix: str,
         weights,
     ):
         super().__init__()
@@ -131,9 +135,13 @@ class FlashRWAttention(torch.nn.Module):
         self.num_heads_kv = config.n_head_kv
         self.hidden_size = config.hidden_size
         self.head_size = self.hidden_size // self.num_heads
+        self.rope_theta = config.rope_theta
 
         self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config, dim=self.head_size, base=10000.0, device=weights.device
+            config=config,
+            dim=self.head_size,
+            base=self.rope_theta,
+            device=weights.device,
         )
         self.softmax_scale = self.head_size ** (-0.5)
 
@@ -188,12 +196,9 @@ class FlashRWAttention(torch.nn.Module):
         kv = kv.view(-1, 2, self.num_heads_kv, self.head_size)
 
         # Inplace rotary
-        self.rotary_emb(query, cos, sin)
-        self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin)
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)
 
-        vllm_cache_ops.reshape_and_cache(
-            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
-        )
+        reshape_and_cache(kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots)
 
         # output
         attn_output = torch.empty_like(query)
@@ -212,9 +217,7 @@ class FlashRWAttention(torch.nn.Module):
             )
         # Decode
         else:
-            # kv_cache[1] => [num_blocks, num_heads_kv, head_size, block_size]
-            block_size = kv_cache[1].shape[3]
-            vllm_attention_ops.single_query_cached_kv_attention(
+            attn_output = paged_attention(
                 attn_output,
                 query,
                 kv_cache[0],
@@ -223,7 +226,6 @@ class FlashRWAttention(torch.nn.Module):
                 self.softmax_scale,
                 block_tables,
                 input_lengths,
-                block_size,
                 max_s,
             )
 
@@ -234,36 +236,43 @@ class FlashRWLargeAttention(torch.nn.Module):
     def __init__(
         self,
         config,
-        prefix,
+        prefix: str,
         weights,
     ):
         super().__init__()
 
         hidden_size = config.hidden_size
         num_heads = config.n_head
-        num_heads_kv = config.n_head_kv
+        # num_heads_kv = config.n_head_kv
+        num_groups = config.n_head_kv
 
         self.hidden_size = hidden_size
         self.head_size = hidden_size // num_heads
+        self.num_groups = num_groups
+        self.rope_theta = config.rope_theta
 
         self.rotary_emb = PositionRotaryEmbedding.static(
-            config=config, dim=self.head_size, base=10000.0, device=weights.device
+            config=config,
+            dim=self.head_size,
+            base=self.rope_theta,
+            device=weights.device,
         )
         self.softmax_scale = self.head_size ** (-0.5)
 
-        self.num_groups = num_heads // (num_heads_kv * 2)
+        # self.num_groups = num_heads // (num_heads_kv * 2)
         self.num_heads = num_heads // self.num_groups
-        self.num_heads_kv = num_heads_kv // self.num_groups
+        # self.num_heads_kv = num_heads_kv // self.num_groups
         process_group = weights.process_group
 
         if process_group.size() > self.num_groups:
             raise NotImplementedError(
-                f"Tensor Parallelism is not implemented for world_size > n groups"
+                "Tensor Parallelism is not implemented for world_size > n groups"
             )
         if self.num_groups % process_group.size() != 0:
             raise NotImplementedError(
                 f"Tensor Parallelism is not implemented for {self.num_groups} not divisible by {process_group.size()}"
             )
+
         self.num_groups = self.num_groups // process_group.size()
 
         self.query_key_value = TensorParallelColumnLinear.load(
@@ -304,10 +313,9 @@ class FlashRWLargeAttention(torch.nn.Module):
         query = query.reshape(-1, self.num_groups * self.num_heads, self.head_size)
 
         # Inplace rotary
-        self.rotary_emb(query, cos, sin)
-        self.rotary_emb(torch.select(kv, dim=2, index=0), cos, sin)
+        self.rotary_emb(query, torch.select(kv, dim=2, index=0), cos, sin)
 
-        vllm_cache_ops.reshape_and_cache(
+        reshape_and_cache(
             kv[:, :, 0].contiguous(),
             kv[:, :, 1].contiguous(),
             kv_cache[0],
@@ -332,9 +340,7 @@ class FlashRWLargeAttention(torch.nn.Module):
             )
         # Decode
         else:
-            # kv_cache[1] => [num_blocks, num_groups, head_size, block_size]
-            block_size = kv_cache[1].shape[3]
-            vllm_attention_ops.single_query_cached_kv_attention(
+            attn_output = paged_attention(
                 attn_output,
                 query,
                 kv_cache[0],
@@ -343,7 +349,6 @@ class FlashRWLargeAttention(torch.nn.Module):
                 self.softmax_scale,
                 block_tables,
                 input_lengths,
-                block_size,
                 max_s,
             )
 
@@ -353,7 +358,7 @@ class FlashRWLargeAttention(torch.nn.Module):
 
 
 class FlashMLP(nn.Module):
-    def __init__(self, config, prefix, weights):
+    def __init__(self, config, prefix: str, weights):
         super().__init__()
         self.act = torch.nn.functional.gelu
 
@@ -375,6 +380,7 @@ class FlashRWLayer(nn.Module):
     def __init__(
         self,
         layer_id,
+        prefix: str,
         config,
         weights,
     ):
@@ -383,7 +389,7 @@ class FlashRWLayer(nn.Module):
         parallel_attn = config.parallel_attn
         self.parallel_attn = parallel_attn
 
-        prefix = f"transformer.h.{layer_id}"
+        prefix = f"{prefix}.h.{layer_id}"
 
         self.input_layernorm = FastLayerNorm.load(
             prefix=f"{prefix}.input_layernorm",
@@ -463,29 +469,61 @@ class FlashRWLayer(nn.Module):
                 max_s,
             )
 
-            hidden_states, residual = self.post_attention_layernorm(
-                hidden_states, residual
-            )
+            if self.post_attention_layernorm is not None:
+                hidden_states, residual = self.post_attention_layernorm(
+                    hidden_states, residual
+                )
 
             mlp_output = self.mlp(hidden_states)
 
             return mlp_output, residual
 
 
-class FlashRWLargeLayer(nn.Module):
-    def __init__(self, layer_id, config, weights):
+class FlashRWLayerNorm(nn.Module):
+    def __init__(self, config, prefix: str, weights):
         super().__init__()
-        prefix = f"transformer.h.{layer_id}"
-        self.ln_attn = FastLayerNorm.load(
-            prefix=f"{prefix}.ln_attn",
-            weights=weights,
-            eps=config.layer_norm_epsilon,
-        )
-        self.ln_mlp = FastLayerNorm.load(
-            prefix=f"{prefix}.ln_mlp",
-            weights=weights,
-            eps=config.layer_norm_epsilon,
-        )
+        # Falcon2 configs name the key `num_ln_in_parallel_attn`, while RWConfig
+        # stores its own kwarg as `num_ln_in_parallel_attention`; accept both.
+        num_ln = getattr(config, "num_ln_in_parallel_attn", None)
+        if num_ln is None:
+            num_ln = getattr(config, "num_ln_in_parallel_attention", None)
+        # Unspecified: keep the ln_attn/ln_mlp pair this layer used before.
+        self.num_ln = 2 if num_ln is None else num_ln
+        eps = config.layer_norm_epsilon
+
+        if self.num_ln == 1:
+            self.input_ln = FastLayerNorm.load(
+                prefix=f"{prefix}.input_layernorm", weights=weights, eps=eps
+            )
+        elif self.num_ln == 2:
+            self.ln_attn = FastLayerNorm.load(
+                prefix=f"{prefix}.ln_attn", weights=weights, eps=eps
+            )
+            self.ln_mlp = FastLayerNorm.load(
+                prefix=f"{prefix}.ln_mlp", weights=weights, eps=eps
+            )
+        else:
+            raise ValueError("Number of layer norms can either be 1 or 2.")
+
+    def forward(self, hidden_states, residual):
+        """Return the `(ln_attn, ln_mlp, residual)` triple the layer unpacks."""
+        if self.num_ln == 1:
+            # A single norm: its output is returned for both slots.
+            ln_hidden_states, residual = self.input_ln(hidden_states, residual)
+            return ln_hidden_states, ln_hidden_states, residual
+        elif self.num_ln == 2:
+            # ln_mlp is applied to the residual produced by ln_attn.
+            ln_attn, residual = self.ln_attn(hidden_states, residual)
+            ln_mlp, _ = self.ln_mlp(residual)
+            return ln_attn, ln_mlp, residual
+
+
+class FlashRWLargeLayer(nn.Module):
+    def __init__(self, layer_id, prefix: str, config, weights):
+        super().__init__()
+        prefix = f"{prefix}.h.{layer_id}"
+
+        self.ln_layer = FlashRWLayerNorm(config, prefix, weights)
 
         self.self_attention = FlashRWLargeAttention(
             config,
@@ -511,8 +549,8 @@ class FlashRWLargeLayer(nn.Module):
         input_lengths,
         max_s,
     ):
-        ln_attn, residual = self.ln_attn(hidden_states, residual)
-        ln_mlp, _ = self.ln_mlp(residual)
+        # Layer norm.
+        ln_attn, ln_mlp, residual = self.ln_layer(hidden_states, residual)
 
         # Self attention.
         attn_output = self.self_attention(
@@ -543,18 +581,18 @@ class FlashRWPreTrainedModel(PreTrainedModel):
 
 
 class FlashRWModel(FlashRWPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__(config)
         self.config = config
 
         self.word_embeddings = TensorParallelEmbedding(
-            prefix="transformer.word_embeddings", weights=weights
+            prefix=f"{prefix}.word_embeddings", weights=weights
         )
 
         if config.new_decoder_architecture:
             self.h = nn.ModuleList(
                 [
-                    FlashRWLargeLayer(layer_id, config, weights)
+                    FlashRWLargeLayer(layer_id, prefix, config, weights)
                     for layer_id in range(config.num_hidden_layers)
                 ]
             )
@@ -562,14 +600,14 @@ class FlashRWModel(FlashRWPreTrainedModel):
         else:
             self.h = nn.ModuleList(
                 [
-                    FlashRWLayer(layer_id, config, weights)
+                    FlashRWLayer(layer_id, prefix, config, weights)
                     for layer_id in range(config.num_hidden_layers)
                 ]
             )
             self.cache_size = self.h[0].self_attention.num_heads_kv
 
         self.ln_f = FastLayerNorm.load(
-            prefix="transformer.ln_f",
+            prefix=f"{prefix}.ln_f",
             weights=weights,
             eps=config.layer_norm_epsilon,
         )
@@ -616,14 +654,17 @@ class FlashRWModel(FlashRWPreTrainedModel):
 
 
 class FlashRWForCausalLM(FlashRWPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__(config)
 
-        self.transformer = FlashRWModel(config, weights)
+        # Nest "transformer" below the caller's prefix when one is given.
+        prefix = f"{prefix}.transformer" if prefix else "transformer"
+
+        self.transformer = FlashRWModel(prefix, config, weights)
 
-        self.lm_head = TensorParallelHead.load(
-            config, prefix="lm_head", weights=weights
-        )
+        # NOTE(review): the head is loaded from the bare "lm_head" prefix even
+        # when a non-empty prefix was passed in -- confirm this is intended.
+        self.lm_head = SpeculativeHead.load(config, prefix="lm_head", weights=weights)
 
     def forward(
         self,
@@ -635,7 +676,9 @@ class FlashRWForCausalLM(FlashRWPreTrainedModel):
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
         lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(
             input_ids,
diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
index 2dd0a5ee..21a22046 100644
--- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
@@ -5,20 +5,21 @@ from torch import nn
 from transformers.activations import ACT2FN
 from typing import Optional, List, Tuple
 
-# vllm imports
-import vllm_cache_ops
-import vllm_attention_ops
-
-from text_generation_server.utils.flash_attn import attention
-from text_generation_server.utils.layers import (
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
-    TensorParallelHead,
+    SpeculativeHead,
     TensorParallelEmbedding,
-    FastLayerNorm,
     get_linear,
 )
-from safetensors import SafetensorError
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+)
 
 
 def load_multi_mqa(
@@ -28,6 +29,10 @@ def load_multi_mqa(
         return _load_multi_mqa_gptq(
             config, prefix, weights, bias, head_size, num_heads, hidden_size
         )
+    elif config.quantize == "marlin":
+        raise RuntimeError(
+            "santacoder models with marlin quantization are not yet supported"
+        )
     else:
         return _load_multi_mqa(
             config, prefix, weights, bias, head_size, num_heads, hidden_size
@@ -37,6 +42,8 @@ def load_multi_mqa(
 def _load_multi_mqa_gptq(
     config, prefix: str, weights, bias: bool, head_size, num_heads, hidden_size
 ):
+    from text_generation_server.layers.gptq import GPTQWeight
+
     if any("c_attn" in k for k in weights.routing.keys()) and not config.transpose:
         world_size = weights.process_group.size()
         rank = weights.process_group.rank()
@@ -74,14 +81,29 @@ def _load_multi_mqa_gptq(
         qzeros = torch.cat([q_tensor, kv_tensor], dim=1)
         qzeros = qzeros.to(device=weights.device)
 
-        g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx")
-        g_idx = g_idx.to(device=weights.device)
-        bits, groupsize = weights._get_gptq_params()
+        gptq_params = weights._get_gptq_params()
+        if gptq_params.quant_method == "gptq":
+            g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx")
+            g_idx = g_idx.to(device=weights.device)
+        elif gptq_params.quant_method == "awq":
+            g_idx = None
+            from text_generation_server.layers.awq.conversion_utils import (
+                fast_awq_to_gptq,
+            )
 
-        from text_generation_server.utils.layers import HAS_EXLLAMA
+            qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
 
-        use_exllama = HAS_EXLLAMA
-        weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
+        from text_generation_server.layers.gptq import HAS_EXLLAMA
+
+        weight = GPTQWeight(
+            qweight=qweight,
+            qzeros=qzeros,
+            scales=scales,
+            g_idx=g_idx,
+            bits=gptq_params.bits,
+            groupsize=gptq_params.groupsize,
+            use_exllama=HAS_EXLLAMA,
+        )
 
         if bias:
             slice_ = weights._get_slice(f"{prefix}.c_attn.bias")
@@ -258,7 +280,7 @@ class FlashMQAttention(torch.nn.Module):
         query = query.view(-1, self.num_heads, self.head_size)
         key_value = key_value.view(-1, 2, 1, self.head_size)
 
-        vllm_cache_ops.reshape_and_cache(
+        reshape_and_cache(
             key_value[:, 0], key_value[:, 1], kv_cache[0], kv_cache[1], slots
         )
 
@@ -279,9 +301,7 @@ class FlashMQAttention(torch.nn.Module):
             )
         # Decode
         else:
-            # kv_cache[1] => [num_blocks, 1, head_size, block_size]
-            block_size = kv_cache[1].shape[3]
-            vllm_attention_ops.single_query_cached_kv_attention(
+            attn_output = paged_attention(
                 attn_output,
                 query,
                 kv_cache[0],
@@ -290,7 +310,6 @@ class FlashMQAttention(torch.nn.Module):
                 self.softmax_scale,
                 block_tables,
                 input_lengths,
-                block_size,
                 max_s,
             )
 
@@ -306,9 +325,9 @@ class MLP(nn.Module):
             if "gelu" not in act
             else lambda x: torch.nn.functional.gelu(
                 x,
-                approximate="tanh"
-                if act in ["gelu_fast", "gelu_pytorch_tanh"]
-                else "none",
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
             )
         )
 
@@ -327,16 +346,16 @@ class MLP(nn.Module):
 
 
 class Block(nn.Module):
-    def __init__(self, layer_id, config, weights):
+    def __init__(self, prefix: str, layer_id, config, weights):
         super().__init__()
-        prefix = f"transformer.h.{layer_id}"
+        prefix = f"{prefix}.h.{layer_id}"
         self.ln_1 = FastLayerNorm.load(
             prefix=f"{prefix}.ln_1", weights=weights, eps=config.layer_norm_epsilon
         )
         self.ln_2 = FastLayerNorm.load(
             prefix=f"{prefix}.ln_2", weights=weights, eps=config.layer_norm_epsilon
         )
-        self.attn = FlashMQAttention(
+        self.self_attn = FlashMQAttention(
             prefix=f"{prefix}.attn",
             config=config,
             weights=weights,
@@ -359,7 +378,7 @@ class Block(nn.Module):
         max_s,
     ):
         hidden_states, residual = self.ln_1(hidden_states, residual)
-        hidden_states = self.attn(
+        hidden_states = self.self_attn(
             hidden_states,
             cu_seqlen_prefill,
             kv_cache,
@@ -377,25 +396,26 @@ class Block(nn.Module):
 
 
 class FlashSantacoderModel(nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__()
         self.config = config
 
         self.process_group = weights.process_group
         self.wte = TensorParallelEmbedding(
-            prefix="transformer.wte",
+            prefix=f"{prefix}.wte",
             weights=weights,
             reduce=False,
         )
         self.wpe = TensorParallelEmbedding(
-            prefix="transformer.wpe",
+            prefix=f"{prefix}.wpe",
             weights=weights,
             reduce=False,
         )
 
-        self.h = nn.ModuleList(
+        self.layers = nn.ModuleList(
             [
                 Block(
+                    prefix,
                     layer_id,
                     config,
                     weights,
@@ -407,8 +427,8 @@ class FlashSantacoderModel(nn.Module):
-            prefix="transformer.ln_f", weights=weights, eps=config.layer_norm_epsilon
+            prefix=f"{prefix}.ln_f", weights=weights, eps=config.layer_norm_epsilon
         )
 
-        self.head_size = self.h[0].attn.head_size
-        self.num_heads = self.h[0].attn.num_heads
+        self.head_size = self.layers[0].self_attn.head_size
+        self.num_heads = self.layers[0].self_attn.num_heads
 
     def forward(
         self,
@@ -427,7 +447,7 @@ class FlashSantacoderModel(nn.Module):
             torch.distributed.all_reduce(hidden_states, group=self.process_group)
 
         residual = None
-        for i, layer in enumerate(self.h):
+        for i, layer in enumerate(self.layers):
             hidden_states, residual = layer(
                 hidden_states,
                 residual,
@@ -445,11 +465,18 @@ class FlashSantacoderModel(nn.Module):
 
 
 class FlashSantacoderForCausalLM(nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights):
         super().__init__()
-        self.transformer = FlashSantacoderModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
-            config, prefix="transformer.wte", weights=weights
+
+        # Nest "transformer" below the caller's prefix when one is given.
+        prefix = f"{prefix}.transformer" if prefix else "transformer"
+
+        # `transpose` is read back by the weight loaders (see `_load_multi_mqa_gptq`).
+        config.transpose = config.architectures[0].startswith("GPT2")
+        self.model = FlashSantacoderModel(prefix, config, weights)
+        # The head is loaded from the same `wte` prefix as the token embedding.
+        self.lm_head = SpeculativeHead.load(
+            config, prefix=f"{prefix}.wte", weights=weights
         )
 
     def forward(
@@ -462,9 +489,11 @@ class FlashSantacoderForCausalLM(nn.Module):
         slots: torch.Tensor,
         input_lengths: torch.Tensor,
         max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
         lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        hidden_states = self.transformer(
+        hidden_states = self.model(
             input_ids,
             position_ids,
             cu_seqlen_prefill,
diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
new file mode 100644
index 00000000..2b346283
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
@@ -0,0 +1,559 @@
+# coding=utf-8
+# Copyright 2024 Starcoder2 AI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.distributed
+
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.configuration_utils import PretrainedConfig
+from typing import Optional, List, Tuple
+
+from text_generation_server.layers.attention import (
+    paged_attention,
+    attention,
+    reshape_and_cache,
+)
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    get_linear,
+)
+from text_generation_server.layers.layernorm import (
+    FastLayerNorm,
+    FastRMSNorm,
+)
+from text_generation_server.layers.rotary import (
+    PositionRotaryEmbedding,
+)
+
+
+class Starcoder2Config(PretrainedConfig):  # Hyper-parameters describing a Starcoder2 checkpoint.
+    model_type = "starcoder2"
+
+    def __init__(
+        self,
+        vocab_size=49152,
+        hidden_size=3072,
+        intermediate_size=12288,
+        num_hidden_layers=30,
+        num_attention_heads=24,
+        num_key_value_heads=2,
+        mlp_type="default",  # used as a key into STARCODER2_MLP_CLASSES
+        hidden_act="gelu_pytorch_tanh",
+        max_position_embeddings=4096,
+        initializer_range=0.018042,
+        norm_type="layer_norm",  # used as a key into STARCODER2_NORMALIZATION_CLASSES
+        norm_epsilon=1e-5,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        rope_theta=10000.0,
+        sliding_window=None,  # None is mapped to max_past = -1 in Starcoder2Attention
+        attention_dropout=0.0,
+        residual_dropout=0.0,
+        embedding_dropout=0.0,
+        use_bias: bool = True,  # forwarded as `bias=` when the attention/MLP linears are loaded
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.use_bias = use_bias
+
+        # Backward compatibility: a missing (None) KV-head count means one KV head per attention head.
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_type = mlp_type
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.norm_type = norm_type
+        self.norm_epsilon = norm_epsilon
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.embedding_dropout = embedding_dropout
+
+        super().__init__(  # token ids and any remaining kwargs are handed to PretrainedConfig
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+
+def load_attention(config, prefix, weights):
+    # Differing query / key-value head counts are delegated to the dedicated GQA loader.
+    if config.num_attention_heads != config.num_key_value_heads:
+        return _load_gqa(config, prefix, weights)
+    return TensorParallelColumnLinear.load_multi(
+        config,
+        prefixes=[f"{prefix}.{proj}" for proj in ("q_proj", "k_proj", "v_proj")],
+        dim=0,
+        weights=weights,
+        bias=config.use_bias,
+    )
+
+
+def _load_gqa(config, prefix: str, weights):  # Load the fused, sharded q/k/v projection for GQA.
+    assert config.hidden_size % config.num_attention_heads == 0
+    assert config.num_attention_heads % weights.process_group.size() == 0
+
+    weight = weights.get_multi_weights_col(
+        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+        quantize=config.quantize,
+        dim=0,
+    )
+
+    if config.quantize not in ["gptq", "awq", "marlin"]:  # shape check is skipped for these schemes
+        weight = weight.to(dtype=weights.dtype).to(device=weights.device)
+        head_size = config.hidden_size // config.num_attention_heads
+        num_heads = config.num_attention_heads // weights.process_group.size()
+        num_key_value_heads = config.num_key_value_heads // weights.process_group.size()
+        # Both sides of the check (and the message) use the per-shard head counts computed above.
+        expected_shape = [
+            (num_heads + 2 * num_key_value_heads) * head_size,
+            config.hidden_size,
+        ]
+        assert list(weight.shape) == expected_shape, f"{list(weight.shape)} != {expected_shape}"
+
+    if config.use_bias:
+        w = [
+            weights.get_sharded(f"{p}.bias", dim=0)
+            for p in [f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
+        ]
+        bias = torch.cat(w, dim=0).to(dtype=weights.dtype).to(device=weights.device)
+    else:
+        bias = None
+
+    return TensorParallelColumnLinear(
+        get_linear(weight, bias=bias, quantize=config.quantize)
+    )
+
+
+class Starcoder2Attention(torch.nn.Module):  # Tensor-parallel rotary attention over a paged KV cache.
+    def __init__(
+        self,
+        prefix: str,
+        config,
+        weights,
+    ):
+        super().__init__()
+        self.max_past = (  # -1 is used when no sliding window is configured
+            config.sliding_window if config.sliding_window is not None else -1
+        )
+        self.num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.num_heads
+
+        self.rotary_emb = PositionRotaryEmbedding.static(
+            config=config,
+            dim=self.head_size,
+            base=config.rope_theta,
+            device=weights.device,
+        )
+
+        self.softmax_scale = self.head_size**-0.5
+
+        if self.num_heads % weights.process_group.size() != 0:
+            raise ValueError(
+                f"`num_heads` must be divisible by `num_shards` (got `num_heads`: {self.num_heads} "
+                f"and `num_shards`: {weights.process_group.size()})"
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()  # heads held by this shard
+        self.num_key_value_heads = (
+            config.num_key_value_heads // weights.process_group.size()
+        )
+
+        self.query_key_value = load_attention(config, prefix, weights)
+
+        self.o_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.o_proj",
+            weights=weights,
+            bias=config.use_bias,
+        )
+        self.num_groups = self.num_heads // self.num_key_value_heads
+        self.kv_head_mapping = torch.arange(  # KV head index for each local query head
+            0, self.num_key_value_heads, dtype=torch.int32, device=weights.device
+        ).repeat_interleave(self.num_groups)
+
+    def forward(
+        self,
+        hidden_states,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+        prefill_cache_indices,
+    ):
+        qkv = self.query_key_value(hidden_states)
+        query, kv = qkv.split(  # columns: [local query heads | local key + value heads]
+            [
+                self.head_size * self.num_heads,
+                2 * self.head_size * self.num_key_value_heads,
+            ],
+            dim=1,
+        )
+        query = query.view(-1, self.num_heads, self.head_size)
+        kv = kv.view(-1, 2, self.num_key_value_heads, self.head_size)  # dim 1: 0 -> key, 1 -> value below
+
+        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)  # result unused; presumably in place - TODO confirm
+
+        if prefill_cache_indices is not None:
+            kv_to_cache = kv[prefill_cache_indices]  # only the selected rows are written to the cache
+        else:
+            kv_to_cache = kv
+
+        reshape_and_cache(
+            kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
+        )
+
+        # output tensor
+        attn_output = torch.empty_like(query)
+
+        # Prefill
+        if cu_seqlen_prefill is not None:
+            # flash attention
+            attention(
+                query,
+                torch.select(kv, dim=1, index=0),
+                torch.select(kv, dim=1, index=1),
+                attn_output,
+                cu_seqlen_prefill,
+                max_s,
+                self.softmax_scale,
+                window_size_left=self.max_past,
+            )
+        # Decode
+        else:
+            attn_output = paged_attention(
+                attn_output,
+                query,
+                kv_cache[0],
+                kv_cache[1],
+                self.kv_head_mapping,
+                self.softmax_scale,
+                block_tables,
+                input_lengths,
+                max_s,
+            )
+
+        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
+
+
+class Starcoder2MLP(nn.Module):  # Ungated feed-forward block: c_proj(act(c_fc(x))).
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        act = config.hidden_act
+        self.act = (  # names containing "gelu" use torch's gelu; anything else is looked up in ACT2FN
+            ACT2FN[act]
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        )
+        # Ungated MLP: one column-parallel input projection (c_fc) and one row-parallel output projection (c_proj)
+        self.c_fc = TensorParallelColumnLinear.load(
+            config,
+            prefix=f"{prefix}.c_fc",
+            weights=weights,
+            bias=config.use_bias,
+        )
+        self.c_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.c_proj",
+            weights=weights,
+            bias=config.use_bias,
+        )
+
+    def forward(self, hidden_states):
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        return self.c_proj(hidden_states)
+
+
+class Starcoder2GatedMLP(nn.Module):  # Gated feed-forward block: down_proj(act(gate) * up).
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        act = config.hidden_act
+        self.act = (  # names containing "gelu" use torch's gelu; anything else is looked up in ACT2FN
+            ACT2FN[act]
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        )
+        # Fuse gate and up proj
+        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+            weights=weights,
+            dim=0,
+            bias=config.use_bias,
+        )
+        self.down_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.down_proj",
+            weights=weights,
+            bias=config.use_bias,
+        )
+        self.intermediate_size = (  # per-shard width, used to un-fuse gate/up in forward
+            config.intermediate_size // weights.process_group.size()
+        )
+
+    def forward(self, hidden_states):
+        gate_up_states = self.gate_up_proj(hidden_states)
+        gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)  # dim 1: 0 -> gate, 1 -> up
+        return self.down_proj(self.act(gate_up_states[:, 0]) * gate_up_states[:, 1])
+
+
+STARCODER2_NORMALIZATION_CLASSES = {  # looked up with config.norm_type
+    "layer_norm": FastLayerNorm,
+    "rms_norm": FastRMSNorm,
+}
+
+STARCODER2_MLP_CLASSES = {  # looked up with config.mlp_type
+    "default": Starcoder2MLP,
+    "gated": Starcoder2GatedMLP,
+}
+
+
+class Starcoder2Layer(nn.Module):  # One decoder layer: norm -> self-attention -> norm -> MLP.
+    def __init__(self, layer_id, config, weights):
+        super().__init__()
+        prefix = f"model.layers.{layer_id}"  # NOTE(review): hard-coded; the prefix given to Starcoder2Model is not used here
+        self.self_attn = Starcoder2Attention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+
+        self.mlp = STARCODER2_MLP_CLASSES[config.mlp_type](
+            prefix=f"{prefix}.mlp", config=config, weights=weights
+        )
+
+        self.input_layernorm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load(
+            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.norm_epsilon
+        )
+        self.post_attention_layernorm = STARCODER2_NORMALIZATION_CLASSES[
+            config.norm_type
+        ].load(
+            prefix=f"{prefix}.post_attention_layernorm",
+            weights=weights,
+            eps=config.norm_epsilon,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        residual,
+        cos,
+        sin,
+        cu_seqlen_prefill,
+        kv_cache,
+        block_tables,
+        slots,
+        input_lengths,
+        max_s,
+        prefill_cache_indices,
+    ):
+        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)  # norm returns (normed, residual)
+
+        # Self Attention
+        attn_output = self.self_attn(
+            normed_hidden_states,
+            cos,
+            sin,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            prefill_cache_indices,
+        )
+
+        # Post-attention norm (layer norm or RMS norm, per config.norm_type); also returns the residual
+        normed_attn_res_output, attn_res = self.post_attention_layernorm(
+            attn_output, res
+        )
+
+        mlp_output = self.mlp(normed_attn_res_output)
+
+        return mlp_output, attn_res  # the MLP output and residual are returned separately, not summed here
+
+
+class Starcoder2Model(torch.nn.Module):  # Token embedding, stack of Starcoder2Layer, and final norm.
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+
+        process_group = weights.process_group
+        self.tp_rank = process_group.rank()
+        self.tp_world_size = process_group.size()
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.embed_tokens", weights=weights
+        )
+        self.layers = nn.ModuleList(  # NOTE(review): layers do not receive `prefix`; see Starcoder2Layer.__init__
+            [
+                Starcoder2Layer(
+                    layer_id,
+                    config,
+                    weights,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = STARCODER2_NORMALIZATION_CLASSES[config.norm_type].load(
+            prefix=f"{prefix}.norm", weights=weights, eps=config.norm_epsilon
+        )
+
+        self.gradient_checkpointing = False
+
+        self.head_size = self.layers[0].self_attn.head_size  # attention geometry mirrored from the first layer
+        self.num_heads = self.layers[0].self_attn.num_heads
+        self.num_key_value_heads = self.layers[0].self_attn.num_key_value_heads
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        true_max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+
+        # Get rotary cos and sin for this forward
+        # Computed once here so that each layer does not have to index them again
+        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
+            position_ids, true_max_s, hidden_states.dtype
+        )
+
+        residual = None  # the first layer's input norm is called with residual=None
+        for i, layer in enumerate(self.layers):
+            hidden_states, residual = layer(
+                hidden_states,
+                residual,
+                cos,
+                sin,
+                cu_seqlen_prefill,
+                kv_cache[i],
+                block_tables,
+                slots,
+                input_lengths,
+                max_s,
+                prefill_cache_indices,
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)  # the residual returned by the final norm is discarded
+
+        return hidden_states
+
+
+class FlashStarcoder2ForCausalLM(torch.nn.Module):  # Starcoder2Model plus an LM head that produces logits.
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+
+        if not prefix:
+            prefix = "model"
+        else:
+            prefix = f"{prefix}.model"
+
+        self.model = Starcoder2Model(prefix, config, weights)
+        try:
+            self.lm_head = SpeculativeHead.load(
+                config,
+                prefix="lm_head",  # NOTE(review): not derived from the `prefix` argument
+                weights=weights,
+            )
+        except RuntimeError:  # fall back to the input embedding weights when "lm_head" cannot be loaded
+            self.lm_head = SpeculativeHead.load(
+                config,
+                prefix=f"{prefix}.embed_tokens",
+                weights=weights,
+            )
+
+        self.max_past = config.sliding_window
+        self.max_past_tensor = (  # tensor copy of the window size, used for clamping in forward
+            torch.tensor(config.sliding_window, device=weights.device)
+            if self.max_past is not None
+            else None
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        lm_head_indices: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,  # accepted but not used in this method
+    ) -> torch.Tensor:
+        true_max_s = max_s  # NOTE(review): max_s is not modified below, so both values passed on are equal
+        if prefill_cache_indices is not None:
+            # Slots also need to be sliced as it has the same size as the whole kv tensor
+            slots = slots[prefill_cache_indices]
+        elif self.max_past is not None:
+            # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
+            # kernel requires the true values
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
+
+        hidden_states = self.model(
+            input_ids,
+            position_ids,
+            cu_seqlen_prefill,
+            kv_cache,
+            block_tables,
+            slots,
+            input_lengths,
+            max_s,
+            true_max_s,
+            prefill_cache_indices,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]  # logits are computed only for the selected rows
+        logits = self.lm_head(hidden_states)
+        return logits
diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py
new file mode 100644
index 00000000..a83bc1c6
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@@ -0,0 +1,830 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Idefics2 model."""
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+import math
+
+from transformers.activations import ACT2FN
+from transformers.image_processing_utils import select_best_resolution
+from text_generation_server.models.custom_modeling.vlm import (
+    load_text_model,
+    load_vision_model,
+)
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+
+from text_generation_server.layers import (
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    TensorParallelRowLinear,
+)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along dim 1, like torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_attention_heads, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        # Nothing to repeat: the input tensor itself is returned.
+        return hidden_states
+    # Add a repeat axis after the head axis, broadcast it to n_rep, then fold it into the head axis.
+    expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+    return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+
+
+class Idefics2VisionEmbeddings(nn.Module):
+    """
+    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings` to enable images of variable
+    resolution.
+
+    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
+    which allows treating images in their native aspect ratio and without the need to resize them to the same
+    fixed size. In particular, we start from the original pre-trained SigLIP model
+    (which uses fixed-size square images) and adapt it by training on images of variable resolutions.
+    """
+
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(  # kernel_size == stride == patch_size
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+        self.patch_embedding.weight = nn.Parameter(  # replace the conv parameters with the checkpoint tensors
+            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
+        )
+        self.patch_embedding.bias = nn.Parameter(
+            weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False
+        )
+
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = TensorParallelEmbedding(
+            prefix=f"{prefix}.position_embedding", weights=weights
+        )
+
+    def forward(
+        self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor
+    ) -> torch.Tensor:
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+        max_nb_patches_h, max_nb_patches_w = (
+            max_im_h // self.patch_size,
+            max_im_w // self.patch_size,
+        )
+        boundaries = torch.arange(  # bucket edges at multiples of 1 / num_patches_per_side
+            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
+        )
+        position_ids = torch.full(  # created without a device argument; moved to the embedding's device below
+            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
+        )
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            nb_patches_h = p_attn_mask[:, 0].sum()  # set entries in the first column
+            nb_patches_w = p_attn_mask[0].sum()  # set entries in the first row
+
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+
+            bucket_coords_h = torch.bucketize(
+                fractional_coords_h, boundaries, right=True
+            )
+            bucket_coords_w = torch.bucketize(
+                fractional_coords_w, boundaries, right=True
+            )
+
+            pos_ids = (  # row bucket * num_patches_per_side + column bucket, flattened
+                bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
+            ).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids  # positions outside the mask keep id 0
+
+        position_ids = position_ids.to(self.position_embedding.weight.device)
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+
+
+class Idefics2VisionAttention(nn.Module):  # Non-causal multi-head self-attention with a fused q/k/v projection.
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_size = self.embed_dim // self.num_heads
+        if self.head_size * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_size**-0.5
+        self.dropout = config.attention_dropout
+
+        self.num_heads = self.num_heads // weights.process_group.size()  # from here on: per-shard head count
+        self.embed_dim = self.embed_dim // weights.process_group.size()  # per-shard width, used in the final reshape
+
+        self.qkv = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            dim=0,
+            weights=weights,
+            bias=True,
+        )
+        self.out_proj = TensorParallelRowLinear.load(
+            config=config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
+        )
+        self.is_causal = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size, q_len, _ = hidden_states.size()
+
+        qkv = self.qkv(hidden_states)
+        query_states, key_states, value_states = qkv.split(  # three equal chunks along the last dim
+            [
+                self.head_size * self.num_heads,
+                self.head_size * self.num_heads,
+                self.head_size * self.num_heads,
+            ],
+            dim=2,
+        )
+
+        query_states = query_states.view(  # -> (batch_size, num_heads, q_len, head_size) after the transpose
+            batch_size, q_len, self.num_heads, self.head_size
+        ).transpose(1, 2)
+        key_states = key_states.view(
+            batch_size, q_len, self.num_heads, self.head_size
+        ).transpose(1, 2)
+        value_states = value_states.view(
+            batch_size, q_len, self.num_heads, self.head_size
+        ).transpose(1, 2)
+
+        k_v_seq_len = key_states.shape[-2]
+        attn_weights = (
+            torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
+        )
+
+        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask  # the mask is added to the scores, not multiplied
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(
+            attn_weights, p=self.dropout, training=self.training
+        )
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_size):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_size)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output
+
+
+class Idefics2VisionMLP(nn.Module):  # Feed-forward block: fc1 -> activation -> fc2.
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        # Both projections are loaded with the same config/weights and with bias enabled.
+        load_args = dict(config=config, weights=weights, bias=True)
+        self.fc1 = TensorParallelColumnLinear.load(
+            prefix=f"{prefix}.fc1", **load_args
+        )
+        self.fc2 = TensorParallelRowLinear.load(prefix=f"{prefix}.fc2", **load_args)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply fc1, the configured activation and fc2, in that order."""
+        projected = self.fc1(hidden_states)
+        activated = self.activation_fn(projected)
+        return self.fc2(activated)
+
+
+class Idefics2EncoderLayer(nn.Module):  # Pre-norm block: x + attn(norm1(x)), then x + mlp(norm2(x)).
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = Idefics2VisionAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+        self.layer_norm1 = nn.LayerNorm.load(  # NOTE(review): `.load` is not stock torch; presumably patched in elsewhere - confirm
+            prefix=f"{prefix}.layer_norm1", eps=config.layer_norm_eps, weights=weights
+        )
+        self.layer_norm2 = nn.LayerNorm.load(
+            prefix=f"{prefix}.layer_norm2", eps=config.layer_norm_eps, weights=weights
+        )
+        self.mlp = Idefics2VisionMLP(
+            prefix=f"{prefix}.mlp", config=config, weights=weights
+        )
+
+    # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        residual = hidden_states  # saved for the skip connection around attention
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states  # saved for the skip connection around the MLP
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class Idefics2Encoder(nn.Module):  # Sequential stack of Idefics2EncoderLayer blocks.
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.config = config
+        encoder_layers = [
+            Idefics2EncoderLayer(
+                prefix=f"{prefix}.layers.{idx}", config=config, weights=weights
+            )
+            for idx in range(config.num_hidden_layers)
+        ]
+        self.layers = nn.ModuleList(encoder_layers)
+
+    # Ignore copy
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        Run `inputs_embeds` through every layer in order and return the final hidden states.
+        The same `attention_mask` (possibly None) is handed to each layer.
+        """
+        states = inputs_embeds
+        for layer in self.layers:
+            states = layer(states, attention_mask)
+        return states
+
+
+class Idefics2VisionTransformer(nn.Module):  # Patch embeddings -> encoder stack -> final layer norm.
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.config = config
+        self.embeddings = Idefics2VisionEmbeddings(
+            prefix=f"{prefix}.embeddings", config=config, weights=weights
+        )
+        self.encoder = Idefics2Encoder(
+            prefix=f"{prefix}.encoder", config=config, weights=weights
+        )
+        self.post_layernorm = nn.LayerNorm.load(  # NOTE(review): `.load` is not stock torch; presumably patched in elsewhere - confirm
+            prefix=f"{prefix}.post_layernorm",
+            weights=weights,
+            eps=config.layer_norm_eps,
+        )
+
+    def forward(
+        self,
+        pixel_values,
+        patch_attention_mask: Optional[torch.BoolTensor] = None,
+    ):
+        batch_size = pixel_values.size(0)
+        if patch_attention_mask is None:  # default: every patch is attended to
+            patch_size = self.config.patch_size
+            patch_attention_mask = torch.ones(
+                (
+                    batch_size,
+                    pixel_values.size(2) // patch_size,
+                    pixel_values.size(3) // patch_size,
+                )
+            )
+            patch_attention_mask = patch_attention_mask.to(
+                dtype=torch.bool, device=pixel_values.device
+            )
+
+        hidden_states = self.embeddings(
+            pixel_values=pixel_values, patch_attention_mask=patch_attention_mask
+        )
+
+        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
+        # When every patch is attended to, drop the mask entirely (None): the encoder then receives no
+        # mask, which is equivalent to attending to the full sequence. (The upstream rationale was avoiding
+        # `_upad_input` in `_flash_attention_forward`, which is not called anywhere in this class.)
+        if not torch.any(~patch_attention_mask):
+            patch_attention_mask = None
+        else:
+            patch_attention_mask = _prepare_4d_attention_mask(
+                patch_attention_mask, hidden_states.dtype
+            )
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=patch_attention_mask,
+        )
+
+        last_hidden_state = encoder_outputs
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        return last_hidden_state
+
+
+class Idefics2MLP(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        act = config.text_config.hidden_act
+        self.act = (
+            ACT2FN[act]
+            if "gelu" not in act
+            else lambda x: torch.nn.functional.gelu(
+                x,
+                approximate=(
+                    "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+                ),
+            )
+        )
+        self.gate_up_proj = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.gate_proj", f"{prefix}.up_proj"],
+            weights=weights,
+            dim=0,
+            bias=False,
+        )
+        self.down_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.down_proj",
+            weights=weights,
+            bias=False,
+        )
+
+    def forward(self, hidden_states):
+        start_shape = hidden_states.shape[:-1]
+        gate_up_states = self.gate_up_proj(hidden_states)
+        intermediate_size = gate_up_states.shape[-1] // 2
+        gate_up_states = gate_up_states.view(-1, 2, intermediate_size)
+        return self.down_proj(
+            self.act(gate_up_states[:, 0]) * gate_up_states[:, 1]
+        ).view(*start_shape, -1)
+
+
+class Idefics2RMSNorm(nn.Module):
+    def __init__(self, prefix, weights, eps):
+        """
+        Idefics2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(
+            weights.get_tensor(f"{prefix}.weight"), requires_grad=False
+        )
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+class Idefics2PerceiverAttention(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+
+        self.layer_idx = None
+        self.hidden_size = config.text_config.hidden_size
+        self.num_heads = config.perceiver_config.resampler_n_heads
+        self.head_size = config.perceiver_config.resampler_head_dim
+        self.num_key_value_heads = config.perceiver_config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.attention_dropout = config.perceiver_config.attention_dropout
+        self.num_heads = self.num_heads // weights.process_group.size()
+        self.num_key_value_heads = (
+            self.num_key_value_heads // weights.process_group.size()
+        )
+
+        self.q_proj = TensorParallelColumnLinear.load(
+            config,
+            prefix=f"{prefix}.q_proj",
+            weights=weights,
+            bias=False,
+        )
+        self.kv = TensorParallelColumnLinear.load_multi(
+            config,
+            prefixes=[f"{prefix}.k_proj", f"{prefix}.v_proj"],
+            dim=0,
+            weights=weights,
+            bias=False,
+        )
+        self.o_proj = TensorParallelRowLinear.load(
+            config=config, prefix=f"{prefix}.o_proj", weights=weights, bias=False
+        )
+
+        self.is_causal = False
+
+    def forward(
+        self,
+        latents: torch.Tensor,
+        context: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        bsz, q_len, _ = latents.size()
+        kv_seq_len = q_len + context.size()[1]
+
+        hidden_states = torch.concat([context, latents], dim=-2)
+        query_states = self.q_proj(latents)
+        kv = self.kv(hidden_states)
+        key_states, value_states = kv.split(
+            [
+                self.head_size * self.num_key_value_heads,
+                self.head_size * self.num_key_value_heads,
+            ],
+            dim=2,
+        )
+
+        query_states = query_states.view(
+            bsz, q_len, self.num_heads, self.head_size
+        ).transpose(1, 2)
+        key_states = key_states.view(
+            bsz, kv_seq_len, self.num_key_value_heads, self.head_size
+        ).transpose(1, 2)
+        value_states = value_states.view(
+            bsz, kv_seq_len, self.num_key_value_heads, self.head_size
+        ).transpose(1, 2)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(
+            query_states, key_states.transpose(2, 3)
+        ) / math.sqrt(self.head_size)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_size):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_size)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output
+
+
+class Idefics2PerceiverLayer(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.hidden_size = config.text_config.hidden_size
+        self.n_latents = config.perceiver_config.resampler_n_latents
+        self.depth = config.perceiver_config.resampler_depth
+        self.rms_norm_eps = config.text_config.rms_norm_eps
+
+        self.input_latents_norm = Idefics2RMSNorm(
+            prefix=f"{prefix}.input_latents_norm",
+            weights=weights,
+            eps=self.rms_norm_eps,
+        )
+        self.input_context_norm = Idefics2RMSNorm(
+            prefix=f"{prefix}.input_context_norm",
+            weights=weights,
+            eps=self.rms_norm_eps,
+        )
+        self.self_attn = Idefics2PerceiverAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+        self.post_attention_layernorm = Idefics2RMSNorm(
+            prefix=f"{prefix}.post_attention_layernorm",
+            weights=weights,
+            eps=self.rms_norm_eps,
+        )
+        self.mlp = Idefics2MLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+
+    def forward(
+        self,
+        latents: torch.Tensor,
+        context: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        Args:
+            latents (`torch.FloatTensor`): latent queries of shape `(batch, n_latents, embed_dim)`
+            context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): additive 4D mask of size
+                `(batch, 1, n_latents, seq_len + n_latents)`; it is added to the attention scores.
+        """
+        residual = latents
+
+        latents = self.input_latents_norm(latents)
+        context = self.input_context_norm(context)
+
+        latents = self.self_attn(
+            latents=latents,
+            context=context,
+            attention_mask=attention_mask,
+        )
+        latents = residual + latents
+        residual = latents
+
+        latents = self.post_attention_layernorm(latents)
+        latents = self.mlp(latents)
+        latents = residual + latents
+
+        return latents
+
+
+class Idefics2PerceiverResampler(nn.Module):
+    def __init__(self, prefix, config, weights) -> None:
+        super().__init__()
+        self.hidden_size = config.text_config.hidden_size
+        self.hidden_act = config.perceiver_config.hidden_act
+        self.n_latents = config.perceiver_config.resampler_n_latents
+        self.depth = config.perceiver_config.resampler_depth
+        self.rms_norm_eps = config.text_config.rms_norm_eps
+
+        # Create Latents for Perceiver
+        self.latents = weights.get_tensor(f"{prefix}.latents")
+
+        # Create Transformer Blocks
+        self.layers = nn.ModuleList(
+            [
+                Idefics2PerceiverLayer(
+                    prefix=f"{prefix}.layers.{idx}", config=config, weights=weights
+                )
+                for idx in range(self.depth)
+            ]
+        )
+        self.norm = Idefics2RMSNorm(
+            prefix=f"{prefix}.norm",
+            weights=weights,
+            eps=config.text_config.rms_norm_eps,
+        )
+
+    def forward(
+        self,
+        context: torch.Tensor,
+        attention_mask,
+    ) -> torch.Tensor:
+        # seq embed -> bsz seq embed
+        latents = self.latents.unsqueeze(0).expand(
+            (context.shape[0], *self.latents.size())
+        )
+
+        latent_attention_mask = torch.ones(
+            (attention_mask.size(0), latents.size(1)),
+            dtype=attention_mask.dtype,
+            device=attention_mask.device,
+        )
+        attention_mask = torch.cat([attention_mask, latent_attention_mask], dim=-1)
+        attention_mask = _prepare_4d_attention_mask(
+            attention_mask, latents.dtype, tgt_len=self.n_latents
+        )
+
+        compressed_context = latents
+        for perceiver_layer in self.layers:
+            compressed_context = perceiver_layer(
+                compressed_context,
+                context,
+                attention_mask=attention_mask,
+            )
+        compressed_context = self.norm(compressed_context)
+
+        return compressed_context
+
+
+class Idefics2Connector(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.modality_projection = Idefics2MLP(
+            prefix=f"{prefix}.modality_projection", config=config, weights=weights
+        )
+        self.perceiver_resampler = Idefics2PerceiverResampler(
+            prefix=f"{prefix}.perceiver_resampler", config=config, weights=weights
+        )
+
+    def forward(self, image_hidden_states, attention_mask):
+        image_hidden_states = self.modality_projection(image_hidden_states)
+        image_hidden_states = self.perceiver_resampler(
+            context=image_hidden_states, attention_mask=attention_mask
+        )
+        return image_hidden_states
+
+
+class Idefics2ForConditionalGeneration(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        config.vision_config.quantize = config.quantize
+        config.vision_config.speculator = config.speculator
+        config.text_config.quantize = config.quantize
+        config.text_config.speculator = config.speculator
+
+        vision_config = config.vision_config
+        self.text_model = load_text_model(
+            prefix="model" if not prefix else f"{prefix}.model",
+            config=config.text_config,
+            weights=weights,
+            name="text_model",
+        )
+        self.dtype = weights.dtype
+        self.vision_model = Idefics2VisionTransformer(
+            prefix=f"{prefix}.model.vision_model" if prefix else "model.vision_model",
+            config=vision_config,
+            weights=weights,
+        )
+        self.connector = Idefics2Connector(
+            prefix=f"{prefix}.model.connector" if prefix else "model.connector",
+            config=config,
+            weights=weights,
+        )
+        self.config = config
+        self.image_seq_len = config.perceiver_config.resampler_n_latents
+        self.image_token_id = config.image_token_id
+        self.pad_token_id = (
+            config.pad_token_id if config.pad_token_id is not None else -1
+        )
+
+    def _merge_input_ids_with_image_features(
+        self,
+        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        image_features: torch.Tensor,
+    ):
+        """In place merges in vision_embeddings with inputs_embeds."""
+        # mask = input_ids == self.config.image_token_index
+        mask = input_ids == self.config.image_token_id
+        # Let's pray we have enabled enough slots !
+        inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        lm_head_indices: Optional[torch.Tensor] = None,
+        pixel_values: torch.FloatTensor = None,
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        # Unused here
+        image_sizes: Optional[torch.Tensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
+    ):
+        inputs_embeds = self.text_model.embed_tokens(input_ids)
+        if pixel_values is not None:
+            batch_size, num_images, num_channels, height, width = pixel_values.shape
+            all_states = []
+            all_pixel_values = pixel_values
+            all_pixel_mask = pixel_attention_mask
+            for i in range(batch_size):
+                # Slice first so only this sample is cast to the model dtype (fp16 compatibility)
+                pixel_values = all_pixel_values[i : i + 1].to(
+                    dtype=self.dtype
+                )
+                pixel_values = pixel_values.view(num_images, *pixel_values.shape[2:])
+
+                # Remove padding images - padding images are full 0.
+                nb_values_per_image = pixel_values.shape[1:].numel()
+                real_images_inds = (pixel_values == 0.0).sum(
+                    dim=(-1, -2, -3)
+                ) != nb_values_per_image
+                pixel_values = pixel_values[real_images_inds].contiguous()
+
+                # Vision attention mask: branch on the caller's mask (the local name is rebound below)
+                if all_pixel_mask is None:
+                    pixel_attention_mask = torch.ones(
+                        size=(
+                            pixel_values.size(0),
+                            pixel_values.size(2),
+                            pixel_values.size(3),
+                        ),
+                        dtype=torch.bool,
+                        device=pixel_values.device,
+                    )
+                else:
+                    # Remove padding images from the mask
+                    pixel_attention_mask = all_pixel_mask[i : i + 1]
+                    pixel_attention_mask = pixel_attention_mask.view(
+                        1 * num_images, *pixel_attention_mask.shape[2:]
+                    )
+                    pixel_attention_mask = pixel_attention_mask[
+                        real_images_inds
+                    ].contiguous()
+
+                patch_size = self.config.vision_config.patch_size
+                patches_subgrid = pixel_attention_mask.unfold(
+                    dimension=1, size=patch_size, step=patch_size
+                )
+                patches_subgrid = patches_subgrid.unfold(
+                    dimension=2, size=patch_size, step=patch_size
+                )
+                patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+
+                # Get sequence from the vision encoder
+                image_hidden_states = self.vision_model(
+                    pixel_values=pixel_values,
+                    patch_attention_mask=patch_attention_mask,
+                )
+
+                # Modality projection & resampling
+                image_hidden_states = self.connector(
+                    image_hidden_states,
+                    attention_mask=patch_attention_mask.view(pixel_values.size(0), -1),
+                )
+                all_states.append(image_hidden_states)
+            # Concatenate (not stack): samples may keep different numbers of real images.
+            image_hidden_states = torch.cat(all_states, dim=0)
+            # When generating, image_token_id tokens we produced must not be replaced by non-existent images
+            inputs_embeds = self._merge_input_ids_with_image_features(
+                input_ids, inputs_embeds, image_hidden_states
+            )
+
+        hidden_states = self.text_model.model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            cu_seqlen_prefill=cu_seqlen_prefill,
+            kv_cache=kv_cache,
+            block_tables=block_tables,
+            slots=slots,
+            input_lengths=input_lengths,
+            max_s=max_s,
+            true_max_s=max_s,
+            prefill_cache_indices=None,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]
+        logits, speculative_logits = self.text_model.lm_head(hidden_states)
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/idefics_config.py b/server/text_generation_server/models/custom_modeling/idefics_config.py
index 34925087..a5565819 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_config.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_config.py
@@ -51,7 +51,7 @@ class IdeficsVisionConfig(PretrainedConfig):
             Number of attention heads for each attention layer in the Transformer encoder.
         image_num_channels (`int`, *optional*, defaults to `3`):
             Number of image channels.
-        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
         layer_norm_eps (`float`, *optional*, defaults to 1e-5):
@@ -66,6 +66,7 @@ class IdeficsVisionConfig(PretrainedConfig):
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
     """
+
     model_type = "idefics"
     attribute_map = {
         "hidden_size": "embed_dim",
@@ -80,7 +81,7 @@ class IdeficsVisionConfig(PretrainedConfig):
         num_hidden_layers=32,
         num_attention_heads=16,
         num_channels=3,
-        hidden_act="quick_gelu",
+        hidden_act="gelu",
         layer_norm_eps=1e-5,
         attention_dropout=0.0,
         initializer_range=0.02,
@@ -125,6 +126,7 @@ class IdeficsPerceiverConfig(PretrainedConfig):
         qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
             Whether or not to use qk layer norms in perceiver
     """
+
     model_type = "idefics"
 
     def __init__(
@@ -219,6 +221,7 @@ class IdeficsConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "idefics"
     is_composition = True
 
diff --git a/server/text_generation_server/models/custom_modeling/idefics_image_processing.py b/server/text_generation_server/models/custom_modeling/idefics_image_processing.py
index aec9a3dc..e323d365 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_image_processing.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_image_processing.py
@@ -20,7 +20,12 @@ import numpy as np
 from PIL import Image
 
 from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
-from transformers.image_transforms import resize, to_channel_dimension_format, rescale, normalize
+from transformers.image_transforms import (
+    resize,
+    to_channel_dimension_format,
+    rescale,
+    normalize,
+)
 from transformers.image_utils import (
     ChannelDimension,
     ImageInput,
@@ -30,6 +35,7 @@ from transformers.image_utils import (
     valid_images,
 )
 from io import BytesIO
+import base64
 import requests
 from transformers import TensorType, is_torch_available
 
@@ -121,7 +127,11 @@ class IdeficsImageProcessor(BaseImageProcessor):
             a PyTorch tensor of the processed images
         """
         image_size = image_size if image_size is not None else self.image_size
-        image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
+        image_num_channels = (
+            image_num_channels
+            if image_num_channels is not None
+            else self.image_num_channels
+        )
         image_mean = image_mean if image_mean is not None else self.image_mean
         image_std = image_std if image_std is not None else self.image_std
         size = (image_size, image_size)
@@ -160,9 +170,13 @@ class IdeficsImageProcessor(BaseImageProcessor):
         images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
         images = [self.rescale(image=image, scale=1 / 255) for image in images]
         images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
-        images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
+        images = [
+            to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images
+        ]
         # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
-        images = BatchFeature(data={"pixel_values": images}, tensor_type=TensorType.PYTORCH)["pixel_values"]
+        images = BatchFeature(
+            data={"pixel_values": images}, tensor_type=TensorType.PYTORCH
+        )["pixel_values"]
 
         return images
 
@@ -181,11 +195,32 @@ class IdeficsImageProcessor(BaseImageProcessor):
         if isinstance(image_url_or_urls, list):
             return [self.fetch_images(x) for x in image_url_or_urls]
         elif isinstance(image_url_or_urls, str):
-            response = requests.get(image_url_or_urls, stream=True, headers=headers)
-            response.raise_for_status()
-            return Image.open(BytesIO(response.content))
+            image = image_url_or_urls
+
+            if image.startswith("http://") or image.startswith("https://"):
+                response = requests.get(
+                    image_url_or_urls, stream=True, headers=headers, timeout=(1, 5)
+                )
+                response.raise_for_status()
+                content = response.content
+            elif image.startswith("data:"):
+                # https://stackoverflow.com/questions/17090571/is-there-a-way-to-set-background-image-as-a-base64-encoded-image
+                # data:image/png;base64,xxx
+                image = image.split(",")[-1]
+                content = base64.b64decode(image)
+            else:
+                raise ValueError(f"Unrecognized image {image}")
+
+            try:
+                image = Image.open(BytesIO(content))
+                # image.verify()
+            except Exception:
+                raise ValueError(f"Could not load image from url {image_url_or_urls}")
+            return image
         else:
-            raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}")
+            raise ValueError(
+                f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}"
+            )
 
     def rescale(
         self,
@@ -255,10 +290,9 @@ class IdeficsImageProcessor(BaseImageProcessor):
             `np.ndarray`: The normalized image.
         """
         # TODO 4.32
-        return normalize(
-            image, mean=mean, std=std, data_format=data_format, **kwargs
-        )
+        return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs)
 
 
 import transformers
+
 transformers.IdeficsImageProcessor = IdeficsImageProcessor
diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
index 8b43ae4d..786ef559 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -28,7 +28,11 @@ from torch.nn import CrossEntropyLoss
 
 from transformers import PreTrainedModel
 from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, dataclass
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    dataclass,
+)
 from transformers.modeling_utils import PretrainedConfig
 from transformers.utils import (
     add_start_docstrings,
@@ -37,22 +41,35 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from text_generation_server.models.custom_modeling.idefics_config import IdeficsConfig
-from text_generation_server.models.custom_modeling.idefics_vision import IdeficsVisionTransformer
-from text_generation_server.models.custom_modeling.idefics_perceiver import IdeficsPerceiverResampler
-from text_generation_server.utils.layers import (
+from text_generation_server.models.custom_modeling.idefics_vision import (
+    IdeficsVisionTransformer,
+)
+from text_generation_server.models.custom_modeling.idefics_perceiver import (
+    IdeficsPerceiverResampler,
+)
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
-    TensorParallelHead,
-    PositionRotaryEmbedding,
+    SpeculativeHead,
     FastLinear,
 )
-import dropout_layer_norm
+from text_generation_server.layers.rotary import PositionRotaryEmbedding
+from text_generation_server.utils.import_utils import SYSTEM
+
+if SYSTEM == "cuda":
+    import dropout_layer_norm
+elif SYSTEM == "rocm":
+    from vllm._C import ops
+else:
+    dropout_layer_norm = None
+
 
 @dataclass
 class BaseModelOutputWithPastImage(BaseModelOutputWithPast):
     image_hidden_states: Optional[torch.FloatTensor] = None
 
+
 @dataclass
 class CausalLMOutputWithPastImage(CausalLMOutputWithPast):
     image_hidden_states: Optional[torch.FloatTensor] = None
@@ -78,26 +95,40 @@ def expand_inputs_for_generation(
     **model_kwargs,
 ):
     expanded_return_idx = (
-        torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
+        torch.arange(input_ids.shape[0])
+        .view(-1, 1)
+        .repeat(1, expand_size)
+        .view(-1)
+        .to(input_ids.device)
     )
     input_ids = input_ids.index_select(0, expanded_return_idx)
 
     if "token_type_ids" in model_kwargs:
         token_type_ids = model_kwargs["token_type_ids"]
-        model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx)
-
-    if attention_mask is not None:
-        model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
-        model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
+        model_kwargs["token_type_ids"] = token_type_ids.index_select(
+            0, expanded_return_idx
+        )
+
+    if attention_mask is not None:
+        model_kwargs["attention_mask"] = attention_mask.index_select(
+            0, expanded_return_idx
+        )
+        model_kwargs["image_attention_mask"] = model_kwargs[
+            "image_attention_mask"
+        ].index_select(0, expanded_return_idx)
+        model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(
             0, expanded_return_idx
         )
-        model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
 
     if is_encoder_decoder:
         if encoder_outputs is None:
-            raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
-        encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select(
-            0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device)
+            raise ValueError(
+                "If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined."
+            )
+        encoder_outputs["last_hidden_state"] = (
+            encoder_outputs.last_hidden_state.index_select(
+                0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device)
+            )
         )
         model_kwargs["encoder_outputs"] = encoder_outputs
     return input_ids, model_kwargs
@@ -120,14 +151,17 @@ def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder
     # update token_type_ids with last value
     if "token_type_ids" in model_kwargs:
         token_type_ids = model_kwargs["token_type_ids"]
-        model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+        model_kwargs["token_type_ids"] = torch.cat(
+            [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1
+        )
 
     # update attention masks
     if not is_encoder_decoder:
         if "attention_mask" in model_kwargs:
             attention_mask = model_kwargs["attention_mask"]
             model_kwargs["attention_mask"] = torch.cat(
-                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))],
+                dim=-1,
             )
         if "image_attention_mask" in model_kwargs:
             image_attention_mask = model_kwargs["image_attention_mask"]
@@ -180,8 +214,12 @@ def freeze_model(model, module_exceptions=[]):
     }
     module_exceptions_mapped = [mapping[m] for m in module_exceptions]
     for module in model.modules():
-        if module_exceptions and any([isinstance(module, t) for t in module_exceptions_mapped]):
-            module.requires_grad_(True)  # Explicitely setting it to true to avoid any mistakes
+        if module_exceptions and any(
+            [isinstance(module, t) for t in module_exceptions_mapped]
+        ):
+            module.requires_grad_(
+                True
+            )  # Explicitly setting it to true to avoid any mistakes
         else:
             module.requires_grad_(False)
     return model
@@ -195,15 +233,21 @@ class IdeficsDecoupledPartialTPEmbedding(nn.Module):
     ):
         super().__init__()
         self.num_embeddings = config.vocab_size
-        self.weight = TensorParallelEmbedding(prefix="model.embed_tokens", weights=weights)
-        self.additional_weight = nn.Parameter(weights.get_tensor(f"model.embed_tokens.additional_embedding.weight"))
+        self.weight = TensorParallelEmbedding(
+            prefix="model.embed_tokens", weights=weights
+        )
+        self.additional_weight = nn.Parameter(
+            weights.get_tensor(f"model.embed_tokens.additional_embedding.weight")
+        )
 
     def forward(self, input_ids):
         # Clone so that we don't modify the original input_ids later on
         input_ids = input_ids.clone()
         additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
         input_ids_additional_vocab = input_ids[additional_vocab_indices]
-        additional_embeddings = torch.nn.functional.embedding(input_ids_additional_vocab - self.num_embeddings, self.additional_weight)
+        additional_embeddings = torch.nn.functional.embedding(
+            input_ids_additional_vocab - self.num_embeddings, self.additional_weight
+        )
 
         # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
         input_ids[additional_vocab_indices] = 0
@@ -230,19 +274,20 @@ class IdeficsDecoupledTensorParallelLinear(nn.Module):
         weights,
     ) -> None:
         super().__init__()
-        self.fc = TensorParallelHead.load(
-            config=config, prefix="lm_head", weights=weights
-        )
+        self.fc = SpeculativeHead.load(config=config, prefix="lm_head", weights=weights)
         self.additional_fc = FastLinear.load(
-            config=config, prefix="lm_head.additional_fc", weights=weights, bias=False,
+            config=config,
+            prefix="lm_head.additional_fc",
+            weights=weights,
+            bias=False,
         )
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
-        output = self.fc(input)
+        output, speculative_logits = self.fc(input)
         additional_features = self.additional_fc(input)
         output = torch.cat((output, additional_features), -1)
 
-        return output
+        return output, speculative_logits
 
     def extra_repr(self) -> str:
         """Overwriting `nn.Linear.extra_repr` to include new parameters."""
@@ -257,7 +302,10 @@ class IdeficsDecoupledTensorParallelLinear(nn.Module):
 
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
-    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+    input_ids_shape: torch.Size,
+    dtype: torch.dtype,
+    device: torch.device,
+    past_key_values_length: int = 0,
 ):
     """
     Make causal mask used for bi-directional self-attention.
@@ -269,8 +317,18 @@ def _make_causal_mask(
     mask = mask.to(dtype)
 
     if past_key_values_length > 0:
-        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
-    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+        mask = torch.cat(
+            [
+                torch.zeros(
+                    tgt_len, past_key_values_length, dtype=dtype, device=device
+                ),
+                mask,
+            ],
+            dim=-1,
+        )
+    return mask[None, None, :, :].expand(
+        bsz, 1, tgt_len, tgt_len + past_key_values_length
+    )
 
 
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
@@ -284,7 +342,9 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
 
     inverted_mask = 1.0 - expanded_mask
 
-    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+    return inverted_mask.masked_fill(
+        inverted_mask.to(torch.bool), torch.finfo(dtype).min
+    )
 
 
 class IdeficsRMSNorm(nn.Module):
@@ -315,7 +375,7 @@ class IdeficsRMSNorm(nn.Module):
                 hidden_states = hidden_states.to(self.weight.dtype)
 
             return self.weight * hidden_states
-        else:
+        elif SYSTEM == "cuda":
             # faster post attention rms norm
             unwrap = False
             if len(hidden_states.shape) > 2:
@@ -346,8 +406,35 @@ class IdeficsRMSNorm(nn.Module):
             if unwrap:
                 normed_hidden_states = normed_hidden_states.view(*shape)
 
-
             return normed_hidden_states
+        elif SYSTEM == "rocm":
+            # We use the vLLM RMSNorm kernel, which can be compiled for ROCm, instead of the Flash Attention ones, which cannot.
+            if residual is not None:
+                hidden_states += residual
+            residual = hidden_states
+
+            unwrap = False
+            if len(hidden_states.shape) > 2:
+                unwrap = True
+                shape = hidden_states.shape
+                hidden_states = hidden_states.reshape(-1, shape[-1])
+
+            out = torch.empty_like(hidden_states)
+            ops.rms_norm(
+                out,
+                hidden_states,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+
+            if unwrap:
+                out = out.view(*shape)
+
+            return out
+        else:
+            raise ValueError(
+                "Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
+            )
 
 
 # this was adapted from LlamaMLP
@@ -367,7 +454,10 @@ class IdeficsMLP(nn.Module):
             bias=False,
         )
         self.down_proj = TensorParallelRowLinear.load(
-            config, prefix=f"{prefix}.down_proj", weights=weights, bias=False,
+            config,
+            prefix=f"{prefix}.down_proj",
+            weights=weights,
+            bias=False,
         )
         self.act_fn = ACT2FN[config.hidden_act]
 
@@ -375,7 +465,9 @@ class IdeficsMLP(nn.Module):
         gate_up_states = self.gate_up_proj(hidden_states)
         shape = gate_up_states.shape
         gate_up_states = gate_up_states.view(*shape[:-1], 2, shape[-1] // 2)
-        return self.down_proj(self.act_fn(gate_up_states[:, :, 0]) * gate_up_states[:, :, 1])
+        return self.down_proj(
+            self.act_fn(gate_up_states[:, :, 0]) * gate_up_states[:, :, 1]
+        )
 
 
 # this was adapted from LlamaAttention
@@ -445,14 +537,22 @@ class IdeficsAttention(nn.Module):
         self.qk_layer_norms = qk_layer_norms
         if self.qk_layer_norms:
             self.q_layer_norm = IdeficsRMSNorm(
-            prefix=f"{prefix}.q_layer_norm", weights=weights, eps=config.rms_norm_eps
-        )
+                prefix=f"{prefix}.q_layer_norm",
+                weights=weights,
+                eps=config.rms_norm_eps,
+            )
             self.k_layer_norm = IdeficsRMSNorm(
-            prefix=f"{prefix}.q_layer_norm", weights=weights, eps=config.rms_norm_eps
-        )
+                prefix=f"{prefix}.q_layer_norm",
+                weights=weights,
+                eps=config.rms_norm_eps,
+            )
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+        return (
+            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
 
     def forward(
         self,
@@ -470,20 +570,42 @@ class IdeficsAttention(nn.Module):
         bsz, q_len, _ = hidden_states.size()
 
         if is_cross_attention:
-            query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim)# .transpose(1, 2)
+            query_states = self.q_proj(hidden_states).view(
+                bsz, q_len, self.num_heads, self.head_dim
+            )  # .transpose(1, 2)
             query_states = query_states.transpose(1, 2)
-            _, kv_len, _ = key_value_states.size()  # Note that, in this case, `kv_len` == `kv_seq_len`
-            key_states = self.k_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+            (
+                _,
+                kv_len,
+                _,
+            ) = (
+                key_value_states.size()
+            )  # Note that, in this case, `kv_len` == `kv_seq_len`
+            key_states = (
+                self.k_proj(key_value_states)
+                .view(bsz, kv_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
+            )
             value_states = (
-                self.v_proj(key_value_states).view(bsz, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+                self.v_proj(key_value_states)
+                .view(bsz, kv_len, self.num_heads, self.head_dim)
+                .transpose(1, 2)
             )
         else:
             qkv = self.qkv(hidden_states)
-            query_states, key_states, value_states = qkv.split(self.num_heads * self.head_dim, dim=2)
+            query_states, key_states, value_states = qkv.split(
+                self.num_heads * self.head_dim, dim=2
+            )
 
-            query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)# .transpose(1, 2)
-            key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim)# . transpose(1, 2)
-            value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim)# .transpose(1, 2)
+            query_states = query_states.view(
+                bsz, q_len, self.num_heads, self.head_dim
+            )  # .transpose(1, 2)
+            key_states = key_states.view(
+                bsz, q_len, self.num_heads, self.head_dim
+            )  # . transpose(1, 2)
+            value_states = value_states.view(
+                bsz, q_len, self.num_heads, self.head_dim
+            )  # .transpose(1, 2)
             kv_seq_len = q_len
             if past_key_value is not None:
                 kv_seq_len += past_key_value[0].shape[-2]
@@ -492,11 +614,17 @@ class IdeficsAttention(nn.Module):
                 position_ids.view(-1), max_s, hidden_states.dtype
             )
 
-            shape = query_states.shape
-            query_states = self.rotary_emb(query_states.view(-1, *shape[2:]), cos, sin).view(shape)
+            query_shape = query_states.shape
+            key_shape = key_states.shape
+            self.rotary_emb(
+                query_states.view(-1, *query_shape[2:]),
+                key_states.reshape(-1, *key_shape[2:]),
+                cos,
+                sin,
+            )
 
-            shape = key_states.shape
-            key_states = self.rotary_emb(key_states.reshape(-1, *shape[2:]), cos, sin).view(shape)
+            query_states = query_states.view(query_shape)
+            key_states = key_states.view(key_shape)
 
             query_states = query_states.transpose(1, 2)
             key_states = key_states.transpose(1, 2)
@@ -571,8 +699,14 @@ class IdeficsDecoderLayer(nn.Module):
             prefix=f"{prefix}.mlp",
             weights=weights,
         )
-        self.input_layernorm = IdeficsRMSNorm(prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = IdeficsRMSNorm(prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=config.rms_norm_eps)
+        self.input_layernorm = IdeficsRMSNorm(
+            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = IdeficsRMSNorm(
+            prefix=f"{prefix}.post_attention_layernorm",
+            weights=weights,
+            eps=config.rms_norm_eps,
+        )
         self.dropout = config.dropout
 
     def forward(
@@ -583,7 +717,9 @@ class IdeficsDecoderLayer(nn.Module):
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> Tuple[
+        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+    ]:
         """
         Args:
             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -650,14 +786,22 @@ class IdeficsGatedCrossAttentionLayer(nn.Module):
             prefix=f"{prefix}.mlp",
             weights=weights,
         )
-        self.input_layernorm = IdeficsRMSNorm(prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = IdeficsRMSNorm(prefix=f"{prefix}.post_attention_layernorm", weights=weights, eps=config.rms_norm_eps)
+        self.input_layernorm = IdeficsRMSNorm(
+            prefix=f"{prefix}.input_layernorm", weights=weights, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = IdeficsRMSNorm(
+            prefix=f"{prefix}.post_attention_layernorm",
+            weights=weights,
+            eps=config.rms_norm_eps,
+        )
         self.config = config.dropout
 
         self.act_cross_attn = nn.Tanh()
         self.act_dense = nn.Tanh()
 
-        self.alpha_cross_attn = nn.Parameter(weights.get_tensor(f"{prefix}.alpha_cross_attn"))
+        self.alpha_cross_attn = nn.Parameter(
+            weights.get_tensor(f"{prefix}.alpha_cross_attn")
+        )
         self.alpha_dense = nn.Parameter(weights.get_tensor(f"{prefix}.alpha_dense"))
 
         if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")):
@@ -673,7 +817,9 @@ class IdeficsGatedCrossAttentionLayer(nn.Module):
         use_cache: Optional[bool] = False,
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         no_images: Optional[bool] = False,
-    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+    ) -> Tuple[
+        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+    ]:
         """
         Args:
             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
@@ -695,7 +841,9 @@ class IdeficsGatedCrossAttentionLayer(nn.Module):
             )
 
         if past_key_value is not None:
-            raise NotImplementedError("Past key value states are not implemented for Idefics cross attention module.")
+            raise NotImplementedError(
+                "Past key value states are not implemented for Idefics cross attention module."
+            )
 
         residual = hidden_states
 
@@ -711,7 +859,9 @@ class IdeficsGatedCrossAttentionLayer(nn.Module):
         # hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
         # when there are no images the model is used in pure language mode
         gate = 0 if no_images else 1
-        hidden_states = residual + gate * self.act_cross_attn(self.alpha_cross_attn) * hidden_states
+        hidden_states = (
+            residual + gate * self.act_cross_attn(self.alpha_cross_attn) * hidden_states
+        )
 
         # Fully Connected
         residual = hidden_states
@@ -896,11 +1046,14 @@ class IdeficsModel(IdeficsPreTrainedModel):
         self.gated_cross_attn_layers = nn.ModuleList(
             [
                 IdeficsGatedCrossAttentionLayer(layer_id, config, weights)
-                for layer_id in range(num_cross_layers)]
+                for layer_id in range(num_cross_layers)
+            ]
         )
         # self.gradient_checkpointing = False
 
-        self.norm = IdeficsRMSNorm(prefix=f"model.norm", weights=weights, eps=config.rms_norm_eps)
+        self.norm = IdeficsRMSNorm(
+            prefix=f"model.norm", weights=weights, eps=config.rms_norm_eps
+        )
 
         # self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -932,7 +1085,9 @@ class IdeficsModel(IdeficsPreTrainedModel):
     #     self.embed_tokens = value
 
     # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
-    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+    def _prepare_decoder_attention_mask(
+        self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+    ):
         # create causal mask
         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
         combined_attention_mask = None
@@ -946,11 +1101,13 @@ class IdeficsModel(IdeficsPreTrainedModel):
 
         if attention_mask is not None:
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
-                inputs_embeds.device
-            )
+            expanded_attn_mask = _expand_mask(
+                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            ).to(inputs_embeds.device)
             combined_attention_mask = (
-                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+                expanded_attn_mask
+                if combined_attention_mask is None
+                else expanded_attn_mask + combined_attention_mask
             )
 
         return combined_attention_mask
@@ -974,23 +1131,35 @@ class IdeficsModel(IdeficsPreTrainedModel):
     ) -> Union[Tuple, BaseModelOutputWithPastImage]:
         device = input_ids.device if input_ids is not None else inputs_embeds.device
 
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
         )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
 
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
 
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+            raise ValueError(
+                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+            )
         elif input_ids is not None:
             batch_size, seq_length = input_ids.shape
         elif inputs_embeds is not None:
             batch_size, seq_length, _ = inputs_embeds.shape
         else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+            raise ValueError(
+                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+            )
 
         seq_length_with_past = seq_length
         past_key_values_length = 0
@@ -1006,7 +1175,10 @@ class IdeficsModel(IdeficsPreTrainedModel):
         elif position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
             position_ids = torch.arange(
-                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
             )
             position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
         else:
@@ -1016,29 +1188,52 @@ class IdeficsModel(IdeficsPreTrainedModel):
 
         if image_hidden_states is None:
             if pixel_values is None and image_embeddings is None:
-                raise ValueError("Either pixel_values and image_embeddings have to be not-None.")
+                raise ValueError(
+                    "Either pixel_values and image_embeddings have to be not-None."
+                )
 
             elif pixel_values is not None and image_embeddings is not None:
-                raise ValueError("You cannot specify both pixel_values and image_embeddings at the same time")
+                raise ValueError(
+                    "You cannot specify both pixel_values and image_embeddings at the same time"
+                )
 
             elif pixel_values is not None:
                 no_images = len(torch.nonzero(pixel_values)) == 0
-                pixel_values = pixel_values.to(dtype=self.dtype, device=device)  # fp16 compatibility
+                pixel_values = pixel_values.to(
+                    dtype=self.dtype, device=device
+                )  # fp16 compatibility
                 batch_size, num_images = pixel_values.shape[:2]
-                pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
+                pixel_values = pixel_values.contiguous().view(
+                    batch_size * num_images, *pixel_values.shape[2:]
+                )
 
                 # Get sequence from the vision encoder
-                image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
+                image_hidden_states = self.vision_model(
+                    pixel_values=pixel_values
+                ).last_hidden_state
 
             elif image_embeddings is not None:
-                batch_size, num_images, image_seq_len, image_hidden_size = image_embeddings.size()
-                image_hidden_states = image_embeddings.to(dtype=self.dtype, device=input_ids.device)
-                image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)
+                (
+                    batch_size,
+                    num_images,
+                    image_seq_len,
+                    image_hidden_size,
+                ) = image_embeddings.size()
+                image_hidden_states = image_embeddings.to(
+                    dtype=self.dtype, device=input_ids.device
+                )
+                image_hidden_states = image_hidden_states.view(
+                    batch_size * num_images, image_seq_len, image_hidden_size
+                )
 
             if self.config.use_resampler:
                 image_hidden_states = self.perceiver_resampler(image_hidden_states)
-            image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
-            image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)
+            image_seq_len, image_hidden_size = image_hidden_states.size(
+                1
+            ), image_hidden_states.size(2)
+            image_hidden_states = image_hidden_states.view(
+                batch_size, num_images * image_seq_len, image_hidden_size
+            )
         else:
             no_images = False
             num_images = pixel_values.shape[1]
@@ -1050,7 +1245,9 @@ class IdeficsModel(IdeficsPreTrainedModel):
         text_seq_len = image_attention_mask.size(1)
         image_attention_mask = image_attention_mask.unsqueeze(-1)
         image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
-        image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)
+        image_attention_mask = image_attention_mask.view(
+            batch_size, text_seq_len, num_images * image_seq_len
+        )
         image_batch_size, image_sequence_length, _ = image_hidden_states.size()
         image_hidden_shape = (image_batch_size, image_sequence_length)
         if image_attention_mask is None:
@@ -1060,7 +1257,6 @@ class IdeficsModel(IdeficsPreTrainedModel):
         # if list(image_attention_mask.shape) != [4, 1, 1024, 64]:
         #     raise ValueError(f"Image hidden_states {image_hidden_states.shape} - mask {image_attention_mask.shape} {num_images} {image_seq_len} {text_seq_len}")
 
-
         # if image_hidden_states is not None:
         # else:
         #     image_attention_mask = None
@@ -1070,10 +1266,15 @@ class IdeficsModel(IdeficsPreTrainedModel):
         # embed positions
         if attention_mask is None:
             attention_mask = torch.ones(
-                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+                (batch_size, seq_length_with_past),
+                dtype=torch.bool,
+                device=inputs_embeds.device,
             )
         attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            attention_mask,
+            (batch_size, seq_length),
+            inputs_embeds,
+            past_key_values_length,
         )
 
         hidden_states = inputs_embeds
@@ -1094,7 +1295,9 @@ class IdeficsModel(IdeficsPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            past_key_value = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
 
             def vblock(
                 main_block,
@@ -1194,7 +1397,11 @@ class IdeficsModel(IdeficsPreTrainedModel):
 
         next_cache = next_decoder_cache if use_cache else None
         if not return_dict:
-            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                if v is not None
+            )
         return BaseModelOutputWithPastImage(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
@@ -1230,7 +1437,7 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.FloatTensor] = None,
         image_embeddings: Optional[torch.FloatTensor] = None,
-        image_hidden_states:  Optional[torch.FloatTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
         image_attention_mask: Optional[torch.Tensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
@@ -1264,11 +1471,19 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
         "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
         ```"""
 
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
@@ -1288,17 +1503,20 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
         )
 
         hidden_states = outputs[0]
-        logits = self.lm_head(hidden_states)
+        logits, speculative_logits = self.lm_head(hidden_states)
 
         loss = None
 
-        return CausalLMOutputWithPastImage(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            image_hidden_states=outputs.image_hidden_states
+        return (
+            CausalLMOutputWithPastImage(
+                loss=loss,
+                logits=logits,
+                past_key_values=outputs.past_key_values,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+                image_hidden_states=outputs.image_hidden_states,
+            ),
+            speculative_logits,
         )
 
     def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
@@ -1316,12 +1534,20 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
         return expand_inputs_for_generation(*args, **model_kwargs)
 
     @staticmethod
-    def _update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False):
-        return update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder)
+    def _update_model_kwargs_for_generation(
+        outputs, model_kwargs, is_encoder_decoder=False
+    ):
+        return update_model_kwargs_for_generation(
+            outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
+        )
 
     @staticmethod
     def _reorder_cache(past, beam_idx):
         reordered_past = ()
         for layer_past in past:
-            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+            reordered_past += (
+                tuple(
+                    past_state.index_select(0, beam_idx) for past_state in layer_past
+                ),
+            )
         return reordered_past
diff --git a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
index def78390..af44490b 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_perceiver.py
@@ -41,12 +41,13 @@ from typing import Optional, Tuple
 import torch
 import torch.nn as nn
 
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
 )
 
-EPS=1e-5
+EPS = 1e-5
+
 
 class IdeficsPerceiverResampler(nn.Module):
     def __init__(
@@ -78,7 +79,12 @@ class IdeficsPerceiverResampler(nn.Module):
 
         """
         super().__init__()
-        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = embed_dim, n_heads, head_dim, n_latents
+        self.embed_dim, self.n_heads, self.head_dim, self.n_latents = (
+            embed_dim,
+            n_heads,
+            head_dim,
+            n_latents,
+        )
         self.qk_layer_norms = config.perceiver_config.qk_layer_norms_perceiver
 
         # Create Latents for Perceiver
@@ -107,14 +113,16 @@ class IdeficsPerceiverResampler(nn.Module):
                             prefix=f"{prefix}.blocks.{layer_id}.1",
                             intermediate_size=self.intermediate_dim,
                             config=config,
-                            weights=weights
+                            weights=weights,
                         ),
                     ]
                 )
                 for layer_id in range(depth)
             ]
         )
-        self.layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS)
+        self.layer_norm = nn.LayerNorm.load(
+            prefix=f"{prefix}.layer_norm", weights=weights, eps=EPS
+        )
 
     def forward(self, context: torch.Tensor) -> torch.Tensor:
         """Resample arbitrary length context & *compress* down to self.n_latents latent embeddings"""
@@ -130,25 +138,34 @@ class IdeficsPerceiverResampler(nn.Module):
 
 
 class IdeficsPerceiverAttention(nn.Module):
-    def __init__(self,
-            prefix,
-            config,
-            embed_dim: int,
-            n_heads: int,
-            head_dim: int,
-            qk_layer_norms: bool,
-            weights
-        ) -> None:
+    def __init__(
+        self,
+        prefix,
+        config,
+        embed_dim: int,
+        n_heads: int,
+        head_dim: int,
+        qk_layer_norms: bool,
+        weights,
+    ) -> None:
         """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
         super().__init__()
         self.embed_dim, self.n_heads, self.head_dim = embed_dim, n_heads, head_dim
         self.qk_layer_norms = qk_layer_norms
         # Normalization & Scaling
-        self.context_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS)
-        self.latents_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS)
+        self.context_layer_norm = nn.LayerNorm.load(
+            prefix=f"{prefix}.context_layer_norm", weights=weights, eps=EPS
+        )
+        self.latents_layer_norm = nn.LayerNorm.load(
+            prefix=f"{prefix}.latents_layer_norm", weights=weights, eps=EPS
+        )
         if self.qk_layer_norms:
-            self.q_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS)
-            self.k_layer_norm = nn.LayerNorm.load(prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS)
+            self.q_layer_norm = nn.LayerNorm.load(
+                prefix=f"{prefix}.q_layer_norm", weights=weights, eps=EPS
+            )
+            self.k_layer_norm = nn.LayerNorm.load(
+                prefix=f"{prefix}.k_layer_norm", weights=weights, eps=EPS
+            )
 
         self.qk_scale = self.head_dim**-0.5
 
@@ -164,10 +181,10 @@ class IdeficsPerceiverAttention(nn.Module):
         self.q_proj = TensorParallelColumnLinear.load(
             config=config, prefix=f"{prefix}.q_proj", weights=weights, bias=False
         )
-        self.k_proj =  TensorParallelColumnLinear.load(
+        self.k_proj = TensorParallelColumnLinear.load(
             config=config, prefix=f"{prefix}.k_proj", weights=weights, bias=False
         )
-        self.v_proj =  TensorParallelColumnLinear.load(
+        self.v_proj = TensorParallelColumnLinear.load(
             config=config, prefix=f"{prefix}.v_proj", weights=weights, bias=False
         )
 
@@ -202,7 +219,12 @@ class IdeficsPerceiverAttention(nn.Module):
         # Multiheaded Self-Attention w/ stable softmax (subtract per-row max -- `amax` -- before softmax call)
         #   =>> `attn` should be a 2D matrix of shape [n_latents x (context + n_latents)]
         # einsum.rearrange(x, "bsz seq (heads embed) -> bsz heads seq embed", heads=self.n_heads)
-        q, k, v = [x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(1, 2) for x in (q, k, v)]
+        q, k, v = [
+            x.reshape(batch_size, x.shape[1], self.n_heads, self.head_dim).transpose(
+                1, 2
+            )
+            for x in (q, k, v)
+        ]
 
         if self.qk_layer_norms:
             q = self.q_layer_norm(q)
@@ -219,25 +241,34 @@ class IdeficsPerceiverAttention(nn.Module):
 
 
 class IdeficsMLP(nn.Module):
-    def __init__(self,
-            prefix,
-            intermediate_size,
-            config,
-            weights,
-        ):
+    def __init__(
+        self,
+        prefix,
+        intermediate_size,
+        config,
+        weights,
+    ):
         """Simple MLP block with intermediate_size and embedding size"""
         super().__init__()
         self.embed_dim = config.vision_config.embed_dim
         self.ln = nn.LayerNorm.load(prefix=f"{prefix}.ln", weights=weights, eps=EPS)
         self.fc = TensorParallelColumnLinear.load(
-            config=config, prefix=f"{prefix}.fc", weights=weights, bias=False,
+            config=config,
+            prefix=f"{prefix}.fc",
+            weights=weights,
+            bias=False,
         )
         self.act = nn.ReLU()
         self.c_proj = TensorParallelRowLinear.load(
-            config=config, prefix=f"{prefix}.c_proj", weights=weights, bias=False,
+            config=config,
+            prefix=f"{prefix}.c_proj",
+            weights=weights,
+            bias=False,
         )
 
-    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+    def forward(
+        self, hidden_states: Optional[Tuple[torch.FloatTensor]]
+    ) -> torch.FloatTensor:
         hidden_states = self.ln(hidden_states)
         hidden_states = self.fc(hidden_states)
         hidden_states = self.act(hidden_states)
diff --git a/server/text_generation_server/models/custom_modeling/idefics_processing.py b/server/text_generation_server/models/custom_modeling/idefics_processing.py
index e24fc7bd..7bba6977 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_processing.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_processing.py
@@ -21,9 +21,16 @@ from urllib.parse import urlparse
 
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from transformers.tokenization_utils_base import (
+    BatchEncoding,
+    PaddingStrategy,
+    TextInput,
+    TruncationStrategy,
+)
 from transformers.utils import TensorType, is_torch_available
-from text_generation_server.models.custom_modeling.idefics_image_processing import IdeficsImageProcessor
+from text_generation_server.models.custom_modeling.idefics_image_processing import (
+    IdeficsImageProcessor,
+)
 
 
 if is_torch_available():
@@ -106,6 +113,12 @@ def is_url(string):
     return all([result.scheme, result.netloc])
 
 
+def is_image(string):
+    """Checks if the passed string refers to an image: either it is accepted by `is_url` (a valid url and nothing
+    else) or it starts with `data:`"""
+    return is_url(string) or string.startswith("data:")
+
+
 class IdeficsProcessor(ProcessorMixin):
     r"""
     Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.
@@ -120,11 +133,19 @@ class IdeficsProcessor(ProcessorMixin):
             An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
         image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
     """
+
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "IdeficsImageProcessor"
     tokenizer_class = "LlamaTokenizerFast"
 
-    def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
+    def __init__(
+        self,
+        image_processor,
+        tokenizer=None,
+        image_size=224,
+        add_end_of_utterance_token=None,
+        **kwargs,
+    ):
         if image_processor is None:
             raise ValueError("You need to specify an `image_processor`.")
         if tokenizer is None:
@@ -142,7 +163,8 @@ class IdeficsProcessor(ProcessorMixin):
 
         self.tokenizer_was_trained_with_end_of_utterance_token = (
             True
-            if "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
+            if "<end_of_utterance>"
+            in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
             else False
         )
 
@@ -265,7 +287,9 @@ class IdeficsProcessor(ProcessorMixin):
 
         # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
-            add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
+            add_end_of_utterance_token = (
+                self.tokenizer_was_trained_with_end_of_utterance_token
+            )
 
         # turn non-batched prompts into batched
         if not any(isinstance(i, list) for i in prompts):
@@ -297,7 +321,7 @@ class IdeficsProcessor(ProcessorMixin):
 
                 if isinstance(item, str):
                     item = item.strip(" ")
-                    if is_url(item):
+                    if is_image(item):
                         image = self.image_processor.fetch_images(item)
                         full_text += image_tokens(last_was_image)
                         image_objects.append(image)
@@ -358,10 +382,14 @@ class IdeficsProcessor(ProcessorMixin):
             current_images = images[:local_max_num_images]
 
             if len(current_images) > 0:
-                padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
+                padded_image_tensor = torch.zeros(
+                    max_num_images, *current_images.size()[1:]
+                )
                 padded_image_tensor[: current_images.size(0)] = current_images
             else:
-                padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
+                padded_image_tensor = torch.zeros(
+                    max_num_images, *self.default_image_dims
+                )
 
             output_images.append(padded_image_tensor)
             output_input_ids.append(torch.tensor(padded_input_ids))
@@ -373,14 +401,19 @@ class IdeficsProcessor(ProcessorMixin):
         output_attention_masks = torch.stack(output_attention_masks)
 
         if at_least_one_image:
-            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(output_input_ids, self.tokenizer)
+            image_attention_mask, _ = image_attention_mask_for_packed_input_ids(
+                output_input_ids, self.tokenizer
+            )
             image_attention_mask = incremental_to_binary_attention_mask(
                 image_attention_mask, num_classes=max_num_images
             )
         else:
             # in full language mode we set the image mask to all-0s
             image_attention_mask = torch.zeros(
-                output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=torch.bool
+                output_input_ids.shape[0],
+                output_input_ids.shape[1],
+                1,
+                dtype=torch.bool,
             )
 
         return BatchFeature(
diff --git a/server/text_generation_server/models/custom_modeling/idefics_vision.py b/server/text_generation_server/models/custom_modeling/idefics_vision.py
index d933d7c1..30c5997f 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_vision.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_vision.py
@@ -28,7 +28,7 @@ from transformers.utils import (
     ModelOutput,
     logging,
 )
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
     TensorParallelEmbedding,
@@ -75,7 +75,9 @@ class IdeficsVisionEmbeddings(nn.Module):
         self.image_size = config.image_size
         self.patch_size = config.patch_size
 
-        self.class_embedding = nn.Parameter(weights.get_tensor(f"{prefix}.class_embedding"))
+        self.class_embedding = nn.Parameter(
+            weights.get_tensor(f"{prefix}.class_embedding")
+        )
 
         self.patch_embedding = nn.Conv2d.load_no_bias(
             prefix=f"{prefix}.patch_embedding",
@@ -88,17 +90,19 @@ class IdeficsVisionEmbeddings(nn.Module):
 
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
-        # self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
         self.position_embedding = TensorParallelEmbedding(
             prefix="model.vision_model.embeddings.position_embedding", weights=weights
         )
-        # self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
-        self.position_ids = weights.get_tensor(f"{prefix}.position_ids")
+        self.position_ids = (
+            torch.arange(self.num_positions).expand((1, -1)).to(device=weights.device)
+        )
 
     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
         target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
 
         class_embeds = self.class_embedding.expand(batch_size, 1, -1)
@@ -134,7 +138,6 @@ class IdeficsVisionAttention(nn.Module):
         self.num_heads = self.num_heads // weights.process_group.size()
         self.embed_dim = self.embed_dim // weights.process_group.size()
 
-
         self.k_proj = TensorParallelColumnLinear.load(
             config, prefix=f"{prefix}.k_proj", weights=weights, bias=True
         )
@@ -149,7 +152,11 @@ class IdeficsVisionAttention(nn.Module):
         )
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+        return (
+            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
 
     def forward(
         self,
@@ -188,7 +195,10 @@ class IdeficsVisionAttention(nn.Module):
                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                     f" {causal_attention_mask.size()}"
                 )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + causal_attention_mask
+            )
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         if attention_mask is not None:
@@ -196,7 +206,10 @@ class IdeficsVisionAttention(nn.Module):
                 raise ValueError(
                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                 )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + attention_mask
+            )
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
@@ -206,12 +219,18 @@ class IdeficsVisionAttention(nn.Module):
             # make sure that attn_weights keeps its gradient.
             # In order to do so, attn_weights have to reshaped
             # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+            attn_weights_reshaped = attn_weights.view(
+                bsz, self.num_heads, tgt_len, src_len
+            )
+            attn_weights = attn_weights_reshaped.view(
+                bsz * self.num_heads, tgt_len, src_len
+            )
         else:
             attn_weights_reshaped = None
 
-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = nn.functional.dropout(
+            attn_weights, p=self.dropout, training=self.training
+        )
 
         attn_output = torch.bmm(attn_probs, value_states)
 
@@ -255,11 +274,15 @@ class IdeficsVisionEncoderLayer(nn.Module):
     def __init__(self, prefix, config, weights):
         super().__init__()
         self.embed_dim = config.hidden_size
-        self.self_attn = IdeficsVisionAttention(prefix=f"{prefix}.self_attn", config=config, weights=weights)
+        self.self_attn = IdeficsVisionAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
         self.layer_norm1 = nn.LayerNorm.load(
             prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
         )
-        self.mlp = IdeficsVisionMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+        self.mlp = IdeficsVisionMLP(
+            prefix=f"{prefix}.mlp", config=config, weights=weights
+        )
         self.layer_norm2 = nn.LayerNorm.load(
             prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
         )
@@ -320,7 +343,11 @@ class IdeficsVisionEncoder(nn.Module):
         self.config = config
         self.layers = nn.ModuleList(
             [
-                IdeficsVisionEncoderLayer(prefix=f"{prefix}.encoder.layers.{layer_id}", config=config, weights=weights)
+                IdeficsVisionEncoderLayer(
+                    prefix=f"{prefix}.encoder.layers.{layer_id}",
+                    config=config,
+                    weights=weights,
+                )
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
@@ -364,11 +391,19 @@ class IdeficsVisionEncoder(nn.Module):
             return_dict (`bool`, *optional*):
                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
         """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
@@ -408,9 +443,15 @@ class IdeficsVisionEncoder(nn.Module):
             encoder_states = encoder_states + (hidden_states,)
 
         if not return_dict:
-            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+            return tuple(
+                v
+                for v in [hidden_states, encoder_states, all_attentions]
+                if v is not None
+            )
         return BaseModelOutput(
-            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
         )
 
 
@@ -421,13 +462,19 @@ class IdeficsVisionTransformer(nn.Module):
         self.config = config
         embed_dim = config.hidden_size
 
-        self.embeddings = IdeficsVisionEmbeddings(prefix=f"{prefix}.embeddings", config=config, weights=weights)
+        self.embeddings = IdeficsVisionEmbeddings(
+            prefix=f"{prefix}.embeddings", config=config, weights=weights
+        )
         self.pre_layrnorm = nn.LayerNorm.load(
             prefix=f"{prefix}.pre_layrnorm", weights=weights, eps=config.layer_norm_eps
         )
-        self.encoder = IdeficsVisionEncoder(prefix=prefix, config=config, weights=weights)
+        self.encoder = IdeficsVisionEncoder(
+            prefix=prefix, config=config, weights=weights
+        )
         self.post_layernorm = nn.LayerNorm.load(
-            prefix=f"{prefix}.post_layernorm", weights=weights, eps=config.layer_norm_eps
+            prefix=f"{prefix}.post_layernorm",
+            weights=weights,
+            eps=config.layer_norm_eps,
         )
 
     # copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward
@@ -442,11 +489,19 @@ class IdeficsVisionTransformer(nn.Module):
         Returns:
 
         """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py
new file mode 100644
index 00000000..567131ef
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/llava_next.py
@@ -0,0 +1,287 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Llava-NeXT model."""
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers.activations import ACT2FN
+from transformers.image_processing_utils import select_best_resolution
+
+from text_generation_server.models.custom_modeling.vlm import (
+    load_text_model,
+    load_vision_model,
+)
+from text_generation_server.layers import (
+    TensorParallelColumnLinear,
+    TensorParallelRowLinear,
+)
+
+
+def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
+    """
+    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+    Args:
+        image_size (`tuple`):
+            The size of the input image in the format (height, width).
+        grid_pinpoints (`List`):
+            A list containing possible resolutions. Each item in the list should be a tuple or list
+            of the form `(height, width)`.
+        patch_size (`int`):
+            The size of each image patch.
+
+    Returns:
+        tuple: The shape of the image patch grid in the format (height, width).
+    """
+    if not isinstance(grid_pinpoints, list):
+        raise ValueError("grid_pinpoints should be a list of tuples or lists")
+
+    height, width = select_best_resolution(image_size, grid_pinpoints)
+    return height // patch_size, width // patch_size
+
+
+def unpad_image(tensor, original_size):
+    """
+    Unpads a PyTorch tensor of a padded and resized image.
+
+    Args:
+        tensor (`torch.Tensor`):
+            The image tensor, assumed to be of shape (num_channels, height, width).
+        original_size (`tuple`):
+            The original size of the image (height, width).
+
+    Returns:
+        `torch.Tensor`: The unpadded image tensor.
+    """
+    original_height, original_width = original_size
+    current_height, current_width = tensor.shape[1:]
+
+    original_aspect_ratio = original_width / original_height
+    current_aspect_ratio = current_width / current_height
+
+    if original_aspect_ratio > current_aspect_ratio:
+        scale_factor = current_width / original_width
+        new_height = int(original_height * scale_factor)
+        padding = (current_height - new_height) // 2
+        unpadded_tensor = tensor[:, padding : current_height - padding, :]
+    else:
+        scale_factor = current_height / original_height
+        new_width = int(original_width * scale_factor)
+        padding = (current_width - new_width) // 2
+        unpadded_tensor = tensor[:, :, padding : current_width - padding]
+
+    return unpadded_tensor
+
+
+# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext
+class LlavaNextMultiModalProjector(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+
+        self.linear_1 = TensorParallelColumnLinear.load(
+            prefix=f"{prefix}.linear_1", config=config, weights=weights, bias=True
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = TensorParallelRowLinear.load(
+            prefix=f"{prefix}.linear_2", config=config, weights=weights, bias=True
+        )
+
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class LlavaNextForConditionalGeneration(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        config.vision_config.quantize = config.quantize
+        vision_config = config.vision_config
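+        # Rather than running every vision layer and then selecting hidden_states[vision_feature_layer] (e.g. -2),
+        # truncate the vision tower so that its last computed layer is the selected one, and don't pool.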
+        if config.vision_feature_layer < 0:
+            vision_config.num_hidden_layers += config.vision_feature_layer + 1
+        else:
+            vision_config.num_hidden_layers = config.vision_feature_layer + 1
+        self.vision_tower = load_vision_model(
+            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
+            config=config.vision_config,
+            weights=weights,
+        )
+
+        self.multi_modal_projector = LlavaNextMultiModalProjector(
+            prefix="multi_modal_projector", config=config, weights=weights
+        )
+
+        self.image_newline = weights.get_tensor("image_newline")
+
+        self.vocab_size = config.text_config.vocab_size
+        self.config = config
+        config.text_config.quantize = config.quantize
+        config.text_config.speculator = config.speculator
+        self.text_model = load_text_model(
+            prefix="language_model" if not prefix else f"{prefix}.language_model",
+            config=config.text_config,
+            weights=weights,
+        )
+        self.pad_token_id = (
+            config.pad_token_id if config.pad_token_id is not None else -1
+        )
+
+    def _merge_input_ids_with_image_features(
+        self,
+        input_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        image_features: torch.Tensor,
+    ):
+        """In place merges in vision_embeddings with inputs_embeds."""
+        mask = input_ids == self.config.image_token_index
+        # Let's pray we have enabled enough slots !
+        try:
+            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+        except Exception as e:
+            raise RuntimeError(
+                f"Cannot fill images right now. If error happens at warmup, make sure you have enough `--max-input-tokens`  to handle images. If error happens at regular runtime, please fill in an issue: {e}"
+            )
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        cu_seqlen_prefill: Optional[torch.Tensor],
+        kv_cache: List[Tuple[torch.Tensor, torch.Tensor]],
+        block_tables: torch.Tensor,
+        slots: torch.Tensor,
+        input_lengths: torch.Tensor,
+        max_s: int,
+        prefill_cache_indices: Optional[torch.Tensor],
+        lm_head_indices: Optional[torch.Tensor] = None,
+        pixel_values: torch.FloatTensor = None,
+        # Unused for this model
+        pixel_attention_mask=None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        adapter_data: Optional[torch.Tensor] = None,
+    ):
+        inputs_embeds = self.text_model.embed_tokens(input_ids)
+        if pixel_values is not None and len(pixel_values) > 0:
+            # num_special_image_tokens = (input_ids == self.config.image_token_index).sum()
+            # assert num_special_image_tokens == len(pixel_values), f"Received {num_special_image_tokens} for {len(pixel_values)} images, this is invalid"
+            # 1. Extract the input embeddings
+
+            # 2. Merge text and images
+            num_images, num_patches, channels, height, width = pixel_values.shape
+            pixel_values = pixel_values.view(
+                num_images * num_patches, channels, height, width
+            )
+            image_features = self.vision_tower(pixel_values)
+
+            # selected_image_feature = image_features.hidden_states[self.config.vision_feature_layer]
+            # Already done within the clip model
+            selected_image_feature = image_features.last_hidden_state
+
+            if self.config.vision_feature_select_strategy == "default":
+                selected_image_feature = selected_image_feature[:, 1:]
+            elif self.config.vision_feature_select_strategy == "full":
+                selected_image_feature = selected_image_feature
+            else:
+                raise RuntimeError(
+                    f"Strategy `{self.config.vision_feature_select_strategy}` is not supported/valid."
+                )
+
+            image_features = self.multi_modal_projector(selected_image_feature)
+
+            # split up image_features for each of the individual images
+            # hence we get a list of image_features, each of shape (5, num_patches, hidden_size)
+            # if we assume each image has 5 image features (base image + 4 patches)
+            split_sizes = [num_patches] * num_images
+            image_features = torch.split(image_features, split_sizes, dim=0)
+
+            # NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
+            height = width = (
+                self.config.vision_config.image_size
+                // self.config.vision_config.patch_size
+            )
+
+            new_image_features = []
+            for image_idx, image_feature in enumerate(image_features):
+                if image_feature.shape[0] > 1:
+                    base_image_feature = image_feature[0]
+                    image_feature = image_feature[1:]
+
+                    if height * width != base_image_feature.shape[0]:
+                        raise ValueError(
+                            "The number of patches is not consistent with the image size."
+                        )
+
+                    # Dimensions are intentionally swapped to be bug-compatible with
+                    # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
+                    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+                        image_sizes[image_idx],
+                        self.config.image_grid_pinpoints,
+                        self.config.vision_config.image_size,
+                    )
+                    image_feature = image_feature.view(
+                        num_patch_height, num_patch_width, height, width, -1
+                    )
+                    image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+                    image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+                    image_feature = unpad_image(image_feature, image_sizes[image_idx])
+                    image_feature = torch.cat(
+                        (
+                            image_feature,
+                            self.image_newline[:, None, None].expand(
+                                *image_feature.shape[:-1], 1
+                            ),
+                        ),
+                        dim=-1,
+                    )
+                    image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+                    image_feature = torch.cat(
+                        (base_image_feature, image_feature), dim=0
+                    )
+                else:
+                    image_feature = image_feature[0]
+                    image_feature = torch.cat(
+                        (image_feature, self.image_newline[None]), dim=0
+                    )
+                new_image_features.append(image_feature)
+            image_features = torch.stack(new_image_features, dim=0)
+
+            inputs_embeds = self._merge_input_ids_with_image_features(
+                input_ids, inputs_embeds, image_features
+            )
+
+        hidden_states = self.text_model.model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            cu_seqlen_prefill=cu_seqlen_prefill,
+            kv_cache=kv_cache,
+            block_tables=block_tables,
+            slots=slots,
+            input_lengths=input_lengths,
+            max_s=max_s,
+            true_max_s=max_s,
+            prefill_cache_indices=None,
+        )
+        if lm_head_indices is not None:
+            hidden_states = hidden_states[lm_head_indices]
+        logits, speculative_logits = self.text_model.lm_head(hidden_states)
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
new file mode 100644
index 00000000..293051c2
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
@@ -0,0 +1,232 @@
+import torch
+import torch.distributed
+
+from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+from mamba_ssm.ops.selective_scan_interface import selective_scan_fn
+from torch import nn
+from typing import Optional, Tuple, Any
+from transformers.configuration_utils import PretrainedConfig
+import torch.nn.functional as F
+
+from text_generation_server.layers import (
+    SpeculativeHead,
+    TensorParallelEmbedding,
+    FastLinear,
+)
+from text_generation_server.layers.layernorm import FastRMSNorm
+
+from einops import rearrange
+from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+import math
+from dataclasses import dataclass
+
+
+@dataclass
+class InferenceParams:
+    """Inference parameters that are passed to the main model in order
+    to efficiently calculate and store the context during inference."""
+
+    max_seqlen: int
+    max_batch_size: int
+    conv_states: torch.Tensor  # indexed by layer id; overwritten in place by MambaModel.forward
+    ssm_states: torch.Tensor  # indexed by layer id; overwritten in place by MambaModel.forward
+    seqlen_offset: int  # advanced by input_ids.size(1) at the end of MambaModel.forward
+
+
+class MambaConfig(PretrainedConfig):
+    def __init__(
+        self,
+        vocab_size=50280,
+        d_model=768,
+        d_state=16,
+        n_layer=32,
+        layer_norm_epsilon=1e-5,
+        tie_word_embeddings=False,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        expand=2,
+        dt_rank="auto",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_layer = n_layer
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.d_model = d_model
+        self.d_inner = int(d_model * expand)  # honours `expand`; the default of 2 keeps the previous d_model * 2
+        self.d_conv = 4  # fixed here; not exposed as a constructor argument
+        self.d_state = d_state
+        self.expand = expand
+        self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank  # "auto" -> ceil(d_model / 16)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class MambaBlock(nn.Module):
+    def __init__(self, prefix, config, weights, layer_id):
+        super().__init__()
+        self.layer_id = layer_id  # index into inference_params.conv_states / ssm_states
+        self.in_proj = FastLinear.load(config, f"{prefix}.in_proj", weights, bias=False)
+        self.x_proj = FastLinear.load(config, f"{prefix}.x_proj", weights, bias=False)
+        self.dt_proj = FastLinear.load(config, f"{prefix}.dt_proj", weights, bias=True)
+        self.dt_proj_no_bias = FastLinear.load(  # NOTE(review): not read by forward() or step() below
+            config, f"{prefix}.dt_proj", weights, bias=False
+        )
+        self.out_proj = FastLinear.load(
+            config, f"{prefix}.out_proj", weights, bias=False
+        )
+        self.conv1d = FastLinear.load(config, f"{prefix}.conv1d", weights, bias=True)  # only .weight/.bias are read below
+        self.negA = -torch.exp(weights.get_tensor(f"{prefix}.A_log").float())  # A = -exp(A_log), computed once in float32
+        self.D = weights.get_tensor(f"{prefix}.D")
+        self.activation = "silu"  # activation name handed to the causal_conv1d kernels
+        self.dt_rank = config.dt_rank
+        self.d_state = config.d_state
+        self.d_conv = config.d_conv
+        self.act = nn.SiLU()  # NOTE(review): forward()/step() pass self.activation to the kernels instead
+
+    # Full-sequence scan when seqlen_offset == 0, else step(); inference_params is dereferenced despite its None default.
+    def forward(self, hidden_states: torch.Tensor, inference_params=None):
+        if inference_params.seqlen_offset > 0:  # tokens were already processed: reuse this layer's cached states
+            conv_state = inference_params.conv_states[self.layer_id]
+            ssm_state = inference_params.ssm_states[self.layer_id]
+            out, conv_state, ssm_state = self.step(hidden_states, conv_state, ssm_state)
+            return out, conv_state, ssm_state
+
+        _, seqlen, _ = hidden_states.shape  # input is 3-D with the sequence length on dim 1
+        projected_states = self.in_proj(hidden_states).transpose(1, 2)  # move the sequence axis last
+        # assert projected_states.shape == [batch_size, 2 * dstate, seqlen], f"{projected_states.shape} [{batch_size}, {dstate}, {seqlen}]"
+        x, z = projected_states.chunk(2, dim=1)  # z is passed to selective_scan_fn below as `z=`
+        conv_state = F.pad(x, (self.d_conv - seqlen, 0))  # left-pad to d_conv; a negative pad crops to the last d_conv steps
+        x = causal_conv1d_fn(
+            x=x,
+            weight=self.conv1d.weight.squeeze(1),
+            bias=self.conv1d.bias,
+            activation=self.activation,
+        )
+
+        # We're careful here about the layout, to avoid extra transposes.
+        # We want dt to have d as the slowest moving dimension
+        # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
+        x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d"))  # (bl d)
+        dt, B, C = torch.split(
+            x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1
+        )
+        dt = self.dt_proj.weight @ dt.t()  # weight only; the bias is supplied to the scan as delta_bias
+        dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
+        B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
+        C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
+        y, last_state = selective_scan_fn(
+            x,
+            dt,
+            self.negA,
+            B,
+            C,
+            self.D.float(),
+            z=z,
+            delta_bias=self.dt_proj.bias.float(),
+            delta_softplus=True,
+            return_last_state=True,
+        )
+        y = rearrange(y, "b d l -> b l d")  # back to sequence-major layout for out_proj
+        attn_outputs = self.out_proj(y)
+        return attn_outputs, conv_state, last_state
+
+    def step(self, hidden_states, conv_state, ssm_state):
+        xz = self.in_proj(hidden_states.squeeze(1))  # drop dim 1 (squeeze only removes it when its size is 1)
+        x, z = xz.chunk(2, dim=-1)  # (B D)
+        x = causal_conv1d_update(
+            x,
+            conv_state,
+            self.conv1d.weight.squeeze(1),
+            self.conv1d.bias,
+            self.activation,
+        )
+        x_db = self.x_proj(x)  # (B dt_rank+2*d_state)
+        dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
+        dt = F.linear(dt, self.dt_proj.weight)  # weight only; the bias goes in through dt_bias below
+        A = self.negA
+        y = selective_state_update(
+            ssm_state,
+            x,
+            dt,
+            A,
+            B,
+            C,
+            self.D,
+            z=z,
+            dt_bias=self.dt_proj.bias,
+            dt_softplus=True,
+        )
+        out = self.out_proj(y)
+        return out.unsqueeze(1), conv_state.clone(), ssm_state.clone()  # re-add dim 1; states are returned as copies
+
+
+class ResidualBlock(nn.Module):
+    def __init__(self, prefix, config, weights, layer_id):
+        super().__init__()
+        self.mamba_block = MambaBlock(
+            prefix=f"{prefix}.mixer", config=config, weights=weights, layer_id=layer_id
+        )
+        self.layer_norm = FastRMSNorm.load(
+            prefix=f"{prefix}.norm", weights=weights, eps=config.layer_norm_epsilon
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        inference_params: Optional[Any] = None,
+    ):
+        residual = (hidden_states + residual) if residual is not None else hidden_states  # first call: input is the residual
+        shape = residual.shape
+        hidden_states, _ = self.layer_norm(residual.view(-1, shape[-1]))  # norm runs on a 2-D view; second return value unused
+        hidden_states, conv_state, last_ssm_state = self.mamba_block(
+            hidden_states.view(*shape), inference_params  # restore the original shape before the mixer
+        )
+        return hidden_states, residual, conv_state, last_ssm_state  # the residual sum is left to the caller
+
+
+class MambaModel(nn.Module):
+    def __init__(self, config, weights):
+        super().__init__()
+        prefix = "backbone"
+        self.embed_tokens = TensorParallelEmbedding(f"{prefix}.embedding", weights)
+        self.blocks = nn.ModuleList(
+            [
+                ResidualBlock(f"{prefix}.layers.{i}", config, weights, layer_id=i)
+                for i in range(config.n_layer)
+            ]
+        )
+        self.norm_f = FastRMSNorm.load(
+            f"{prefix}.norm_f", weights, eps=config.layer_norm_epsilon
+        )
+        self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights)  # same prefix as embed_tokens
+        self.config = config
+
+    def forward(
+        self, input_ids: torch.Tensor, inference_params=None, residual=None
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        hidden_states = self.embed_tokens(input_ids)
+        for i, block in enumerate(self.blocks):
+            hidden_states, residual, conv_state, ssm_state = block(
+                hidden_states, residual, inference_params
+            )
+            inference_params.conv_states[i].copy_(conv_state)  # write layer i's new state back in place
+            inference_params.ssm_states[i].copy_(ssm_state)  # inference_params is required despite its None default
+
+        hidden_states = (
+            hidden_states + residual if residual is not None else hidden_states
+        )
+        hidden_states, _ = self.norm_f(hidden_states.view(-1, hidden_states.size(-1)))  # final norm on a 2-D view
+        hidden_states = hidden_states.view(residual.shape)  # NOTE(review): fails if residual is still None (no blocks)
+        logits, speculative_logits = self.lm_head(hidden_states)
+
+        # update the offset for the next inference using these params
+        inference_params.seqlen_offset += input_ids.size(1)
+        return logits, speculative_logits
diff --git a/server/text_generation_server/models/custom_modeling/mpt_modeling.py b/server/text_generation_server/models/custom_modeling/mpt_modeling.py
index 5ccf796d..fb09a8f1 100644
--- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py
@@ -2,6 +2,7 @@
 
 Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
 """
+
 import math
 import os
 import warnings
@@ -16,11 +17,11 @@ from transformers.modeling_outputs import (
 )
 from einops import rearrange
 from packaging import version
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelEmbedding,
     TensorParallelColumnLinear,
     TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
     get_linear,
 )
 
@@ -28,7 +29,6 @@ EPS = 1e-5
 
 
 def load_col(config, prefix, weights, bias):
-    assert bias == False, NotImplementedError
     assert config.quantize != "gptq", NotImplementedError
     slice_ = weights._get_slice(f"{prefix}.weight")
     rank = weights.process_group.rank()
@@ -45,7 +45,36 @@ def load_col(config, prefix, weights, bias):
     if weight.dtype != torch.int32:
         weight = weight.to(dtype=weights.dtype)
     weight = weight.to(device=weights.device)
-    bias = None
+
+    if bias:  # shard the fused q/k/v bias per rank, taking one block from each of the three parts
+        bias_slice_ = weights._get_slice(f"{prefix}.bias")
+        bias_rank = weights.process_group.rank()
+        bias_size = weights.process_group.size()
+
+        bias_h = bias_slice_.get_shape()
+        bias_h = bias_h[0] // 3  # 1-D vector holds q, k and v back to back; each part is a third of it
+        bias_block_size = bias_h // bias_size
+
+        bias_q_part = bias_slice_[
+            bias_rank * bias_block_size : (bias_rank + 1) * bias_block_size
+        ]
+        bias_k_part = bias_slice_[
+            bias_h
+            + bias_rank * bias_block_size : bias_h
+            + (bias_rank + 1) * bias_block_size
+        ]
+        bias_v_part = bias_slice_[
+            2 * bias_h
+            + bias_rank * bias_block_size : 2 * bias_h
+            + (bias_rank + 1) * bias_block_size
+        ]
+
+        bias = torch.cat([bias_q_part, bias_k_part, bias_v_part], dim=0)
+        if bias.dtype != torch.int32:
+            bias = bias.to(dtype=weights.dtype)
+        bias = bias.to(device=weights.device)
+    else:
+        bias = None
     linear = get_linear(weight, bias, config.quantize)
     return TensorParallelColumnLinear(linear)
 
@@ -330,7 +359,16 @@ class MultiheadAttention(nn.Module):
             config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias
         )
         if self.qk_ln:
-            raise NotImplementedError("qk_ln is not supported")
+            bias = not config.no_bias  # q_ln and k_ln carry a bias only when the model does
+            hidden_size = config.d_model
+            head_dim = hidden_size // self.n_heads
+
+            self.q_ln = LPLayerNorm(
+                hidden_size, bias=bias, prefix=f"{prefix}.q_ln", weights=weights
+            )
+            self.k_ln = LPLayerNorm(
+                self.n_heads * head_dim, bias=bias, prefix=f"{prefix}.k_ln", weights=weights
+            )
         if self.attn_impl == "flash":
             self.attn_fn = flash_attn_fn
         elif self.attn_impl == "triton":
@@ -581,12 +619,20 @@ class MPTBlock(nn.Module):
                 f"""Not implemented attn {config.attn_config["attn_type"]}"""
             )
         resid_pdrop = config.resid_pdrop
-        self.norm_1 = nn.LayerNorm.load_no_bias(
-            prefix=f"{prefix}.norm_1", weights=weights, eps=EPS
-        )
-        self.norm_2 = nn.LayerNorm.load_no_bias(
-            prefix=f"{prefix}.norm_2", weights=weights, eps=EPS
-        )
+        if config.no_bias:
+            self.norm_1 = nn.LayerNorm.load_no_bias(
+                prefix=f"{prefix}.norm_1", weights=weights, eps=EPS
+            )
+            self.norm_2 = nn.LayerNorm.load_no_bias(
+                prefix=f"{prefix}.norm_2", weights=weights, eps=EPS
+            )
+        else:
+            self.norm_1 = nn.LayerNorm.load(
+                prefix=f"{prefix}.norm_1", weights=weights, eps=EPS
+            )
+            self.norm_2 = nn.LayerNorm.load(
+                prefix=f"{prefix}.norm_2", weights=weights, eps=EPS
+            )
         self.attn = MultiheadAttention(config, prefix=f"{prefix}.attn", weights=weights)
         self.ffn = MPTMLP(config, prefix=f"{prefix}.ffn", weights=weights)
         self.resid_attn_dropout = nn.Dropout(resid_pdrop)
@@ -635,6 +681,9 @@ class LPLayerNorm(torch.nn.LayerNorm):
         elementwise_affine=True,
         device=None,
         dtype=None,
+        bias: Optional[bool] = True,
+        prefix=None,
+        weights=None,
     ):
         super().__init__(
             normalized_shape=normalized_shape,
@@ -642,7 +691,13 @@ class LPLayerNorm(torch.nn.LayerNorm):
             elementwise_affine=elementwise_affine,
             device=device,
             dtype=dtype,
+            bias=bias,
         )
+        if weights is not None:
+            self.weight = nn.Parameter(weights.get_sharded(f"{prefix}.weight", dim=0))
+            if bias:
+                self.bias = nn.Parameter(weights.get_sharded(f"{prefix}.bias", dim=0))
+            self.normalized_shape = self.weight.shape
 
     def forward(self, x):
         module_device = x.device
@@ -728,7 +783,7 @@ class MPTPreTrainedModel(PreTrainedModel):
 
 
 class MPTModel(MPTPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         # config._validate_config()
         super().__init__(config)
         self.world_size = weights.process_group.size()
@@ -754,21 +809,24 @@ class MPTModel(MPTPreTrainedModel):
                 f"Requested norm type ({config.norm_type}) is not implemented within this repo."
             )
 
-        self.wte = TensorParallelEmbedding("transformer.wte", weights)
+        self.wte = TensorParallelEmbedding(f"{prefix}.wte", weights)
+
         if not self.alibi:
-            # self.wpe = torch.nn.Embedding(
-            #     config.max_seq_len, config.d_model, device=config.init_device
-            # )
-            raise RuntimeError("no alibi no supported")
+            self.wpe = TensorParallelEmbedding(f"{prefix}.wpe", weights)
         self.blocks = nn.ModuleList(
             [
-                MPTBlock(config, prefix=f"transformer.blocks.{i}", weights=weights)
+                MPTBlock(config, prefix=f"{prefix}.blocks.{i}", weights=weights)
                 for i in range(config.n_layers)
             ]
         )
-        self.norm_f = nn.LayerNorm.load_no_bias(
-            prefix="transformer.norm_f", weights=weights, eps=EPS
-        )
+        if config.no_bias:  # final norm is read from the same prefix as wte and blocks
+            self.norm_f = nn.LayerNorm.load_no_bias(
+                prefix=f"{prefix}.norm_f", weights=weights, eps=EPS
+            )
+        else:
+            self.norm_f = nn.LayerNorm.load(
+                prefix=f"{prefix}.norm_f", weights=weights, eps=EPS
+            )
         self.is_causal = not self.prefix_lm
         self._attn_bias_initialized = False
         self.attn_bias = None
@@ -787,8 +845,9 @@ class MPTModel(MPTPreTrainedModel):
                     if config.verbose:
                         warnings.warn(f"Removing bias ({module.bias}) from {module}.")
                     module.register_parameter("bias", None)
-        if config.verbose and config.verbose > 2:
-            print(self)
+        if hasattr(self.config, "verbose"):
+            if config.verbose and config.verbose > 2:
+                print(self)
         if "verbose" not in self.config.init_config:
             self.config.init_config["verbose"] = self.config.verbose
         if self.config.init_config["verbose"] > 1:
@@ -1026,13 +1085,19 @@ class MPTModel(MPTPreTrainedModel):
 
 
 class MPTForCausalLM(MPTPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__(config)
+
+        if not prefix:
+            prefix = "transformer"
+        else:
+            prefix = f"{prefix}.transformer"
+
         if not config.tie_word_embeddings:
             raise ValueError("MPTForCausalLM only supports tied word embeddings")
-        self.transformer = MPTModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
-            config, prefix="transformer.wte", weights=weights
+        self.transformer = MPTModel(prefix, config, weights)
+        self.lm_head = SpeculativeHead.load(
+            config, prefix=f"{prefix}.wte", weights=weights
         )
         self.logit_scale = None
         if config.logit_scale is not None:
@@ -1074,7 +1139,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
             output_hidden_states=output_hidden_states,
             use_cache=use_cache,
         )
-        logits = self.lm_head(outputs.last_hidden_state)
+        logits, speculative_logits = self.lm_head(outputs.last_hidden_state)
         if self.logit_scale is not None:
             if self.logit_scale == 0:
                 warnings.warn(
@@ -1088,12 +1153,15 @@ class MPTForCausalLM(MPTPreTrainedModel):
             loss = F.cross_entropy(
                 logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)
             )
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
+        return (
+            CausalLMOutputWithPast(
+                loss=loss,
+                logits=logits,
+                past_key_values=outputs.past_key_values,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            ),
+            speculative_logits,
         )
 
     def prepare_inputs_for_generation(
diff --git a/server/text_generation_server/models/custom_modeling/neox_modeling.py b/server/text_generation_server/models/custom_modeling/neox_modeling.py
index 1951b171..8998778f 100644
--- a/server/text_generation_server/models/custom_modeling/neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py
@@ -40,16 +40,19 @@ from transformers.modeling_outputs import (
 from transformers.modeling_utils import PreTrainedModel
 from transformers import GPTNeoXConfig
 from loguru import logger
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
 )
 
 
 CUSTOM_KERNELS_ENABLED = False
-if not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
+if (
+    torch.cuda.is_available()
+    and not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True"
+):
     try:
         from custom_kernels import fused_attention_cuda
 
@@ -57,9 +60,6 @@ if not os.environ.get("DISABLE_CUSTOM_KERNELS", "False") == "True":
     except ImportError:
         pass
 
-if not CUSTOM_KERNELS_ENABLED:
-    logger.warning("We're not using custom kernels.")
-
 
 def make_causal_mask(
     input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
@@ -280,10 +280,10 @@ class GPTNeoXAttention(nn.Module):
         batch_size, num_attention_heads, query_length, attn_head_size = query.size()
         key_length = key.size(-2)
 
-        query = query.view(
+        query = query.reshape(
             batch_size * num_attention_heads, query_length, attn_head_size
         )
-        key = key.view(batch_size * num_attention_heads, key_length, attn_head_size)
+        key = key.reshape(batch_size * num_attention_heads, key_length, attn_head_size)
         attn_scores = torch.zeros(
             1,
             dtype=query.dtype,
@@ -404,24 +404,24 @@ class GPTNeoXMLP(nn.Module):
 
 
 class GPTNeoXLayer(nn.Module):
-    def __init__(self, layer_id, config, weights):
+    def __init__(self, layer_id, prefix: str, config, weights):
         super().__init__()
         self.use_parallel_residual = config.use_parallel_residual
         self.input_layernorm = nn.LayerNorm.load(
-            prefix=f"gpt_neox.layers.{layer_id}.input_layernorm",
+            prefix=f"{prefix}.layers.{layer_id}.input_layernorm",
             weights=weights,
             eps=config.layer_norm_eps,
         )
         self.post_attention_layernorm = nn.LayerNorm.load(
-            prefix=f"gpt_neox.layers.{layer_id}.post_attention_layernorm",
+            prefix=f"{prefix}.layers.{layer_id}.post_attention_layernorm",
             weights=weights,
             eps=config.layer_norm_eps,
         )
         self.attention = GPTNeoXAttention(
-            config, prefix=f"gpt_neox.layers.{layer_id}.attention", weights=weights
+            config, prefix=f"{prefix}.layers.{layer_id}.attention", weights=weights
         )
         self.mlp = GPTNeoXMLP(
-            config, prefix=f"gpt_neox.layers.{layer_id}.mlp", weights=weights
+            config, prefix=f"{prefix}.layers.{layer_id}.mlp", weights=weights
         )
 
     def forward(
@@ -472,23 +472,23 @@ class GPTNeoXLayer(nn.Module):
 
 
 class GPTNeoXModel(GPTNeoXPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__(config)
         self.config = config
 
         self.num_attention_heads = config.num_attention_heads
 
         self.embed_in = TensorParallelEmbedding(
-            prefix="gpt_neox.embed_in", weights=weights
+            prefix=f"{prefix}.embed_in", weights=weights
         )
         self.layers = nn.ModuleList(
             [
-                GPTNeoXLayer(layer_id, config, weights)
+                GPTNeoXLayer(layer_id, prefix, config, weights)
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
         self.final_layer_norm = nn.LayerNorm.load(
-            prefix="gpt_neox.final_layer_norm",
+            prefix=f"{prefix}.final_layer_norm",
             weights=weights,
             eps=config.layer_norm_eps,
         )
@@ -640,10 +640,16 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
 class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel):
     _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
 
-    def __init__(self, config, weights):
+    def __init__(self, prefix: str, config, weights):
         super().__init__(config)
-        self.gpt_neox = GPTNeoXModel(config, weights)
-        self.embed_out = TensorParallelHead.load(
+
+        if not prefix:
+            prefix = "gpt_neox"
+        else:
+            prefix = f"{prefix}.gpt_neox"
+
+        self.gpt_neox = GPTNeoXModel(prefix, config, weights)
+        self.embed_out = SpeculativeHead.load(
             config, prefix="embed_out", weights=weights
         )
 
@@ -718,7 +724,7 @@ class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel):
         )
 
         hidden_states = outputs[0]
-        lm_logits = self.embed_out(hidden_states)
+        lm_logits, speculative_logits = self.embed_out(hidden_states)
 
         lm_loss = None
         if labels is not None:
@@ -736,12 +742,15 @@ class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel):
             output = (lm_logits,) + outputs[1:]
             return ((lm_loss,) + output) if lm_loss is not None else output
 
-        return CausalLMOutputWithPast(
-            loss=lm_loss,
-            logits=lm_logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
+        return (
+            CausalLMOutputWithPast(
+                loss=lm_loss,
+                logits=lm_logits,
+                past_key_values=outputs.past_key_values,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            ),
+            speculative_logits,
         )
 
     def prepare_inputs_for_generation(
diff --git a/server/text_generation_server/models/custom_modeling/opt_modeling.py b/server/text_generation_server/models/custom_modeling/opt_modeling.py
index fe6f1e52..5ab02959 100644
--- a/server/text_generation_server/models/custom_modeling/opt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py
@@ -27,12 +27,12 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers import OPTConfig
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     FastLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
 )
 
 EPS = 1e-5
@@ -94,11 +94,11 @@ class OPTLearnedPositionalEmbedding(nn.Module):
     This module learns positional embeddings up to a fixed maximum size.
     """
 
-    def __init__(self, weights):
+    def __init__(self, prefix: str, weights):
         super().__init__()
         self.offset = 2
         self.weight = nn.Parameter(
-            weights.get_tensor("model.decoder.embed_positions.weight")
+            weights.get_tensor(f"{prefix}.decoder.embed_positions.weight")
         )
 
     def forward(
@@ -311,11 +311,11 @@ class OPTAttention(nn.Module):
 
 
 class OPTDecoderLayer(nn.Module):
-    def __init__(self, layer_id: int, config: OPTConfig, weights):
+    def __init__(self, layer_id: int, prefix: str, config: OPTConfig, weights):
         super().__init__()
         self.process_group = weights.process_group
         self.hidden_size = config.hidden_size
-        prefix = f"model.decoder.layers.{layer_id}"
+        prefix = f"{prefix}.decoder.layers.{layer_id}"
         self.self_attn = OPTAttention(
             config,
             prefix=f"{prefix}.self_attn",
@@ -429,7 +429,7 @@ class OPTPreTrainedModel(PreTrainedModel):
 
 
 class OPTDecoder(OPTPreTrainedModel):
-    def __init__(self, config: OPTConfig, weights):
+    def __init__(self, prefix: str, config: OPTConfig, weights):
         super().__init__(config)
         self.dropout = config.dropout
         self.layerdrop = config.layerdrop
@@ -438,20 +438,26 @@ class OPTDecoder(OPTPreTrainedModel):
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = TensorParallelEmbedding(
-            prefix="model.decoder.embed_tokens", weights=weights
+            prefix=f"{prefix}.decoder.embed_tokens", weights=weights
         )
-        self.embed_positions = OPTLearnedPositionalEmbedding(weights)
+        self.embed_positions = OPTLearnedPositionalEmbedding(prefix, weights)
 
         if config.word_embed_proj_dim != config.hidden_size:
             self.project_out = FastLinear.load(
-                config, prefix="model.decoder.project_out", bias=False
+                config,
+                prefix=f"{prefix}.decoder.project_out",
+                weights=weights,
+                bias=False,
             )
         else:
             self.project_out = None
 
         if config.word_embed_proj_dim != config.hidden_size:
             self.project_in = FastLinear.load(
-                config, prefix="model.decoder.project_in", bias=False
+                config,
+                prefix=f"{prefix}.decoder.project_in",
+                weights=weights,
+                bias=False,
             )
         else:
             self.project_in = None
@@ -461,14 +467,14 @@ class OPTDecoder(OPTPreTrainedModel):
         # see https://github.com/facebookresearch/metaseq/pull/164
         if config.do_layer_norm_before and not config._remove_final_layer_norm:
             self.final_layer_norm = nn.LayerNorm.load(
-                prefix="model.decoder.final_layer_norm", weights=weights, eps=EPS
+                prefix=f"{prefix}.decoder.final_layer_norm", weights=weights, eps=EPS
             )
         else:
             self.final_layer_norm = None
 
         self.layers = nn.ModuleList(
             [
-                OPTDecoderLayer(layer_id, config, weights)
+                OPTDecoderLayer(layer_id, prefix, config, weights)
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
@@ -686,9 +692,9 @@ class OPTDecoder(OPTPreTrainedModel):
 
 
 class OPTModel(OPTPreTrainedModel):
-    def __init__(self, config: OPTConfig, weights):
+    def __init__(self, prefix: str, config: OPTConfig, weights):
         super().__init__(config)
-        self.decoder = OPTDecoder(config, weights)
+        self.decoder = OPTDecoder(prefix, config, weights)
         # Initialize weights and apply final processing
 
     def forward(
@@ -743,13 +749,18 @@ class OPTModel(OPTPreTrainedModel):
 
 
 class OPTForCausalLM(OPTPreTrainedModel):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights):
         super().__init__(config)
 
+        if not prefix:  # resolve the weight-name root once; reused for the model and lm_head
+            prefix = "model"
+        else:
+            prefix = f"{prefix}.model"
+
-        self.model = OPTModel(config, weights)
+        self.model = OPTModel(prefix, config, weights)  # OPTModel.__init__ takes prefix as its first argument
 
-        self.lm_head = TensorParallelHead.load(
-            config, prefix="model.decoder.embed_tokens", weights=weights
+        self.lm_head = SpeculativeHead.load(
+            config, prefix=f"{prefix}.decoder.embed_tokens", weights=weights
         )
 
     def forward(
@@ -792,16 +803,19 @@ class OPTForCausalLM(OPTPreTrainedModel):
             return_dict=return_dict,
         )
 
-        logits = self.lm_head(outputs[0]).contiguous()
+        logits, speculative_logits = self.lm_head(outputs.last_hidden_state)
 
         loss = None
 
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
+        return (
+            CausalLMOutputWithPast(
+                loss=loss,
+                logits=logits,
+                past_key_values=outputs.past_key_values,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            ),
+            speculative_logits,
         )
 
     def prepare_inputs_for_generation(
diff --git a/server/text_generation_server/models/custom_modeling/phi_modeling.py b/server/text_generation_server/models/custom_modeling/phi_modeling.py
new file mode 100644
index 00000000..b4d56db1
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/phi_modeling.py
@@ -0,0 +1,336 @@
+# implementation of the PhiModel and PhiForCausalLM classes
+
+import torch
+import torch.distributed
+
+import math
+from torch import nn
+from typing import Optional, List, Tuple, Any
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from text_generation_server.layers import (
+    TensorParallelRowLinear,
+    TensorParallelColumnLinear,
+    TensorParallelEmbedding,
+    SpeculativeHead,
+    FastLinear,
+)
+
+
+# PhiConfig is the configuration class for the PhiModel.
+class PhiConfig(PretrainedConfig):
+    def __init__(
+        self,
+        vocab_size=51200,
+        n_positions=2048,
+        n_embd=2560,
+        n_layer=32,
+        n_inner=None,
+        n_head=32,
+        rotary_dim=32,
+        layer_norm_epsilon=1e-5,
+        tie_word_embeddings=False,
+        pad_vocab_size_multiple=64,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        no_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_inner = n_inner
+        self.n_head = n_head
+        self.rotary_dim = rotary_dim
+
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.tie_word_embeddings = tie_word_embeddings
+        self.pad_vocab_size_multiple = pad_vocab_size_multiple
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.no_bias = no_bias
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+# RotaryEmbedding is a class that implements the rotary embedding.
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_seq_len):
+        super().__init__()
+        inv_freq = [1.0 / 10000.0 ** (i / dim) for i in range(0, dim, 2)]
+        inv_freq_len = len(inv_freq)
+        inv_freq = torch.tensor(inv_freq).view(1, inv_freq_len)
+        t = torch.arange(0, max_seq_len, dtype=torch.float).view(max_seq_len, 1)
+        freqs = t.matmul(inv_freq)
+        self.sin = freqs.sin()
+        self.cos = freqs.cos()
+
+    def apply_rotary_emb_qkv(self, qkv, seqlen_offset):
+        b_size, seqlen, three, _, _headdim = qkv.shape
+        if three != 3:
+            raise Exception("unexpected shape for qkv")
+        _, rotary_dim = self.cos.shape
+        rotary_dim = rotary_dim * 2
+        q_rot = qkv[:, :, 0, :, :rotary_dim]
+        q_pass = qkv[:, :, 0, :, rotary_dim:]
+        k_rot = qkv[:, :, 1, :, :rotary_dim]
+        k_pass = qkv[:, :, 1, :, rotary_dim:]
+        q12 = torch.chunk(q_rot, 2, dim=-1)
+        k12 = torch.chunk(k_rot, 2, dim=-1)
+        q1, q2 = q12[0], q12[1]
+        k1, k2 = k12[0], k12[1]
+        c = self.cos.narrow(0, seqlen_offset, seqlen).unsqueeze(1)
+        s = self.sin.narrow(0, seqlen_offset, seqlen).unsqueeze(1)
+        q_rot = torch.cat(
+            [
+                q1 * c - q2 * s,
+                q1 * s + q2 * c,
+            ],
+            dim=-1,
+        )
+        k_rot = torch.cat(
+            [
+                k1 * c - k2 * s,
+                k1 * s + k2 * c,
+            ],
+            dim=-1,
+        )
+        q = torch.cat([q_rot, q_pass], dim=-1)
+        k = torch.cat([k_rot, k_pass], dim=-1)
+        v = qkv[:, :, 2]
+        return q, k, v
+
+
+# PhiCausalLMHead is the head of the PhiModel. It is a linear layer with a layer norm.
+class PhiCausalLMHead(nn.Module):
+    def __init__(self, config, weights):
+        super().__init__()
+        self.ln = nn.LayerNorm.load(
+            prefix="lm_head.ln",
+            weights=weights,
+            eps=config.layer_norm_epsilon,
+        )
+        self.linear = SpeculativeHead.load(
+            config=config, prefix="lm_head.linear", weights=weights
+        )
+
+    def forward(self, hidden_states):
+        hidden_states = self.ln(hidden_states)
+        hidden_states = self.linear(hidden_states)
+        return hidden_states
+
+
+# PhiMHA is a multi-head attention layer. This layer uses an attention mask to prevent tokens from attending to subsequent tokens.
+class PhiMHA(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.Wqkv = TensorParallelColumnLinear.load(
+            config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias
+        )
+        self.out_proj = TensorParallelRowLinear.load(
+            config,
+            prefix=f"{prefix}.out_proj",
+            weights=weights,
+            bias=not config.no_bias,
+        )
+        self.op_size = config.n_embd
+        self.head_dim = int(config.n_embd / config.n_head)
+        self.num_heads = config.n_head
+        self.rotary_emb = RotaryEmbedding(
+            config.rotary_dim,
+            config.n_positions,
+        )
+        self.softmax_scale = 1.0 / math.sqrt(self.head_dim)
+
+    def forward(self, hidden_states, past_kv_cache, attention_mask=None):
+        """Self-attention over `hidden_states`; returns (projected output, [k, v] cache).
+
+        `past_kv_cache` is None or a (k, v) pair that is extended along dim 1. A causal
+        mask is added whenever `attention_mask` is not None (its contents are not read).
+        """
+        b_size, seq_len, _n_embd = hidden_states.shape
+        qkv = self.Wqkv(hidden_states)
+        qkv = qkv.view(b_size, seq_len, 3, self.num_heads, self.head_dim)
+        seqlen_offset = 0 if past_kv_cache is None else past_kv_cache[0].shape[1]
+        q, k, v = self.rotary_emb.apply_rotary_emb_qkv(qkv, seqlen_offset)
+
+        # if there is a kv_cache, then we need to concatenate
+        if past_kv_cache is not None:
+            prev_k, prev_v = past_kv_cache
+            k = torch.cat([prev_k, k], dim=1)
+            v = torch.cat([prev_v, v], dim=1)
+
+        past_kv_cache = [k, v]
+        attn_weights = torch.einsum("bthd,bshd->bhts", q, k * self.softmax_scale)
+
+        if attention_mask is not None:
+            seqlen_k = k.shape[1]
+            seqlen_q = q.shape[1]
+            causal_mask = torch.triu(
+                torch.full((seqlen_q, seqlen_k), -10000.0, device=attn_weights.device),
+                1 + seqlen_k - seqlen_q,  # shift the diagonal past the cached keys
+            )
+            attn_weights = attn_weights + causal_mask.to(dtype=attn_weights.dtype)
+
+        attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
+        attn_output = attn_weights.matmul(v.transpose(1, 2)).squeeze(0)
+        attn_output = (
+            attn_output.view((b_size, self.num_heads, seq_len, self.head_dim))
+            .transpose(1, 2)
+            .flatten(-2)
+        )
+        return self.out_proj(attn_output), past_kv_cache
+
+
+# PhiMLP is a multi-layer perceptron. It contains two linear layers with a gelu activation function.
+class PhiMLP(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+
+        self.n_inner = config.n_inner
+        self.fc1 = FastLinear.load(
+            config=config,
+            prefix=f"{prefix}.fc1",
+            weights=weights,
+            bias=False,
+        )
+        self.fc2 = FastLinear.load(
+            config=config,
+            prefix=f"{prefix}.fc2",
+            weights=weights,
+            bias=False,
+        )
+        self.activation = torch.nn.functional.gelu
+
+    def forward(self, hidden_states):
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+# PhiBlock is a single transformer block. It contains a layer norm, a multi-head attention layer and an multi-layer perceptron.
+class PhiBlock(nn.Module):
+    def __init__(self, layer_id, config, weights):
+        super().__init__()
+        self.layer_id = layer_id
+        self.layer_norm = nn.LayerNorm.load(
+            prefix=f"{layer_id}.ln", weights=weights, eps=config.layer_norm_epsilon
+        )
+        self.mixer = PhiMHA(prefix=f"{layer_id}.mixer", config=config, weights=weights)
+        self.mlp = PhiMLP(prefix=f"{layer_id}.mlp", config=config, weights=weights)
+
+    def forward(
+        self,
+        hidden_states,
+        kv_cache,
+        attention_mask,
+    ):
+        residual = hidden_states
+        hidden_states = self.layer_norm(hidden_states)
+        attn_outputs, past_kv_cache = self.mixer(
+            hidden_states, kv_cache, attention_mask
+        )
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        out = attn_outputs + feed_forward_hidden_states + residual
+        return out, past_kv_cache
+
+
+# PhiModel implements the embedding layer and the transformer blocks.
+class PhiModel(nn.Module):
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+        self.tp_rank = weights.process_group.rank()
+        self.tp_world_size = weights.process_group.size()
+        self.embed_tokens = TensorParallelEmbedding(
+            prefix=f"{prefix}.embd.wte", weights=weights
+        )
+        self.blocks = nn.ModuleList(
+            [
+                PhiBlock(f"{prefix}.h.{layer_id}", config, weights)
+                for layer_id in range(config.n_layer)
+            ]
+        )
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        return_dict: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+        hidden_states = self.embed_tokens(input_ids)
+        seq_len = hidden_states.shape[1]
+        mask = None if seq_len <= 1 else attention_mask
+
+        past_key_values = (
+            [None] * len(self.blocks) if past_key_values is None else past_key_values
+        )
+
+        for index, block in enumerate(self.blocks):
+            hidden_states, new_key_values = block(
+                hidden_states, past_key_values[index], mask
+            )
+            past_key_values[index] = new_key_values
+
+        return hidden_states, past_key_values
+
+
+# PhiForCausalLM wraps the PhiModel and PhiCausalLMHead together and returns a CausalLMOutputWithPast object.
+class PhiForCausalLM(torch.nn.Module):
+    def __init__(self, prefix: str, config, weights):
+        super().__init__()
+
+        if not prefix:
+            prefix = "transformer"
+        else:
+            prefix = f"{prefix}.transformer"
+
+        self.model = PhiModel(prefix, config, weights)
+        self.lm_head = PhiCausalLMHead(config, weights)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        return_dict: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+        model_output = self.model(
+            input_ids, past_key_values, attention_mask, return_dict, use_cache
+        )
+        logits = self.lm_head(model_output[0])
+
+        loss = None
+        if labels is not None:
+            loss = nn.CrossEntropyLoss()(
+                logits[:, :-1].view(-1, logits.size(-1)), labels[:, 1:].view(-1)
+            )
+
+        if not return_dict:
+            return (
+                ((loss,) + (logits,) + model_output[1:])
+                if loss is not None
+                else (logits,) + model_output[1:]
+            )
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=model_output[1],
+            hidden_states=None,
+            attentions=None,
+        )
diff --git a/server/text_generation_server/models/custom_modeling/siglip.py b/server/text_generation_server/models/custom_modeling/siglip.py
new file mode 100644
index 00000000..5fbc6d29
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/siglip.py
@@ -0,0 +1,436 @@
+from typing import Optional, Tuple, Union
+
+import math
+import torch
+from torch import nn
+
+from transformers.activations import ACT2FN
+from transformers.modeling_attn_mask_utils import (
+    _create_4d_causal_attention_mask,
+    _prepare_4d_attention_mask,
+)
+from transformers.modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    ImageClassifierOutput,
+)
+from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
+
+from text_generation_server.layers.tensor_parallel import (
+    TensorParallelEmbedding,
+    TensorParallelColumnLinear,
+    TensorParallelRowLinear,
+)
+
+
+class SiglipVisionEmbeddings(nn.Module):
+    def __init__(self, prefix, config: SiglipVisionConfig, weights):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+        self.patch_embedding.weight = nn.Parameter(
+            weights.get_tensor(f"{prefix}.patch_embedding.weight"), requires_grad=False
+        )
+        self.patch_embedding.bias = nn.Parameter(
+            weights.get_tensor(f"{prefix}.patch_embedding.bias"), requires_grad=False
+        )
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.position_embedding = TensorParallelEmbedding(
+            prefix=f"{prefix}.position_embedding", weights=weights
+        )
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions, device=weights.device).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        patch_embeds = self.patch_embedding(
+            pixel_values
+        )  # shape = [*, width, grid, grid]
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+
+
+class SiglipAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.head_size = self.head_dim
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.num_heads = self.num_heads // weights.process_group.size()
+        self.embed_dim = self.embed_dim // weights.process_group.size()
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = TensorParallelColumnLinear.load(
+            config, prefix=f"{prefix}.k_proj", weights=weights, bias=True
+        )
+        self.v_proj = TensorParallelColumnLinear.load(
+            config, prefix=f"{prefix}.v_proj", weights=weights, bias=True
+        )
+        self.q_proj = TensorParallelColumnLinear.load(
+            config, prefix=f"{prefix}.q_proj", weights=weights, bias=True
+        )
+        self.out_proj = TensorParallelRowLinear.load(
+            config, prefix=f"{prefix}.out_proj", weights=weights, bias=True
+        )
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return (
+            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        # scale post matmul
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) * self.scale
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + attention_mask
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(attn_weights.dtype)
+        attn_weights = nn.functional.dropout(
+            attn_weights, p=self.dropout, training=self.training
+        )
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_size):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_size)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_size)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class SiglipMLP(nn.Module):
+    def __init__(self, prefix, config, weights):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = TensorParallelColumnLinear.load(  # config.hidden_size, config.intermediate_size
+            prefix=f"{prefix}.fc1", config=config, weights=weights, bias=True
+        )
+        self.fc2 = TensorParallelRowLinear.load(  # config.intermediate_size, config.hidden_size
+            prefix=f"{prefix}.fc2", config=config, weights=weights, bias=True
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class SiglipEncoderLayer(nn.Module):
+    def __init__(self, prefix, config: SiglipConfig, weights):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = SiglipAttention(
+            prefix=f"{prefix}.self_attn", config=config, weights=weights
+        )
+        self.layer_norm1 = nn.LayerNorm.load(
+            prefix=f"{prefix}.layer_norm1", weights=weights, eps=config.layer_norm_eps
+        )
+        self.mlp = SiglipMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
+        self.layer_norm2 = nn.LayerNorm.load(
+            prefix=f"{prefix}.layer_norm2", weights=weights, eps=config.layer_norm_eps
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> Tuple[torch.FloatTensor]:
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states, None
+
+
+class SiglipMultiheadAttentionPoolingHead(nn.Module):
+    """Multihead Attention Pooling."""
+
+    def __init__(self, prefix, config: SiglipVisionConfig, weights):
+        super().__init__()
+
+        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.attention = torch.nn.MultiheadAttention(
+            config.hidden_size, config.num_attention_heads, batch_first=True
+        )
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(prefix, config, weights)
+
+    def forward(self, hidden_state):
+        batch_size = hidden_state.shape[0]
+        probe = self.probe.repeat(batch_size, 1, 1)
+
+        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
+
+        residual = hidden_state
+        hidden_state = self.layernorm(hidden_state)
+        hidden_state = residual + self.mlp(hidden_state)
+
+        return hidden_state[:, 0]
+
+
+import warnings
+
+
+def _trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    tensor.erfinv_()
+
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+
+    # Clamp to ensure it's in the proper range
+    tensor.clamp_(min=a, max=b)
+
+
+def trunc_normal_tf_(
+    tensor: torch.Tensor,
+    mean: float = 0.0,
+    std: float = 1.0,
+    a: float = -2.0,
+    b: float = 2.0,
+) -> torch.Tensor:
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \\leq \\text{mean} \\leq b`.
+
+    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsequently scaled and shifted by the mean and std args.
+
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    """
+    with torch.no_grad():
+        _trunc_normal_(tensor, 0, 1.0, a, b)
+        tensor.mul_(std).add_(mean)
+
+
+from torch.nn.init import _calculate_fan_in_and_fan_out
+
+
+def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
+    """Fill `tensor` in place with values of variance `scale / fan`.
+
+    Raises ValueError when `mode` or `distribution` is not a supported name.
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+    fans = {"fan_in": fan_in, "fan_out": fan_out, "fan_avg": (fan_in + fan_out) / 2}
+    if mode not in fans:  # previously fell through and hit an unbound `denom`
+        raise ValueError(f"invalid mode {mode}")
+    variance = scale / fans[mode]
+
+    if distribution == "truncated_normal":
+        # constant is stddev of standard normal truncated to (-2, 2)
+        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
+    elif distribution == "normal":
+        with torch.no_grad():
+            tensor.normal_(std=math.sqrt(variance))
+    elif distribution == "uniform":
+        bound = math.sqrt(3 * variance)
+        with torch.no_grad():
+            tensor.uniform_(-bound, bound)
+    else:
+        raise ValueError(f"invalid distribution {distribution}")
+
+
+def lecun_normal_(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+
+
+def default_flax_embed_init(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="normal")
+
+
+from transformers import PreTrainedModel
+
+
+class SiglipEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`SiglipEncoderLayer`].
+
+    Args:
+        config: SiglipConfig
+    """
+
+    def __init__(self, prefix, config: SiglipConfig, weights):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                SiglipEncoderLayer(
+                    prefix=f"{prefix}.layers.{i}", config=config, weights=weights
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+
+        hidden_states = inputs_embeds
+        for idx, encoder_layer in enumerate(self.layers):
+            hidden_states, _ = encoder_layer(
+                hidden_states,
+                attention_mask,
+            )
+
+        return hidden_states
+
+
+class SiglipVisionTransformer(nn.Module):
+    def __init__(self, prefix, config: SiglipVisionConfig, weights):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipVisionEmbeddings(
+            prefix=f"{prefix}.embeddings", config=config, weights=weights
+        )
+        self.encoder = SiglipEncoder(
+            prefix=f"{prefix}.encoder", config=config, weights=weights
+        )
+        self.post_layernorm = nn.LayerNorm.load(
+            prefix=f"{prefix}.post_layernorm",
+            weights=weights,
+            eps=config.layer_norm_eps,
+        )
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+    ):
+        r"""
+        Returns:
+
+        """
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values)
+
+        # NOTE: up until this point, the code logits are exactly
+        # the same as the transformers code. The values evaluate
+        # slightly differently in our encoder layer.
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+        )
+        last_hidden_state = encoder_outputs
+        post_last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=post_last_hidden_state,
+            # pooler_output=pooled_output,
+            # hidden_states=encoder_outputs,
+        )
diff --git a/server/text_generation_server/models/custom_modeling/t5_modeling.py b/server/text_generation_server/models/custom_modeling/t5_modeling.py
index 793f3a66..0b899fba 100644
--- a/server/text_generation_server/models/custom_modeling/t5_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/t5_modeling.py
@@ -38,11 +38,11 @@ from transformers.utils import (
     is_torch_fx_proxy,
 )
 from transformers import T5Config
-from text_generation_server.utils.layers import (
+from text_generation_server.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
 )
 
 
@@ -1032,9 +1032,17 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
             embed_tokens=self.shared,
         )
 
-        self.lm_head = TensorParallelHead.load(
-            config, prefix="lm_head", weights=weights
-        )
+        try:
+            self.lm_head = SpeculativeHead.load(
+                config, prefix="lm_head", weights=weights
+            )
+        except RuntimeError:
+            # Some models like t5-small were saved with shared weights unlike flan
+            # Since they are declared as the same arch we have no choice but hope
+            # that this is OK instead of using a proper flag.
+            self.lm_head = SpeculativeHead.load(
+                config, prefix="shared", weights=weights
+            )
 
     def forward(
         self,
@@ -1118,7 +1126,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
             sequence_output = sequence_output * (self.model_dim**-0.5)
 
-        lm_logits = self.lm_head(sequence_output)
+        lm_logits, speculative_logits = self.lm_head(sequence_output)
 
         loss = None
         if labels is not None:
@@ -1132,16 +1140,19 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
             output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
             return ((loss,) + output) if loss is not None else output
 
-        return Seq2SeqLMOutput(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=decoder_outputs.past_key_values,
-            decoder_hidden_states=decoder_outputs.hidden_states,
-            decoder_attentions=decoder_outputs.attentions,
-            cross_attentions=decoder_outputs.cross_attentions,
-            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
-            encoder_hidden_states=encoder_outputs.hidden_states,
-            encoder_attentions=encoder_outputs.attentions,
+        return (
+            Seq2SeqLMOutput(
+                loss=loss,
+                logits=lm_logits,
+                past_key_values=decoder_outputs.past_key_values,
+                decoder_hidden_states=decoder_outputs.hidden_states,
+                decoder_attentions=decoder_outputs.attentions,
+                cross_attentions=decoder_outputs.cross_attentions,
+                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+                encoder_hidden_states=encoder_outputs.hidden_states,
+                encoder_attentions=encoder_outputs.attentions,
+            ),
+            speculative_logits,
         )
 
     def prepare_inputs_for_generation(
diff --git a/server/text_generation_server/models/custom_modeling/vlm.py b/server/text_generation_server/models/custom_modeling/vlm.py
new file mode 100644
index 00000000..b74b43ff
--- /dev/null
+++ b/server/text_generation_server/models/custom_modeling/vlm.py
@@ -0,0 +1,48 @@
+def load_text_model(prefix, config, weights, name=None):
+    """Instantiate the flash text model selected by `config.model_type`.
+
+    `name` is forwarded to the Mistral model only; an unhandled model type
+    raises RuntimeError.
+    """
+    model_type = config.model_type
+    # Every import is local to its branch, so only the chosen module is imported.
+    if model_type == "llama":
+        from text_generation_server.models.custom_modeling.flash_llama_modeling import (
+            FlashLlamaForCausalLM,
+        )
+        return FlashLlamaForCausalLM(prefix, config, weights)
+    if model_type == "mistral":
+        from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
+            FlashMistralForCausalLM,
+        )
+        return FlashMistralForCausalLM(prefix, config, weights, name=name)
+    if model_type in ("gemma", "paligemma"):
+        from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
+            FlashGemmaForCausalLM,
+        )
+        if model_type == "gemma":
+            # Plain Gemma is the only variant built with causal=False here.
+            return FlashGemmaForCausalLM(prefix, config, weights, causal=False)
+        return FlashGemmaForCausalLM(prefix, config, weights)
+    raise RuntimeError(f"Unsupported model type {config.model_type}")
+
+
+def load_vision_model(prefix, config, weights):
+    """Instantiate the vision model selected by `config.model_type`; raise RuntimeError otherwise."""
+    if config.model_type == "clip_vision_model":
+        from text_generation_server.models.custom_modeling.clip import (
+            CLIPVisionTransformer,
+        )
+        return CLIPVisionTransformer(
+            prefix=f"{prefix}.vision_model", config=config, weights=weights
+        )
+    elif config.model_type == "siglip_vision_model":
+        from text_generation_server.models.custom_modeling.siglip import (
+            SiglipVisionTransformer,
+        )
+        # NOTE(review): `prefix` is ignored here and the path is hard-coded; confirm this is intended.
+        return SiglipVisionTransformer(
+            prefix="vision_tower.vision_model", config=config, weights=weights
+        )
+    else:
+        raise RuntimeError(f"Unsupported model type {config.model_type}")
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index d6af07f4..e66011a1 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -1,116 +1,78 @@
 import math
+import os
+import time
 import itertools
-from text_generation_server.utils.tokens import batch_top_tokens
 import torch
 import torch.distributed
 
 import numpy as np
 
+from loguru import logger
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import PreTrainedTokenizerBase
-from typing import Optional, Tuple, List, Type, Union, Dict
+from transformers import (
+    PreTrainedTokenizerBase,
+    AutoConfig,
+    AutoTokenizer,
+    GenerationConfig,
+)
+from typing import Iterable, Optional, Tuple, List, Type, Dict
 
+from text_generation_server.adapters import AdapterBatchData, AdapterBatchMetadata
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from text_generation_server.utils.chunks import concat_text_chunks
+from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models import Model
+from text_generation_server.utils.tokens import batch_top_tokens
+from text_generation_server.utils.dist import RANK
+from text_generation_server.utils.speculate import get_speculate
+from text_generation_server.utils import (
+    initialize_torch_distributed,
+    weight_files,
+    Weights,
+    hub,
+)
 from text_generation_server.models.types import (
     Batch,
-    PrefillTokens,
+    Tokens,
     Generation,
     GeneratedText,
-    TopTokens,
 )
 from text_generation_server.pb import generate_pb2
+from text_generation_server.models.globals import (
+    MEM_POOL,
+    FLASH_DECODING,
+    BLOCK_SIZE,
+    CUDA_GRAPHS,
+    get_adapter_to_index,
+    MODEL_ID,
+)
+from text_generation_server.layers.attention import Seqlen
 from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser
 from text_generation_server.utils.dist import MEMORY_FRACTION
+from text_generation_server.utils.segments import SegmentConcatBuilder, find_segments
+
+from text_generation_server.utils.import_utils import (
+    empty_cache,
+    synchronize,
+    get_free_memory,
+)
 
 tracer = trace.get_tracer(__name__)
 
-BLOCK_SIZE = 16
-# Will be set in warmup
-CACHE_MANAGER: Optional["CacheManager"] = None
+
+# Will be set in init
+SLIDING_WINDOW: Optional[int] = None
 
 
-class CacheManager:
-    def __init__(
-        self,
-        num_blocks: int,
-        num_layers: int,
-        num_heads: int,
-        head_size: int,
-        dtype: torch.dtype,
-        device: torch.device,
-    ):
-        self.block_size = BLOCK_SIZE
-        self.num_blocks = num_blocks
+def set_sliding_window(sliding_window: int):
+    global SLIDING_WINDOW  # rebind the module-level value instead of creating a local
+    SLIDING_WINDOW = sliding_window
 
-        element_size = torch.tensor([], dtype=dtype).element_size()
-        x = self.block_size // element_size
 
-        self.kv_cache = [
-            (
-                torch.empty(
-                    (num_blocks, num_heads, head_size // x, self.block_size, x),
-                    dtype=dtype,
-                    device=device,
-                ),
-                torch.empty(
-                    (num_blocks, num_heads, head_size, self.block_size),
-                    dtype=dtype,
-                    device=device,
-                ),
-            )
-            for _ in range(num_layers)
-        ]
-        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32, device="cpu")
-        self.slots = torch.arange(
-            0, num_blocks * self.block_size, dtype=torch.int32
-        ).view(num_blocks, self.block_size)
-
-    def allocate(self, batch: "FlashCausalLMBatch"):
-        # Get free blocks indices by finding values in mask that are not set to 0
-        free_block_indices = self.free_block_mask.nonzero()
-        assert (
-            len(free_block_indices) >= batch.blocks
-        ), f"Out of available cache blocks: asked {batch.blocks}, only {len(free_block_indices)} free blocks"
-
-        # Slice by the number of required blocks
-        block_indices = free_block_indices[: batch.blocks]
-        block_indices = block_indices.flatten()
-
-        # Padded block tables
-        block_tables_tensor = torch.zeros(
-            (len(batch), batch.max_blocks), dtype=torch.int32
-        )
-
-        # Allocate paged attention blocks
-        cumulative_blocks = 0
-        slots = []
-        block_tables = []
-        for i, (needed_blocks, needed_slots) in enumerate(batch.needed_blocks_slots):
-            # Get allocated blocks for this sequence
-            allocated_blocks = block_indices[
-                cumulative_blocks : cumulative_blocks + needed_blocks
-            ]
-            # Get slots for the allocated blocks
-            allocated_slots = self.slots[allocated_blocks].flatten()[:needed_slots]
-
-            slots.append(allocated_slots)
-            block_tables.append(allocated_blocks.tolist())
-            block_tables_tensor[i, :needed_blocks] = allocated_blocks
-            cumulative_blocks += needed_blocks
-
-        batch.needed_blocks_slots = None
-        batch.block_tables = block_tables
-        batch.block_tables_tensor = block_tables_tensor.to(batch.input_ids.device)
-        batch.slots = torch.concat(slots).to(batch.input_ids.device)
-
-        # Allocate the required number of blocks by setting the mask to 0
-        self.free_block_mask[block_indices] = 0
-
-    def free(self, block_indices: Optional[List[int]]):
-        if block_indices is not None and block_indices:
-            # Reset mask
-            self.free_block_mask[block_indices] = 1
+def get_sliding_windows() -> Optional[int]:
+    # Returns the module-level SLIDING_WINDOW, which stays None until set_sliding_window() runs.
+    return SLIDING_WINDOW
 
 
 @dataclass
@@ -123,11 +85,15 @@ class FlashCausalLMBatch(Batch):
     # Decoder values
     input_ids: torch.Tensor
     position_ids: torch.Tensor
+    speculative_ids: Optional[torch.Tensor]
 
     # Flash Attention values
 
     # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill
     cu_seqlen_prefill: Optional[torch.Tensor]
+    # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers
+    # as we only keep SLIDING_WINDOW values instead of the whole tensor
+    prefill_cache_indices: Optional[torch.Tensor]
 
     # Paged Attention values
 
@@ -136,16 +102,13 @@ class FlashCausalLMBatch(Batch):
     start_slots: torch.Tensor
     # tensor of indices of the currently used slots, length = \sum_{i=0}^{b} s_i in prefill, length = b in decode
     slot_indices: torch.Tensor
-    # List of tuple of ints representing the number of blocks and slots needed by each sequence
-    needed_blocks_slots: Optional[List[Tuple[int, int]]]
 
-    # Set in prefill by the CacheManager
     # list of length b of list of length s_i // block_size
-    block_tables: Optional[List[List[int]]]
-    # tensor of size [b, max_seqlen // block_size] holding the paged attention block tables for all sequences
-    block_tables_tensor: Optional[torch.Tensor]
+    block_tables: List[List[int]]
+    # tensor of size [b, max_total_seqlen // block_size] holding the paged attention block tables for all sequences
+    block_tables_tensor: torch.Tensor
     # tensor of length \sum_{i=0}^{b} max_s_i  holding the paged attention slots for all sequences
-    slots: Optional[torch.Tensor]
+    slots: torch.Tensor
 
     max_seqlen: int
 
@@ -170,8 +133,11 @@ class FlashCausalLMBatch(Batch):
     top_n_tokens: List[int]
     top_n_tokens_tensor: torch.Tensor
 
+    # Adapter metadata for each request
+    adapter_meta: AdapterBatchMetadata
+
     # Number of blocks in this batch
-    blocks: int
+    num_blocks: int
     # Maximum number of blocks
     max_blocks: int
 
@@ -180,32 +146,39 @@ class FlashCausalLMBatch(Batch):
             id=self.batch_id,
             request_ids=[r.id for r in self.requests],
             size=len(self),
-            max_tokens=self.blocks * BLOCK_SIZE,
+            max_tokens=self.num_blocks * BLOCK_SIZE,
         )
 
     @classmethod
-    def from_pb(
-        cls,
-        pb: generate_pb2.Batch,
-        tokenizer: PreTrainedTokenizerBase,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> "FlashCausalLMBatch":
+    def batch_tokenized_inputs(
+        cls, requests: Iterable[generate_pb2.Request], tokenizer
+    ):
         batch_inputs = []
         max_truncation = 0
-        for r in pb.requests:
-            batch_inputs.append(r.inputs)
+        for r in requests:  # one text per request; the largest `truncate` becomes max_length
+            batch_inputs.append(concat_text_chunks(r.input_chunks.chunks))
             max_truncation = max(max_truncation, r.truncate)
 
         batch_tokenized_inputs = tokenizer(
             batch_inputs, truncation=True, max_length=max_truncation
         )["input_ids"]
+        return batch_tokenized_inputs  # the tokenizer's "input_ids" for the whole batch
 
+    @classmethod
+    def from_tokenized(
+        cls,
+        pb: generate_pb2.Batch,
+        tokenizer: PreTrainedTokenizerBase,
+        batch_tokenized_inputs,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> "FlashCausalLMBatch":
+        sliding_window = get_sliding_windows()
         position_ids = []
         cu_seqlen_prefill = [0]
-        needed_blocks_slots = []
         start_slots = []
         slot_indices = []
+        prefill_cache_indices = []
 
         input_lengths = []
         prefix_offsets = []
@@ -223,16 +196,22 @@ class FlashCausalLMBatch(Batch):
         stopping_criterias = []
         top_n_tokens = []
 
+        adapter_indices_list = []
+        adapter_set = set()
+
         # Cumulative length
         cumulative_length = 0
         cumulative_max_length = 0
         prefill_out_cumulative_length = 0
 
-        blocks = 0
+        num_blocks = 0
         max_seqlen = 0
         max_length = 0
         max_blocks = 0
 
+        block_tables = []
+        slots = []
+
         # Parse batch
         for i, (r, tokenized_input) in enumerate(
             zip(pb.requests, batch_tokenized_inputs)
@@ -241,6 +220,11 @@ class FlashCausalLMBatch(Batch):
             requests_idx_mapping[r.id] = i
 
             tokenized_input = tokenized_input[-r.truncate :]
+            # Drop a duplicated leading BOS token. Slicing (instead of indexing [0]/[1])
+            # keeps inputs shorter than two tokens from raising IndexError.
+            bos = tokenizer.bos_token_id
+            if list(tokenized_input[:2]) == [bos, bos]:
+                tokenized_input = tokenized_input[1:]
 
             input_length = len(tokenized_input)
             input_lengths.append(input_length)
@@ -266,12 +250,35 @@ class FlashCausalLMBatch(Batch):
             stopping_criterias.append(stopping_criteria)
             top_n_tokens.append(r.top_n_tokens)
 
+            ADAPTER_TO_INDEX = get_adapter_to_index()
+            adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0)
+            adapter_indices_list.append(torch.full((input_length,), adapter_index))
+            adapter_set.add(adapter_index)
+
             # Paged attention
             # Remove one as the first token des not have a past
-            total_tokens = input_length + max_new_tokens - 1
-            needed_blocks = math.ceil(total_tokens / BLOCK_SIZE)
-            blocks += needed_blocks
-            needed_blocks_slots.append((needed_blocks, total_tokens))
+            speculative_length = get_speculate()
+            speculative_length = 0 if speculative_length is None else speculative_length
+            total_tokens = input_length + max_new_tokens - 1 + speculative_length
+
+            # blocks and slots can be empty (for example in warmup)
+            if not r.blocks:
+                needed_blocks = math.ceil(total_tokens / BLOCK_SIZE)
+                request_blocks = [
+                    b for b in range(num_blocks, num_blocks + needed_blocks)
+                ]
+                request_slots = [
+                    s
+                    for b in request_blocks
+                    for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE)
+                ]
+            else:
+                request_blocks = r.blocks
+                request_slots = r.slots
+
+            block_tables.append(request_blocks)
+            slots.extend(request_slots[:total_tokens])
+            num_blocks += len(request_blocks)
             start_slots.append(cumulative_max_length)
 
             request_slot_indices = torch.arange(
@@ -281,6 +288,15 @@ class FlashCausalLMBatch(Batch):
             )
             slot_indices.append(request_slot_indices)
 
+            # Create tensor to slice into the kv tensor in prefill
+            if sliding_window is not None:
+                request_prefill_cache_indices = torch.arange(
+                    cumulative_length + max(0, input_length - sliding_window),
+                    cumulative_length + input_length,
+                    dtype=torch.int64,
+                )
+                prefill_cache_indices.append(request_prefill_cache_indices)
+
             all_prefill_logprobs = all_prefill_logprobs and r.prefill_logprobs
             no_prefill_logprobs = no_prefill_logprobs and not r.prefill_logprobs
 
@@ -305,11 +321,17 @@ class FlashCausalLMBatch(Batch):
             cumulative_length += input_length
             cumulative_max_length += total_tokens
             max_seqlen = max(max_seqlen, input_length)
-            max_blocks = max(max_blocks, needed_blocks)
-            max_length = max(max_length, input_length + max_new_tokens)
+            max_blocks = max(max_blocks, len(request_blocks))
+            max_length = max(
+                max_length, input_length + max_new_tokens + speculative_length
+            )
+
+        adapter_indices = torch.cat(adapter_indices_list).to(
+            dtype=torch.int64, device=device
+        )
 
         next_token_chooser = HeterogeneousNextTokenChooser.from_pb(
-            next_token_chooser_parameters, dtype, device
+            next_token_chooser_parameters, dtype, device, tokenizer
         )
         start_slots = torch.tensor(start_slots, dtype=torch.int64)
 
@@ -329,22 +351,33 @@ class FlashCausalLMBatch(Batch):
             input_ids = np.concatenate(all_input_ids, dtype=np.int64)
             position_ids = torch.cat(position_ids)
             slot_indices = torch.cat(slot_indices)
+            if sliding_window is not None:
+                prefill_cache_indices = torch.cat(prefill_cache_indices)
         else:
             input_ids = all_input_ids[0]
             position_ids = position_ids[0]
             slot_indices = slot_indices[0]
+            if sliding_window is not None:
+                prefill_cache_indices = prefill_cache_indices[0]
 
         cu_seqlen_prefill = torch.tensor(
             cu_seqlen_prefill, device=device, dtype=torch.int32
         )
-
         position_ids = position_ids.to(device)
         slot_indices = slot_indices.to(device)
+        prefill_cache_indices = (
+            prefill_cache_indices.to(device) if sliding_window is not None else None
+        )
         input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)
         input_lengths_tensor = torch.tensor(
             input_lengths, dtype=torch.int32, device=device
         )
 
+        adapter_segments, adapter_segment_indices = find_segments(adapter_indices)
+        adapter_segments = torch.tensor(
+            adapter_segments, dtype=torch.int32, device=device
+        )
+
         if all_prefill_logprobs:
             prefill_head_indices = None
             prefill_next_token_indices = cu_seqlen_prefill[1:] - 1
@@ -362,6 +395,14 @@ class FlashCausalLMBatch(Batch):
             top_n_tokens, device=device, dtype=torch.int64
         )
 
+        slots = torch.tensor(slots, dtype=torch.int64, device=device)
+        block_tables_tensor = torch.zeros(
+            (len(block_tables), max_blocks), dtype=torch.int32, device="cpu"
+        )
+        for i, request_blocks in enumerate(block_tables):
+            block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks)
+        block_tables_tensor = block_tables_tensor.to(device)
+
         return cls(
             batch_id=pb.id,
             requests=pb.requests,
@@ -369,12 +410,12 @@ class FlashCausalLMBatch(Batch):
             input_ids=input_ids,
             position_ids=position_ids,
             cu_seqlen_prefill=cu_seqlen_prefill,
+            prefill_cache_indices=prefill_cache_indices,
             start_slots=start_slots,
             slot_indices=slot_indices,
-            needed_blocks_slots=needed_blocks_slots,
-            block_tables=None,
-            block_tables_tensor=None,
-            slots=None,
+            block_tables=block_tables,
+            block_tables_tensor=block_tables_tensor,
+            slots=slots,
             max_seqlen=max_seqlen,
             prefill_head_indices=prefill_head_indices,
             prefill_next_token_indices=prefill_next_token_indices,
@@ -389,10 +430,28 @@ class FlashCausalLMBatch(Batch):
             stopping_criterias=stopping_criterias,
             top_n_tokens=top_n_tokens,
             top_n_tokens_tensor=top_n_tokens_tensor,
-            blocks=blocks,
+            num_blocks=num_blocks,
             max_blocks=max_blocks,
+            adapter_meta=AdapterBatchMetadata(
+                adapter_indices=adapter_indices,
+                adapter_set=adapter_set,
+                adapter_segments=adapter_segments,
+                segment_indices=adapter_segment_indices,
+            ),
+            speculative_ids=None,
         )
 
+    @classmethod
+    def from_pb(
+        cls,
+        pb: generate_pb2.Batch,
+        tokenizer: PreTrainedTokenizerBase,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> "FlashCausalLMBatch":
+        batch_tokenized_inputs = cls.batch_tokenized_inputs(pb.requests, tokenizer)
+        return cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device)
+
     @tracer.start_as_current_span("filter")
     def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch":
         if len(request_ids) == 0:
@@ -429,8 +488,9 @@ class FlashCausalLMBatch(Batch):
 
         stopping_criterias = []
         top_n_tokens = []
+        adapter_set = set()
 
-        blocks = 0
+        num_blocks = 0
         max_blocks = 0
         # Cumulative length
         cumulative_max_length = 0
@@ -457,12 +517,16 @@ class FlashCausalLMBatch(Batch):
 
             top_n_tokens.append(self.top_n_tokens[idx])
 
+            ADAPTER_TO_INDEX = get_adapter_to_index()
+            adapter_index = ADAPTER_TO_INDEX.get(self.requests[idx].adapter_id, 0)
+            adapter_set.add(adapter_index)
+
             remaining_tokens = (
                 stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
             )
 
             request_block_table = self.block_tables[idx]
-            blocks += len(request_block_table)
+            num_blocks += len(request_block_table)
             block_tables.append(request_block_table)
             start_slots.append(cumulative_max_length)
 
@@ -481,43 +545,40 @@ class FlashCausalLMBatch(Batch):
 
             max_blocks = max(max_blocks, len(request_block_table))
 
-        global CACHE_MANAGER
-        block_indices_to_free = []
-        # Iterate on all requests
-        for i, r in enumerate(self.requests):
-            # Filter requests that are not part of the new batch
-            if r.id not in requests_idx_mapping.keys():
-                block_indices_to_free.extend(self.block_tables[i])
-        # Free blocks
-        CACHE_MANAGER.free(block_indices_to_free)
-        # Needed to avoid dropping blocks when the batches will go out of scope
-        self.block_tables = None
-
         # Index into tensors
         input_ids = self.input_ids[indices]
         position_ids = self.position_ids[indices]
+        adapter_indices = self.adapter_meta.adapter_indices[indices]
         all_input_ids_tensor = self.all_input_ids_tensor[indices]
         block_tables_tensor = self.block_tables_tensor[indices]
         input_lengths_tensor = self.input_lengths_tensor[indices]
         slots = self.slots[slot_filtering_indices]
         next_token_chooser = self.next_token_chooser.filter(indices)
         top_n_tokens_tensor = self.top_n_tokens_tensor[indices]
+        speculative_ids = (
+            self.speculative_ids[indices] if self.speculative_ids is not None else None
+        )
 
         start_slots = torch.tensor(start_slots, dtype=torch.int64)
 
         # Move to GPU now that we have the whole tensor
         slot_indices = slot_indices.to(device)
 
-        return FlashCausalLMBatch(
+        adapter_segments, adapter_segment_indices = find_segments(adapter_indices)
+        adapter_segments = torch.tensor(
+            adapter_segments, dtype=torch.int32, device=device
+        )
+
+        return type(self)(
             batch_id=self.batch_id,
             requests=requests,
             requests_idx_mapping=requests_idx_mapping,
             input_ids=input_ids,
             position_ids=position_ids,
             cu_seqlen_prefill=None,
+            prefill_cache_indices=None,
             start_slots=start_slots,
             slot_indices=slot_indices,
-            needed_blocks_slots=None,
             block_tables=block_tables,
             block_tables_tensor=block_tables_tensor,
             slots=slots,
@@ -535,8 +596,15 @@ class FlashCausalLMBatch(Batch):
             stopping_criterias=stopping_criterias,
             top_n_tokens=top_n_tokens,
             top_n_tokens_tensor=top_n_tokens_tensor,
-            blocks=blocks,
+            num_blocks=num_blocks,
             max_blocks=max_blocks,
+            speculative_ids=speculative_ids,
+            adapter_meta=AdapterBatchMetadata(
+                adapter_indices=adapter_indices,
+                adapter_set=adapter_set,
+                adapter_segments=adapter_segments,
+                segment_indices=adapter_segment_indices,
+            ),
         )
 
     @classmethod
@@ -546,7 +614,7 @@ class FlashCausalLMBatch(Batch):
         requests = []
         requests_idx_mapping = {}
 
-        blocks = 0
+        num_blocks = 0
         total_batch_size = 0
         total_slots = 0
         max_blocks = 0
@@ -555,7 +623,10 @@ class FlashCausalLMBatch(Batch):
         for b in batches:
             total_batch_size += len(b)
             total_slots += len(b.slots)
-            blocks += b.blocks
+            num_blocks += b.num_blocks
+            speculative_length = (
+                b.speculative_ids.shape[1] if b.speculative_ids is not None else 0
+            )
             max_blocks = max(max_blocks, b.max_blocks)
             max_seqlen = max(max_seqlen, b.max_seqlen)
             max_length = max(
@@ -563,6 +634,7 @@ class FlashCausalLMBatch(Batch):
                 max(
                     input_length
                     + stopping_criteria.max_new_tokens
+                    + speculative_length
                     - stopping_criteria.current_tokens
                     for input_length, stopping_criteria in zip(
                         b.input_lengths, b.stopping_criterias
@@ -586,6 +658,14 @@ class FlashCausalLMBatch(Batch):
         top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
             total_batch_size,
         )
+        total_indices_size = sum(
+            b.adapter_meta.adapter_indices.shape[0] for b in batches
+        )
+        adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty(
+            total_indices_size
+        )
+        adapter_set = set()
+        adapter_segment_builder = SegmentConcatBuilder()
 
         start_slots = []
         block_tables = []
@@ -596,12 +676,14 @@ class FlashCausalLMBatch(Batch):
         read_offsets = []
 
         next_token_chooser_parameters = []
+        fsm_grammar_states = []
         stopping_criterias = []
         top_n_tokens = []
 
         # Cumulative length
         cumulative_batch_size = 0
         cumulative_slots = 0
+        cumulative_adapter_indices_size = 0
 
         for i, batch in enumerate(batches):
             requests.extend(batch.requests)
@@ -626,6 +708,21 @@ class FlashCausalLMBatch(Batch):
             top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor
             slots[slots_start_index:slots_end_index] = batch.slots
 
+            # Copy over adapter indices
+            adapter_start_index = cumulative_adapter_indices_size
+            adapter_end_index = (
+                cumulative_adapter_indices_size
+                + batch.adapter_meta.adapter_indices.shape[0]
+            )
+            adapter_indices[adapter_start_index:adapter_end_index] = (
+                batch.adapter_meta.adapter_indices
+            )
+            cumulative_adapter_indices_size = adapter_end_index
+            adapter_set.update(batch.adapter_meta.adapter_set)
+            adapter_segment_builder.concat(
+                batch.adapter_meta.adapter_segments, batch.adapter_meta.segment_indices
+            )
+
             all_input_ids_tensor[
                 start_index:end_index, : batch.all_input_ids_tensor.shape[1]
             ] = batch.all_input_ids_tensor[:, :max_length]
@@ -644,6 +741,7 @@ class FlashCausalLMBatch(Batch):
             read_offsets.extend(batch.read_offsets)
 
             next_token_chooser_parameters.extend([r.parameters for r in batch.requests])
+            fsm_grammar_states.extend(batch.next_token_chooser.fsm_grammar_states)
             stopping_criterias.extend(batch.stopping_criterias)
 
             top_n_tokens.extend(batch.top_n_tokens)
@@ -658,23 +756,28 @@ class FlashCausalLMBatch(Batch):
             next_token_chooser_parameters,
             dtype=batches[0].next_token_chooser.dtype,
             device=batches[0].next_token_chooser.device,
+            tokenizer=batches[0].next_token_chooser.tokenizer,
+            fsm_grammar_states=fsm_grammar_states,
         )
 
-        # Needed to avoid dropping blocks when the batches will go out of scope
-        for b in batches:
-            b.block_tables = None
-            del b
+        speculative_ids = (
+            torch.cat([b.speculative_ids for b in batches], dim=0)
+            if batches[0].speculative_ids is not None
+            else None
+        )
 
-        return FlashCausalLMBatch(
+        adapter_segments, adapter_segment_indices = adapter_segment_builder.build()
+
+        return cls(
             batch_id=batches[0].batch_id,
             requests=requests,
             requests_idx_mapping=requests_idx_mapping,
             input_ids=input_ids,
             position_ids=position_ids,
             cu_seqlen_prefill=None,
+            prefill_cache_indices=None,
             start_slots=start_slots,
             slot_indices=slot_indices,
-            needed_blocks_slots=None,
             block_tables=block_tables,
             block_tables_tensor=block_tables_tensor,
             slots=slots,
@@ -692,38 +795,131 @@ class FlashCausalLMBatch(Batch):
             stopping_criterias=stopping_criterias,
             top_n_tokens=top_n_tokens,
             top_n_tokens_tensor=top_n_tokens_tensor,
-            blocks=blocks,
+            num_blocks=num_blocks,
             max_blocks=max_blocks,
+            speculative_ids=speculative_ids,
+            adapter_meta=AdapterBatchMetadata(
+                adapter_indices=adapter_indices,
+                adapter_set=adapter_set,
+                adapter_segments=adapter_segments,
+                segment_indices=adapter_segment_indices,
+            ),
         )
 
-    def __del__(self):
-        if self.block_tables is not None and self.block_tables:
-            global CACHE_MANAGER
-            # Free blocks
-            CACHE_MANAGER.free(list(itertools.chain.from_iterable(self.block_tables)))
-
     def __len__(self):
         return len(self.requests)
 
 
+ADAPTER_LAYERS = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+]
+ROW_PARALLEL = {"o_proj", "down_proj", "lm_head"}
+
+
 class FlashCausalLM(Model):
     def __init__(
         self,
-        model: torch.nn.Module,
-        tokenizer: PreTrainedTokenizerBase,
-        num_layers: int,
-        num_kv_heads: int,
-        head_size: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        rank: int = 0,
-        world_size: int = 1,
+        model_id: str,
+        model_class,
+        revision: Optional[str] = None,
+        quantize: Optional[str] = None,
+        speculator: Optional[str] = None,
+        dtype: Optional[torch.dtype] = None,
+        trust_remote_code: bool = False,
+        lora_adapter_ids: Optional[list] = [],
+        tokenizer_class: PreTrainedTokenizerBase = AutoTokenizer,
+        config_class: PreTrainedTokenizerBase = AutoConfig,
+        default_dtype=torch.float16,
+        aliases=None,
+        # Used for Santacoder override of config
+        num_kv_heads=None,
+        skip_special_tokens: bool = True,
     ):
-        self.num_layers = num_layers
-        self.num_kv_heads = num_kv_heads
-        self.head_size = head_size
+        """Load tokenizer, config and weights for `model_id`, then build `model_class`."""
+        self.process_group, rank, world_size = initialize_torch_distributed()
+        if torch.cuda.is_available():
+            device = torch.device(f"cuda:{rank}")
+            dtype = default_dtype if dtype is None else dtype
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+                dtype = default_dtype if dtype is None else dtype
+            else:
+                device = torch.device("cpu")
+                # Float16 doesn't exist on target.
+                dtype = torch.bfloat16 if dtype is None else dtype
+        else:
+            raise NotImplementedError(f"{model_class} is only available on GPU")
 
-        super(FlashCausalLM, self).__init__(
+        tokenizer = tokenizer_class.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        try:
+            generation_config = GenerationConfig.from_pretrained(
+                model_id, revision=revision, trust_remote_code=trust_remote_code
+            )
+            if isinstance(generation_config.eos_token_id, (list, set)):
+                # TODO Huge hack
+                tokenizer._eos_token_ids = set(generation_config.eos_token_id)
+        except Exception:
+            pass  # best-effort: the tokenizer is left unchanged if this fails
+
+        config = config_class.from_pretrained(
+            model_id, revision=revision, trust_remote_code=trust_remote_code
+        )
+        config.quantize = quantize
+        config.speculator = speculator
+
+        torch.distributed.barrier(group=self.process_group)
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+        weights = Weights(
+            filenames, device, dtype, process_group=self.process_group, aliases=aliases
+        )
+        if config.quantize in ["awq", "exl2", "gptq", "marlin"]:
+            weights._set_gptq_params(model_id, revision)
+
+        prefix = ""
+        model = model_class(prefix, config, weights)
+        torch.distributed.barrier(group=self.process_group)
+
+        # VLM models define the config we care about in their text_config
+        text_config = getattr(config, "text_config", None)
+        if text_config is not None:
+            config = text_config
+
+        if getattr(config, "sliding_window", None) is not None:
+            set_sliding_window(config.sliding_window)
+        else:
+            config.sliding_window = None
+
+        self.num_layers = config.num_hidden_layers
+        # Validation is done in the model itself
+        if num_kv_heads is None:
+            # Order is important here: the first attribute found on the config wins.
+            for attr in ["num_key_value_heads", "num_attention_heads", "n_head"]:
+                num_kv_heads = getattr(config, attr, None)
+                if num_kv_heads is not None:
+                    break
+            if num_kv_heads is None:
+                raise ValueError("Cannot get the number of key/value heads")
+        self.num_kv_heads = num_kv_heads // self.process_group.size()
+        self.head_size = config.hidden_size // config.num_attention_heads
+
+        self.cuda_graphs = {}
+        self.kv_cache = []
+
+        super().__init__(
+            model_id=model_id,
             model=model,
             tokenizer=tokenizer,
             requires_padding=False,
@@ -731,33 +927,166 @@ class FlashCausalLM(Model):
             device=device,
             rank=rank,
             world_size=world_size,
+            sliding_window=config.sliding_window,
         )
 
     @property
     def batch_type(self) -> Type[FlashCausalLMBatch]:
         return FlashCausalLMBatch
 
-    def warmup(self, batch: FlashCausalLMBatch):
-        global CACHE_MANAGER
+    def max_past(self) -> Optional[int]:
+        return getattr(self.model, "max_past", None)  # None when the wrapped model defines no `max_past`
+
+    def init_kv_cache(  # (re)allocate self.kv_cache: one tuple of two uninitialized tensors per layer
+        self,
+        num_blocks: int,
+        num_layers: int,
+        num_heads: int,
+        head_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ):
+        self.kv_cache = []  # drop references to any previous cache before empty_cache() runs
+        empty_cache()
+
+        element_size = torch.tensor([], dtype=dtype).element_size()  # bytes per element of `dtype`
+        if SYSTEM == "ipex" and device.type == "xpu":
+            x = 1  # trailing dim of size 1: the first tensor of the default layout is not chunked
+        else:
+            x = BLOCK_SIZE // element_size  # size of the trailing dim of the first tensor in the default layout
+
+        if FLASH_DECODING:  # both tensors: (num_blocks, BLOCK_SIZE, num_heads, head_size)
+            self.kv_cache = [
+                (
+                    torch.empty(
+                        (num_blocks, BLOCK_SIZE, num_heads, head_size),
+                        dtype=dtype,
+                        device=device,
+                    ),
+                    torch.empty(
+                        (num_blocks, BLOCK_SIZE, num_heads, head_size),
+                        dtype=dtype,
+                        device=device,
+                    ),
+                )
+                for _ in range(num_layers)
+            ]
+        elif SYSTEM == "ipex" and device == torch.device("cpu"):  # both tensors: (num_blocks, num_heads, BLOCK_SIZE, head_size)
+            self.kv_cache = [
+                (
+                    torch.empty(
+                        (num_blocks, num_heads, BLOCK_SIZE, head_size),
+                        dtype=dtype,
+                        device=device,
+                    ),
+                    torch.empty(
+                        (num_blocks, num_heads, BLOCK_SIZE, head_size),
+                        dtype=dtype,
+                        device=device,
+                    ),
+                )
+                for _ in range(num_layers)
+            ]
+        else:  # default: first tensor is 5-D (head_size split into head_size // x chunks of x), second is 4-D
+            self.kv_cache = [
+                (
+                    torch.empty(
+                        (num_blocks, num_heads, head_size // x, BLOCK_SIZE, x),
+                        dtype=dtype,
+                        device=device,
+                    ),
+                    torch.empty(
+                        (num_blocks, num_heads, head_size, BLOCK_SIZE),
+                        dtype=dtype,
+                        device=device,
+                    ),
+                )
+                for _ in range(num_layers)  # presumably (key, value) per layer -- TODO confirm against the attention kernels
+            ]
+
+    def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):  # capture a decode forward pass for batch size `bs` into self.cuda_graphs[bs]
+        input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device)
+        position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device)
+        slots = torch.arange(bs, dtype=torch.int64, device=self.device)
+        input_lengths = torch.ones(bs, dtype=torch.int32, device=self.device) * max_s  # every row set to max_s
+        block_tables = (
+            torch.arange(max_bt, dtype=torch.int32, device=self.device)
+            .repeat(bs)
+            .reshape((bs, max_bt))  # each of the bs rows is 0..max_bt-1
+        )
+
+        self.cuda_graphs[bs] = {  # static input tensors; forward() copies real inputs into them before replaying
+            "input_ids": input_ids,
+            "position_ids": position_ids,
+            "kv_cache": self.kv_cache,
+            "block_tables": block_tables,
+            "slots": slots,
+            "input_lengths": input_lengths,
+        }
+        input_lengths_ = Seqlen(input_lengths=input_lengths)  # wrapper used only for the un-captured warmup call
+        graph = torch.cuda.CUDAGraph()
+        self.cuda_graphs[bs]["graph"] = graph
+
+        torch.cuda.synchronize()
+        # Run once outside of the graph capture to warm up
+        self.model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cu_seqlen_prefill=None,
+            kv_cache=self.kv_cache,
+            block_tables=block_tables,
+            slots=slots,
+            input_lengths=input_lengths_,
+            max_s=max_s,
+            prefill_cache_indices=None,
+            lm_head_indices=None,
+        )
+        torch.cuda.synchronize()
+
+        with torch.cuda.graph(graph, pool=MEM_POOL):  # record the same forward call into `graph`
+            input_lengths = Seqlen(input_lengths=input_lengths)  # rebinds the local name only; the dict above still holds the raw tensor
+            logits, speculative_logits = self.model.forward(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                cu_seqlen_prefill=None,
+                kv_cache=self.kv_cache,
+                block_tables=block_tables,
+                slots=slots,
+                input_lengths=input_lengths,
+                max_s=max_s,
+                prefill_cache_indices=None,
+                lm_head_indices=None,
+            )
+            self.cuda_graphs[bs]["logits"] = logits  # outputs of the captured call; forward() slices them to [:bs] after replay
+            self.cuda_graphs[bs]["speculative_logits"] = speculative_logits
+        torch.cuda.synchronize()
+
+    def warmup(self, batch: FlashCausalLMBatch):
+        # The warmup batch is the biggest batch we could ever receive
+        empty_cache()
 
-        torch.cuda.empty_cache()
         try:
-            CACHE_MANAGER = CacheManager(
-                batch.blocks,
+            self.init_kv_cache(
+                batch.num_blocks,
                 self.num_layers,
                 self.num_kv_heads,
                 self.head_size,
                 self.dtype,
                 self.device,
             )
-            _, batch = self.generate_token(batch)
-        except Exception as e:
+            max_bt = batch.max_blocks
+            max_s = max_bt * BLOCK_SIZE
+
+            if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
+                torch.cuda.tunable.tuning_enable(False)
+            _, batch, _ = self.generate_token(batch)
+        except torch.cuda.OutOfMemoryError as e:
             raise RuntimeError(
                 f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
                 f"You need to decrease `--max-batch-prefill-tokens`"
             ) from e
 
-        torch.cuda.synchronize(self.device)
+        synchronize(self.device)
 
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
         # Calculate the number of blocks that can be allocated with the free memory
@@ -765,24 +1094,19 @@ class FlashCausalLM(Model):
         cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
         total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size
 
-        total_free_memory, _ = torch.cuda.mem_get_info(self.device)
-        total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
-
-        free_memory = max(
-            0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory
-        )
+        free_memory = get_free_memory(self.device, MEMORY_FRACTION)
+        batch_num_blocks = batch.num_blocks if batch is not None else 0
 
         num_blocks = (
-            int(free_memory // total_cache_size)
-            # Add batch.blocks as we allocated it above, so it is included in the peak memory.
-            + CACHE_MANAGER.num_blocks
+            # Leave 5% for some wiggle room
+            int((free_memory * 0.95) // total_cache_size)
+            # Add batch.num_blocks as we allocated it above, so it is included in the peak memory.
+            + batch_num_blocks
         )
 
-        del CACHE_MANAGER
         del batch
-        torch.cuda.empty_cache()
 
-        CACHE_MANAGER = CacheManager(
+        self.init_kv_cache(
             num_blocks,
             self.num_layers,
             self.num_kv_heads,
@@ -791,78 +1115,275 @@ class FlashCausalLM(Model):
             self.device,
         )
 
+        if SYSTEM == "rocm":
+            if (
+                os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
+                or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
+            ):
+                torch.cuda.tunable.enable()
+
+                if os.environ.get("PYTORCH_TUNABLEOP_TUNING") != "0":
+                    torch.cuda.tunable.tuning_enable(True)
+
+                if os.environ.get("PYTORCH_TUNABLEOP_SEQLENS") is not None:
+                    tuning_sequences = [
+                        int(val)
+                        for val in os.environ["PYTORCH_TUNABLEOP_SEQLENS"].split(",")
+                    ]
+                elif CUDA_GRAPHS is not None:
+                    tuning_sequences = CUDA_GRAPHS
+                else:
+                    # For seqlen = 1, we dispatch to LLMM1 kernel.
+                    tuning_sequences = [2, 3, 4, 5, 6, 7]
+
+                tunableop_filepath = os.path.join(
+                    HUGGINGFACE_HUB_CACHE,
+                    f"tunableop_{MODEL_ID.replace('/', '-')}_tp{self.world_size}_rank{self.rank}.csv",
+                )
+
+                logger.info(
+                    f"PyTorch TunableOp (https://github.com/fxmarty/pytorch/tree/2.3-patched/aten/src/ATen/cuda/tunable) is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`."
+                )
+
+                if os.path.isfile(tunableop_filepath):
+                    logger.info(
+                        f"The file {tunableop_filepath} already exists and will be reused."
+                    )
+                    torch.cuda.tunable.read_file(tunableop_filepath)
+
+                os.makedirs(HUGGINGFACE_HUB_CACHE, exist_ok=True)
+
+                for seqlen in tuning_sequences:
+                    logger.info(f"Warming up TunableOp for seqlen={seqlen}")
+                    self.tunableop_warmup(seqlen)
+                    torch.cuda.tunable.write_file(tunableop_filepath)
+                torch.cuda.tunable.tuning_enable(False)
+            else:
+                logger.info(
+                    "PyTorch ROCm TunableOp (https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/cuda/tunable) is disabled. TunableOp brings an additional 5-8% latency improvement for small sequence lengths but requires a warmup. If necessary, please use the environment variable PYTORCH_TUNABLEOP_ENABLED=1 to enable TunableOp."
+                )
+
+        if CUDA_GRAPHS:
+            try:
+                logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}")
+                # Warmup cuda graphs
+                for bs in CUDA_GRAPHS:
+                    if self.speculate is None or self.speculate + 1 <= bs:
+                        self.cuda_graph_warmup(bs, max_s, max_bt)
+            except torch.cuda.OutOfMemoryError:
+                logger.exception(f"Decode cuda graph warmup failed")
+        else:
+            logger.info(f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS}).")
+
         return int(num_blocks * BLOCK_SIZE)
 
-    def decode(self, generated_ids: Union[torch.Tensor, List[int]]) -> str:
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    def tunableop_warmup(self, seqlen: int):  # one forward pass over `seqlen` dummy tokens; warmup() calls this once per tuning length
+        input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device)
+        position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device)
+        slots = torch.arange(seqlen, dtype=torch.int64, device=self.device)
+
+        # Dummy value (all ones): some models (starcoder2) don't accept `None` here.
+        input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device)
+        input_lengths = Seqlen(input_lengths=input_lengths)
+
+        # We pass a `cu_seqlen_prefill` so that we do not have to deal with paged attention cache allocation/deallocation.
+        self.model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cu_seqlen_prefill=torch.tensor(  # presumably cumulative boundaries of a single sequence of length seqlen -- TODO confirm
+                [0, seqlen], device=self.device, dtype=torch.int32
+            ),
+            kv_cache=self.kv_cache,
+            block_tables=None,
+            input_lengths=input_lengths,
+            slots=slots,
+            max_s=seqlen,
+            lm_head_indices=None,
+            prefill_cache_indices=None,
         )
 
     def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        cu_seqlen_prefill: Optional[torch.Tensor],
-        block_tables: torch.Tensor,
-        slots: torch.Tensor,
-        input_lengths: torch.Tensor,
-        max_s: int,
-        lm_head_indices: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        global CACHE_MANAGER
-
+        self, batch: FlashCausalLMBatch, adapter_data: AdapterBatchData
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # Model Forward
-        return self.model.forward(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            cu_seqlen_prefill=cu_seqlen_prefill,
-            kv_cache=CACHE_MANAGER.kv_cache,
-            block_tables=block_tables,
-            slots=slots,
-            input_lengths=input_lengths,
-            max_s=max_s,
-            lm_head_indices=lm_head_indices,
+        if batch.speculative_ids is not None:
+            input_ids = batch.input_ids
+            position_ids = batch.position_ids
+            cu_seqlen_prefill = batch.cu_seqlen_prefill
+            kv_cache = self.kv_cache
+            block_tables = batch.block_tables_tensor
+            slots = batch.slots[batch.slot_indices]
+            input_lengths = batch.input_lengths_tensor
+            max_s = batch.max_seqlen
+            lm_head_indices = batch.prefill_head_indices
+
+            speculative_ids = batch.speculative_ids
+
+            B, speculative_length = speculative_ids.shape
+            new_length = speculative_length + 1
+            new_input_ids = torch.cat(
+                [input_ids.unsqueeze(-1), speculative_ids], dim=1
+            ).reshape(-1)
+            arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0)
+            arange_int = arange.to(dtype=torch.int32)
+            new_position_ids = (
+                position_ids.unsqueeze(-1).expand(B, new_length) + arange
+            ).view(-1)
+            slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1)
+            input_lengths = (
+                input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
+            ).view(-1)
+
+            # Add Copy the block tables for all members
+            block_tables = (
+                block_tables.unsqueeze(1)
+                .expand(B, new_length, -1)
+                .reshape(B * new_length, -1)
+                .contiguous()
+            )
+            max_s = max_s + speculative_length
+
+            input_ids = new_input_ids
+            position_ids = new_position_ids
+        else:
+            input_ids = batch.input_ids
+            position_ids = batch.position_ids
+            cu_seqlen_prefill = batch.cu_seqlen_prefill
+            kv_cache = self.kv_cache
+            block_tables = batch.block_tables_tensor
+            slots = batch.slots[batch.slot_indices]
+            input_lengths = batch.input_lengths_tensor
+            max_s = batch.max_seqlen
+            lm_head_indices = batch.prefill_head_indices
+
+        if cu_seqlen_prefill is None and self.max_past() is not None:
+            # In decode, not prefill, we're actually overwriting the KV-cache
+            # in a circular buffer mode.
+            # This makes sure the max_s for the decode pass is correct.
+            max_s = min(self.max_past(), max_s)
+
+        bs = input_ids.shape[0]
+        sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
+        if sorted_padded_bs:
+            # Get associated cuda graph
+            cuda_graph = self.cuda_graphs[sorted_padded_bs[0]]
+        else:
+            cuda_graph = None
+
+        if cu_seqlen_prefill is not None or cuda_graph is None:
+            input_lengths = Seqlen(input_lengths=input_lengths)
+            logits, speculative_logits = self.model.forward(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                cu_seqlen_prefill=cu_seqlen_prefill,
+                kv_cache=kv_cache,
+                block_tables=block_tables,
+                slots=slots,
+                input_lengths=input_lengths,
+                max_s=max_s,
+                prefill_cache_indices=batch.prefill_cache_indices,
+                lm_head_indices=lm_head_indices,
+                adapter_data=adapter_data,
+            )
+            if batch.prefill_cache_indices is not None:
+                batch.prefill_cache_indices = None
+            return logits, speculative_logits
+
+        # Copy inputs to the static inputs of the cuda graph
+        # Static inputs are potentially padded
+        cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids
+        cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids
+        cuda_graph["block_tables"][
+            : block_tables.shape[0], : block_tables.shape[1]
+        ] = block_tables
+        cuda_graph["slots"].fill_(-1)
+        cuda_graph["slots"][: slots.shape[0]] = slots
+        cuda_graph["input_lengths"].zero_()
+        cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
+
+        # Replay the graph
+        cuda_graph["graph"].replay()
+        # Slice output to the correct shape
+        speculative_logits = (
+            cuda_graph["speculative_logits"][:bs]
+            if cuda_graph["speculative_logits"] is not None
+            else None
         )
+        logits = cuda_graph["logits"][:bs]
+        return logits, speculative_logits
 
     @tracer.start_as_current_span("generate_token")
     def generate_token(
         self, batch: FlashCausalLMBatch
-    ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch]]:
+    ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch], Tuple[int, int]]:
+        start = time.time_ns()
         prefill = batch.cu_seqlen_prefill is not None
         prefill_logprobs = batch.prefill_next_token_indices is not None
 
-        if batch.needed_blocks_slots:
-            # Allocate blocks to this batch
-            CACHE_MANAGER.allocate(batch)
-
-        try:
-            out = self.forward(
-                batch.input_ids,
-                batch.position_ids,
-                batch.cu_seqlen_prefill,
-                batch.block_tables_tensor,
-                batch.slots[batch.slot_indices],
-                batch.input_lengths_tensor,
-                batch.max_seqlen,
-                batch.prefill_head_indices,
+        # Update adapter indices for speculative tokens (if present)
+        adapter_meta = batch.adapter_meta
+        if batch.speculative_ids is not None:
+            B, speculative_length = batch.speculative_ids.shape
+            new_length = speculative_length + 1
+            adapter_indices = (
+                adapter_meta.adapter_indices.unsqueeze(-1)
+                .expand(B, new_length)
+                .reshape(-1)
             )
-        except Exception as e:
-            del batch
-            raise e
+            adapter_segments = adapter_meta.adapter_segments * new_length
+            adapter_meta = AdapterBatchMetadata(
+                adapter_indices=adapter_indices,
+                adapter_set=adapter_meta.adapter_set,
+                adapter_segments=adapter_segments,
+                segment_indices=adapter_meta.segment_indices,
+            )
+
+        # Assign pointers to adapter weights
+        # TODO(travis): don't update this if indices haven't changed
+        adapter_data = AdapterBatchData.from_meta(
+            adapter_meta,
+            self.layer_to_adapter_weights,
+            prefill,
+            batch.prefill_head_indices,
+        )
+
+        out, speculative_logits = self.forward(batch, adapter_data)
 
         if prefill:
             next_token_logits = (
                 out[batch.prefill_next_token_indices] if prefill_logprobs else out
             )
+            if speculative_logits is not None:
+                speculative_logits = (
+                    speculative_logits[batch.prefill_next_token_indices]
+                    if prefill_logprobs
+                    else speculative_logits
+                )
+            next_adapter_indices = batch.adapter_meta.adapter_indices.new_empty(
+                len(batch)
+            )
+
         else:
             next_token_logits = out
+            next_adapter_indices = batch.adapter_meta.adapter_indices
 
-        next_input_ids, next_token_logprobs, logprobs = batch.next_token_chooser(
-            batch.all_input_ids_tensor[:, : batch.max_seqlen], next_token_logits
+        speculate = get_speculate()
+        (
+            next_input_ids,
+            next_token_logprobs,
+            logprobs,
+            accepted_ids,
+            speculative_ids,
+        ) = batch.next_token_chooser(
+            batch.all_input_ids_tensor[:, : batch.max_seqlen],
+            next_token_logits,
+            speculate,
+            batch.speculative_ids,
+            speculative_logits,
         )
 
         batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
-            batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs
+            batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs, accepted_ids
         )
 
         if prefill:
@@ -887,20 +1408,15 @@ class FlashCausalLM(Model):
         stopped = True
 
         # Zipped iterator
-        iterator = zip(
-            batch.input_lengths,
-            batch.all_input_ids,
-        )
+        iterator = zip(batch.input_lengths, batch.all_input_ids, accepted_ids)
 
         # We do two for loops as the first one can run completely asynchronously from the GPU while for the second
         # one, we need to first do a GPU <-> CPU sync
         # It is faster if we delay this sync for the maximum amount of time
 
         # For each member of the batch
-        for i, (
-            input_length,
-            all_input_ids,
-        ) in enumerate(iterator):
+        index = 0
+        for i, (input_length, all_input_ids, n_accepted_ids) in enumerate(iterator):
             # Indexing metadata
             start_index = cumulative_length
             end_index = cumulative_length + input_length
@@ -915,28 +1431,47 @@ class FlashCausalLM(Model):
                 # In decode, we do not need this as we can just increment position ids
                 next_position_ids[i] = batch.position_ids[end_index - 1]
 
+                # Initialize adapter indices
+                # In decode, we only have one token per row in the batch, so grab last index
+                next_adapter_indices[i] = batch.adapter_meta.adapter_indices[
+                    end_index - 1
+                ]
+
                 # Used to gather prefill logprobs
                 # Copy batch.input_ids to prefill_token_indices
                 if prefill_logprobs:
                     if len(batch) > 1:
-                        prefill_tokens_indices[
-                            out_start_index : out_end_index - 1
-                        ] = batch.input_ids[start_index + 1 : start_index + out_length]
+                        prefill_tokens_indices[out_start_index : out_end_index - 1] = (
+                            batch.input_ids[start_index + 1 : start_index + out_length]
+                        )
                     else:
                         # Set prefill_tokens_indices to the correct slice
                         prefill_tokens_indices = batch.input_ids[
                             start_index + 1 : start_index + out_length
                         ]
 
-            batch.all_input_ids_tensor[i, input_length] = next_input_ids[i]
+            for j in range(n_accepted_ids):
+                batch.all_input_ids_tensor[i, input_length + j] = next_input_ids[index]
+                index += 1
 
             cumulative_length += input_length
 
-        # Set values in batch
-        batch.input_ids = next_input_ids
-        batch.position_ids = next_position_ids + 1
-        batch.input_lengths_tensor += 1
-        batch.slot_indices += 1
+        # Update values
+        batch.input_ids = next_input_ids[accepted_ids.cumsum(dim=-1) - 1]
+        batch.speculative_ids = speculative_ids
+        batch.position_ids = next_position_ids + accepted_ids
+        batch.input_lengths_tensor += accepted_ids
+        batch.slot_indices += accepted_ids
+        batch.adapter_meta.adapter_indices = next_adapter_indices
+
+        if prefill:
+            # adjust segment lengths to account for all request lengths being 1 during decoding
+            adapter_segments, _ = find_segments(batch.adapter_meta.adapter_indices)
+            batch.adapter_meta.adapter_segments = torch.tensor(
+                adapter_segments,
+                dtype=torch.int32,
+                device=batch.adapter_meta.adapter_segments.device,
+            )
 
         if prefill and prefill_logprobs:
             # Get prefill logprobs
@@ -949,7 +1484,9 @@ class FlashCausalLM(Model):
 
         # GPU <-> CPU sync
         next_token_logprobs = next_token_logprobs.tolist()
-        next_token_ids = batch.input_ids.tolist()
+        next_token_ids = next_input_ids.tolist()
+        accepted_ids = accepted_ids.tolist()
+        start_decode = time.time_ns()
 
         # Zipped iterator
         iterator = zip(
@@ -962,13 +1499,13 @@ class FlashCausalLM(Model):
             batch.next_token_chooser.do_sample,
             batch.next_token_chooser.seeds,
             batch.top_n_tokens,
-            next_token_ids,
-            next_token_logprobs,
+            accepted_ids,
             batch_top_token_ids,
             batch_top_token_logprobs,
         )
 
         # For each member of the batch
+        index = 0
         for i, (
             request,
             input_length,
@@ -979,37 +1516,62 @@ class FlashCausalLM(Model):
             do_sample,
             seed,
             top_n_tokens,
-            next_token_id,
-            next_token_logprob,
+            n_accepted_ids,
             top_token_ids,
             top_token_logprobs,
         ) in enumerate(iterator):
             # Append next token to all tokens
-            all_input_ids.append(next_token_id)
+            next_token_texts = []
+            left = 0
 
-            # Generated token
-            next_token_text, prefix_offset, read_offset = self.decode_token(
-                all_input_ids,
-                prefix_offset,
-                read_offset,
-            )
+            if n_accepted_ids > 1:
+                if RANK == 0:
+                    logger.debug(f"Speculated ids {n_accepted_ids - 1}")
 
-            # Evaluate stopping criteria
-            stop, reason = stopping_criteria(
-                next_token_id,
-                next_token_text,
-            )
+            current_stopped = False
+            for j in range(index, index + n_accepted_ids):
+                # Generated token
+                next_token_id = next_token_ids[j]
+                all_input_ids.append(next_token_id)
+                next_token_text, prefix_offset, read_offset = self.decode_token(
+                    all_input_ids,
+                    prefix_offset,
+                    read_offset,
+                )
+                next_token_texts.append(next_token_text)
 
-            if not stop:
-                stopped = False
+                stop, reason = stopping_criteria(
+                    next_token_id,
+                    next_token_text,
+                )
+
+                if stop:
+                    left = index + n_accepted_ids - j - 1
+                    current_stopped = True
+                    break
+                else:
+                    current_stopped = False
+            stopped = stopped and current_stopped
+
+            _next_token_ids = next_token_ids[index : index + n_accepted_ids - left]
+            _next_token_logprobs = next_token_logprobs[
+                index : index + n_accepted_ids - left
+            ]
+            index += n_accepted_ids
 
             # Shard generations
             # All generations will be appended in the rust sharded client
             if i % self.world_size == self.rank:
                 if stop:
                     # Decode generated tokens
-                    output_text = self.decode(
-                        all_input_ids[-stopping_criteria.current_tokens :]
+                    output_text, _, _ = self.decode_token(
+                        all_input_ids,
+                        prefix_offset=len(all_input_ids)
+                        - stopping_criteria.current_tokens
+                        - 1,
+                        read_offset=len(all_input_ids)
+                        - stopping_criteria.current_tokens,
+                        skip_special_tokens=True,
                     )
                     generated_text = GeneratedText(
                         output_text,
@@ -1035,57 +1597,150 @@ class FlashCausalLM(Model):
                         clean_up_tokenization_spaces=False,
                         skip_special_tokens=False,
                     )
-                    prefill_tokens = PrefillTokens(
-                        prefill_token_ids, request_prefill_logprobs, prefill_texts
+
+                    prefill_tokens = Tokens(
+                        prefill_token_ids,
+                        request_prefill_logprobs,
+                        prefill_texts,
+                        is_special=[],
                     )
                 else:
                     prefill_tokens = None
 
                 if top_n_tokens > 0:
-                    toptoken_texts = self.tokenizer.batch_decode(
-                        top_token_ids,
-                        clean_up_tokenization_spaces=False,
-                        skip_special_tokens=False,
-                    )
-                    special_toptokens = [
-                        token_id in self.all_special_ids for token_id in top_token_ids
-                    ]
-                    top_tokens = TopTokens(
-                        top_token_ids,
-                        top_token_logprobs,
-                        toptoken_texts,
-                        special_toptokens,
-                    )
+                    all_top_tokens = []
+                    for top_token_ids, top_token_logprobs in zip(
+                        top_token_ids, top_token_logprobs
+                    ):
+                        toptoken_texts = self.tokenizer.batch_decode(
+                            top_token_ids,
+                            clean_up_tokenization_spaces=False,
+                            skip_special_tokens=False,
+                        )
+                        special_toptokens = [
+                            token_id in self.all_special_ids
+                            for token_id in top_token_ids
+                        ]
+                        top_tokens = Tokens(
+                            top_token_ids,
+                            top_token_logprobs,
+                            toptoken_texts,
+                            special_toptokens,
+                        )
+                        all_top_tokens.append(top_tokens)
+                    top_tokens = all_top_tokens
                 else:
                     top_tokens = None
 
                 generation = Generation(
                     request.id,
                     prefill_tokens,
-                    next_token_id,
-                    next_token_logprob,
-                    next_token_text,
-                    next_token_id in self.all_special_ids,
+                    Tokens(
+                        _next_token_ids,
+                        _next_token_logprobs,
+                        next_token_texts,
+                        [nid in self.all_special_ids for nid in _next_token_ids],
+                    ),
                     generated_text,
                     top_tokens,
                 )
 
                 generations.append(generation)
 
+            # accept each new token for this specific request since we may
+            # have more than one new token per request with speculative decoding
+            for next_token_id in _next_token_ids:
+                batch.next_token_chooser = (
+                    batch.next_token_chooser.advance_grammar_single(i, next_token_id)
+                )
+
             # Update values
-            batch.input_lengths[i] = input_length + 1
+            batch.input_lengths[i] = input_length + n_accepted_ids
+            if batch.input_lengths[i] > batch.max_seqlen:
+                batch.max_seqlen = batch.input_lengths[i]
             batch.prefix_offsets[i] = prefix_offset
             batch.read_offsets[i] = read_offset
             batch.all_input_ids[i] = all_input_ids
 
         if stopped:
-            del batch
             # No need to return a batch if we know that all requests stopped
-            return generations, None
+            forward_ns = start_decode - start
+            decode_ns = time.time_ns() - start_decode
+            return generations, None, (forward_ns, decode_ns)
 
         batch.prefill_cu_outlens = None
         batch.prefill_head_indices = None
         batch.prefill_next_token_indices = None
-        batch.max_seqlen = batch.max_seqlen + 1
 
-        return generations, batch
+        forward_ns = start_decode - start
+        decode_ns = time.time_ns() - start_decode
+        return generations, batch, (forward_ns, decode_ns)
+
+    @property
+    def supports_adapter_loading(self) -> bool:
+        return True
+
+    def adapter_target_to_layer(self) -> Dict[str, Tuple[str, torch.Tensor]]:
+        layer_weights = {}
+
+        prefix = "model.layers"
+
+        # This accounts for VLMs (e.g. LlavaNext, Idefics2)
+        # that have a language_model inside of the larger model.
+        if hasattr(self.model, "language_model"):
+            _model = self.model.language_model
+        elif hasattr(self.model, "text_model"):
+            _model = self.model.text_model
+        else:
+            _model = self.model
+
+        for i, layer in enumerate(_model.model.layers):
+            layer_weights[(i, "q_proj")] = (
+                f"{prefix}.{i}.self_attn.q_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "k_proj")] = (
+                f"{prefix}.{i}.self_attn.k_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "v_proj")] = (
+                f"{prefix}.{i}.self_attn.v_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "o_proj")] = (
+                f"{prefix}.{i}.self_attn.o_proj",
+                layer.self_attn.o_proj,
+            )
+
+            # TODO: this is a hack to avoid the gate_proj for
+            # FlashStarcoder2 that doesn't have these layers
+            if hasattr(layer, "mlp") and hasattr(layer.mlp, "gate_up_proj"):
+                layer_weights[(i, "gate_proj")] = (
+                    f"{prefix}.{i}.mlp.gate_proj",
+                    layer.mlp.gate_up_proj,
+                )
+                layer_weights[(i, "up_proj")] = (
+                    f"{prefix}.{i}.mlp.up_proj",
+                    layer.mlp.gate_up_proj,
+                )
+                layer_weights[(i, "down_proj")] = (
+                    f"{prefix}.{i}.mlp.down_proj",
+                    layer.mlp.down_proj,
+                )
+
+        layer_weights[(0, "lm_head")] = ("lm_head", _model.lm_head)
+        return layer_weights
+
+    @property
+    def adapter_layers(self) -> List[str]:
+        return ADAPTER_LAYERS
+
+    @property
+    def default_traced_adapter_layers(self) -> List[str]:
+        return ["q_proj", "v_proj"]
+
+    def get_num_layers_for_type(self, layer_type: str) -> int:
+        return 1 if layer_type == "lm_head" else len(self.model.model.layers)
+
+    def is_row_parallel(self, layer_type: str) -> bool:
+        return layer_type in ROW_PARALLEL
diff --git a/server/text_generation_server/models/flash_llama.py b/server/text_generation_server/models/flash_llama.py
deleted file mode 100644
index 063aa01e..00000000
--- a/server/text_generation_server/models/flash_llama.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import torch
-import torch.distributed
-
-from opentelemetry import trace
-from transformers import AutoConfig, AutoTokenizer
-from transformers.models.llama import LlamaTokenizer
-from typing import Optional
-
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_llama_modeling import (
-    FlashLlamaForCausalLM,
-    LlamaConfig,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-tracer = trace.get_tracer(__name__)
-
-
-class FlashLlama(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashLlama is only available on GPU")
-
-        try:
-            tokenizer = LlamaTokenizer.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-            )
-        except Exception:
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-            )
-
-        config = LlamaConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-
-        torch.distributed.barrier(group=self.process_group)
-
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        model = FlashLlamaForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashLlama, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            num_layers=len(model.model.layers),
-            num_kv_heads=model.model.num_key_value_heads,
-            head_size=model.model.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py
new file mode 100644
index 00000000..2b2bd2e0
--- /dev/null
+++ b/server/text_generation_server/models/flash_mistral.py
@@ -0,0 +1,112 @@
+import torch
+from typing import Optional, Tuple, Dict, List
+
+from text_generation_server.models import FlashCausalLM
+
+
+# Adapter target names reported by `FlashMistral.adapter_layers`.
+ADAPTER_LAYERS = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+]
+# Layer types for which `FlashMistral.is_row_parallel` returns True.
+ROW_PARALLEL = {"o_proj", "down_proj", "lm_head"}
+
+
+class FlashMistral(FlashCausalLM):
+    """FlashCausalLM subclass exposing its layers as adapter targets."""
+
+    @property
+    def supports_adapter_loading(self) -> bool:
+        """Always True for this model class."""
+        return True
+
+    def _adapter_base_model(self):
+        """Return the module owning `.model.layers` and `.lm_head`.
+
+        This accounts for VLMs (e.g. LlavaNext, Idefics2) that have a
+        language_model (or text_model) inside of the larger model; any
+        other model is returned as is.
+        """
+        if hasattr(self.model, "language_model"):
+            return self.model.language_model
+        if hasattr(self.model, "text_model"):
+            return self.model.text_model
+        return self.model
+
+    def adapter_target_to_layer(
+        self,
+    ) -> Dict[Tuple[int, str], Tuple[str, torch.Tensor]]:
+        """Map `(layer index, target name)` to `(weight name, layer)`.
+
+        q/k/v targets share the layer's `query_key_value` attribute and
+        gate/up targets share `gate_up_proj`; `lm_head` is registered once
+        under index 0.
+        """
+        layer_weights = {}
+
+        prefix = "model.layers"
+        _model = self._adapter_base_model()
+
+        for i, layer in enumerate(_model.model.layers):
+            layer_weights[(i, "q_proj")] = (
+                f"{prefix}.{i}.self_attn.q_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "k_proj")] = (
+                f"{prefix}.{i}.self_attn.k_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "v_proj")] = (
+                f"{prefix}.{i}.self_attn.v_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "o_proj")] = (
+                f"{prefix}.{i}.self_attn.o_proj",
+                layer.self_attn.o_proj,
+            )
+
+            # TODO: this is a hack to avoid the gate_proj for
+            # FlashStarcoder2 that doesn't have these layers
+            if hasattr(layer, "mlp") and hasattr(layer.mlp, "gate_up_proj"):
+                layer_weights[(i, "gate_proj")] = (
+                    f"{prefix}.{i}.mlp.gate_proj",
+                    layer.mlp.gate_up_proj,
+                )
+                layer_weights[(i, "up_proj")] = (
+                    f"{prefix}.{i}.mlp.up_proj",
+                    layer.mlp.gate_up_proj,
+                )
+                layer_weights[(i, "down_proj")] = (
+                    f"{prefix}.{i}.mlp.down_proj",
+                    layer.mlp.down_proj,
+                )
+
+        layer_weights[(0, "lm_head")] = ("lm_head", _model.lm_head)
+        return layer_weights
+
+    @property
+    def adapter_layers(self) -> List[str]:
+        """Adapter target names supported by this model."""
+        return ADAPTER_LAYERS
+
+    @property
+    def default_traced_adapter_layers(self) -> List[str]:
+        """Subset of adapter targets returned as the traced default."""
+        return ["q_proj", "v_proj"]
+
+    def get_num_layers_for_type(self, layer_type: str) -> int:
+        """Return 1 for `lm_head`, else the number of `model.layers`."""
+        if layer_type == "lm_head":
+            return 1
+        # Resolve through the same wrapper lookup as adapter_target_to_layer.
+        return len(self._adapter_base_model().model.layers)
+
+    def is_row_parallel(self, layer_type: str) -> bool:
+        """Whether `layer_type` is listed in ROW_PARALLEL."""
+        return layer_type in ROW_PARALLEL
diff --git a/server/text_generation_server/models/flash_neox.py b/server/text_generation_server/models/flash_neox.py
deleted file mode 100644
index 58f345a9..00000000
--- a/server/text_generation_server/models/flash_neox.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import torch
-import torch.distributed
-
-from opentelemetry import trace
-from transformers import AutoTokenizer, AutoConfig
-from typing import Optional
-
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_neox_modeling import (
-    FlashGPTNeoXForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-tracer = trace.get_tracer(__name__)
-
-
-class FlashNeoXSharded(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashNeoX is only available on GPU")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-
-        config = AutoConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames, device=device, dtype=dtype, process_group=self.process_group
-        )
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        model = FlashGPTNeoXForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashNeoXSharded, self).__init__(
-            model=model.to(device),
-            tokenizer=tokenizer,
-            num_layers=len(model.gpt_neox.layers),
-            num_kv_heads=model.gpt_neox.num_heads,
-            head_size=model.gpt_neox.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
diff --git a/server/text_generation_server/models/flash_rw.py b/server/text_generation_server/models/flash_rw.py
deleted file mode 100644
index 2fc7c53d..00000000
--- a/server/text_generation_server/models/flash_rw.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import torch
-import torch.distributed
-
-from opentelemetry import trace
-from transformers import AutoTokenizer
-from typing import Optional
-
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_rw_modeling import (
-    RWConfig,
-    FlashRWForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-tracer = trace.get_tracer(__name__)
-
-
-class FlashRWSharded(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashRW is only available on GPU")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-
-        config = RWConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames,
-            device,
-            dtype,
-            process_group=self.process_group,
-            aliases={"lm_head.weight": ["transformer.word_embeddings.weight"]},
-        )
-
-        config.quantize = quantize
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        model = FlashRWForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashRWSharded, self).__init__(
-            model=model.to(device),
-            tokenizer=tokenizer,
-            num_layers=len(model.transformer.h),
-            num_kv_heads=model.transformer.cache_size,
-            head_size=model.transformer.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
diff --git a/server/text_generation_server/models/flash_santacoder.py b/server/text_generation_server/models/flash_santacoder.py
deleted file mode 100644
index 29505902..00000000
--- a/server/text_generation_server/models/flash_santacoder.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import torch
-import torch.distributed
-
-from opentelemetry import trace
-from transformers import AutoTokenizer, AutoConfig
-from typing import Optional, List
-import json
-import os
-
-from huggingface_hub import hf_hub_download
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_santacoder_modeling import (
-    FlashSantacoderForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-tracer = trace.get_tracer(__name__)
-
-
-class FlashSantacoderSharded(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashSantacoderSharded is only available on GPU")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-
-        config = AutoConfig.from_pretrained(
-            model_id,
-            revision=revision,
-            trust_remote_code=True,
-        )
-        config.quantize = quantize
-        config.transpose = config.architectures[0].startswith("GPT2")
-
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames,
-            device=device,
-            dtype=dtype,
-            process_group=self.process_group,
-            aliases={"transformer.wte.weight": ["lm_head.weight"]},
-        )
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        model = FlashSantacoderForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashSantacoderSharded, self).__init__(
-            model=model.to(device),
-            tokenizer=tokenizer,
-            num_layers=len(model.transformer.h),
-            num_kv_heads=1,
-            head_size=model.transformer.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
-
-    def decode(self, generated_ids: List[int]) -> str:
-        # Do not skip special tokens as they are used for custom parsing rules of the generated text
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
-        )
diff --git a/server/text_generation_server/models/galactica.py b/server/text_generation_server/models/galactica.py
index d4211734..2d43244a 100644
--- a/server/text_generation_server/models/galactica.py
+++ b/server/text_generation_server/models/galactica.py
@@ -20,6 +20,7 @@ from text_generation_server.utils import (
     weight_files,
     Weights,
 )
+from text_generation_server.utils.chunks import concat_text_chunks
 
 # CREDIT: Papers with code => https://github.com/paperswithcode/galai/blob/main/galai/utils.py
 
@@ -80,6 +81,7 @@ class GalacticaCausalLMBatch(CausalLMBatch):
         next_token_choosers = []
         stopping_criterias = []
         prefix_offsets = []
+        top_n_tokens = []
         read_offsets = []
         requests_idx_mapping = {}
 
@@ -90,12 +92,17 @@ class GalacticaCausalLMBatch(CausalLMBatch):
         for i, r in enumerate(pb.requests):
             requests_idx_mapping[r.id] = i
             # Add escape_custom_split_sequence to the CausalLMBatch logic
-            inputs.append(escape_custom_split_sequence(r.inputs))
-            next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
+            inputs.append(
+                escape_custom_split_sequence(concat_text_chunks(r.input_chunks.chunks))
+            )
+            next_token_choosers.append(
+                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
+            )
             stopping_criteria = StoppingCriteria.from_pb(
                 r.stopping_parameters, tokenizer
             )
             stopping_criterias.append(stopping_criteria)
+            top_n_tokens.append(r.top_n_tokens)
             max_truncation = max(max_truncation, r.truncate)
             max_decode_tokens += stopping_criteria.max_new_tokens
             padding_right_offset = max(
@@ -129,6 +136,9 @@ class GalacticaCausalLMBatch(CausalLMBatch):
         position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
         position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
         all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1)
+        top_n_tokens_tensor = torch.tensor(
+            top_n_tokens, device=device, dtype=torch.int64
+        )
 
         max_tokens = len(inputs) * max_input_length + max_decode_tokens
 
@@ -146,84 +156,9 @@ class GalacticaCausalLMBatch(CausalLMBatch):
             read_offsets=read_offsets,
             next_token_choosers=next_token_choosers,
             stopping_criterias=stopping_criterias,
+            top_n_tokens=top_n_tokens,
+            top_n_tokens_tensor=top_n_tokens_tensor,
             max_input_length=max_input_length.item(),
             padding_right_offset=padding_right_offset,
             max_tokens=max_tokens,
         )
-
-
-class GalacticaSharded(CausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            device = torch.device("cpu")
-            dtype = torch.float32
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-
-        config = AutoConfig.from_pretrained(
-            model_id,
-            revision=revision,
-            tp_parallel=True,
-            trust_remote_code=trust_remote_code,
-        )
-        config.quantize = quantize
-        tokenizer.pad_token_id = config.pad_token_id
-
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames, device=device, dtype=dtype, process_group=self.process_group
-        )
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        model = OPTForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(CausalLM, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=True,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
-
-    @property
-    def batch_type(self) -> Type[CausalLMBatch]:
-        return GalacticaCausalLMBatch
-
-    def decode(self, generated_ids: List[int]) -> str:
-        # Do not skip special tokens as they are used for custom parsing rules of the generated text
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
-        )
-
-    def forward(
-        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
-    ):
-        outputs = self.model.forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            use_cache=True,
-        )
-        return outputs.logits, outputs.past_key_values
diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
new file mode 100644
index 00000000..06035ccd
--- /dev/null
+++ b/server/text_generation_server/models/globals.py
@@ -0,0 +1,54 @@
+import torch
+import os
+from loguru import logger
+from typing import Dict, List, Optional
+
+# Handle for a CUDA graph memory pool, or None when CUDA is unavailable.
+MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
+# This is overridden by the cli
+FLASH_DECODING = os.getenv("FLASH_DECODING") in {"1", "true", "True"}
+BLOCK_SIZE: int = 256 if FLASH_DECODING else 16
+if FLASH_DECODING:
+    logger.info("Using FLASH_DECODING")
+
+
+# CUDA_GRAPHS is an optional comma separated list of batch sizes.
+cuda_graphs = os.getenv("CUDA_GRAPHS")
+if cuda_graphs is not None:
+    try:
+        cuda_graphs = [int(item) for item in cuda_graphs.split(",")]
+    except Exception as e:
+        raise RuntimeError(
+            f"Could not parse cuda graphs {cuda_graphs}, expected comma separated list for batch sizes to run on: {e}"
+        ) from e
+    # sorting the cuda graphs in descending order helps reduce the
+    # memory impact and results in less memory usage
+    cuda_graphs.sort(reverse=True)
+
+
+CUDA_GRAPHS: Optional[List[int]] = cuda_graphs
+
+# This is overridden at model loading.
+MODEL_ID: Optional[str] = None
+
+
+def set_model_id(model_id: str):
+    """Record `model_id` in the module-level MODEL_ID."""
+    global MODEL_ID
+    MODEL_ID = model_id
+
+
+# NOTE: eventually we should move this into the router and pass back the
+# index in all cases.
+ADAPTER_TO_INDEX: Optional[Dict[str, int]] = None
+
+
+def set_adapter_to_index(adapter_to_index: Dict[str, int]):
+    """Replace the module-level ADAPTER_TO_INDEX mapping."""
+    global ADAPTER_TO_INDEX
+    ADAPTER_TO_INDEX = adapter_to_index
+
+
+def get_adapter_to_index():
+    """Return the current ADAPTER_TO_INDEX mapping (None until it is set)."""
+    return ADAPTER_TO_INDEX
diff --git a/server/text_generation_server/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py
deleted file mode 100644
index accedf14..00000000
--- a/server/text_generation_server/models/gpt_neox.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import torch
-import torch.distributed
-
-from typing import Optional
-
-from transformers import (
-    AutoTokenizer,
-    AutoConfig,
-)
-from text_generation_server.models import CausalLM
-from text_generation_server.models.custom_modeling.neox_modeling import (
-    GPTNeoxForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-
-class GPTNeoxSharded(CausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            device = torch.device("cpu")
-            dtype = torch.float32
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        tokenizer.pad_token = tokenizer.eos_token
-
-        config = AutoConfig.from_pretrained(
-            model_id,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-        )
-        config.quantize = quantize
-
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames, device=device, dtype=dtype, process_group=self.process_group
-        )
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        model = GPTNeoxForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(CausalLM, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=True,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
-
-    def forward(
-        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
-    ):
-        outputs = self.model.forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            use_cache=True,
-        )
-
-        logits = outputs.logits
-        return logits, outputs.past_key_values
diff --git a/server/text_generation_server/models/idefics.py b/server/text_generation_server/models/idefics.py
index c54b539b..f2955bd0 100644
--- a/server/text_generation_server/models/idefics.py
+++ b/server/text_generation_server/models/idefics.py
@@ -31,6 +31,7 @@ class IDEFICSSharded(IdeficsCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        speculator: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -39,10 +40,10 @@ class IDEFICSSharded(IdeficsCausalLM):
             device = torch.device(f"cuda:{rank}")
             # 9b seems to work correctly enough in float16, but 80b seems
             # to be really saturating for f16.
-            dtype = torch.bfloat16 if dtype is None else dtype
+            dtype = torch.float16 if dtype is None else dtype
         else:
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
         self.device, self.dtype = device, dtype
 
         config = IdeficsConfig.from_pretrained(
@@ -51,6 +52,7 @@ class IDEFICSSharded(IdeficsCausalLM):
             trust_remote_code=trust_remote_code,
         )
         config.quantize = quantize
+        config.speculator = speculator
         config.vision_config.quantize = quantize
 
         tokenizer = LlamaTokenizerFast.from_pretrained(
@@ -81,6 +83,7 @@ class IDEFICSSharded(IdeficsCausalLM):
 
         torch.distributed.barrier(group=self.process_group)
         super(IdeficsCausalLM, self).__init__(
+            model_id=model_id,
             model=model,
             tokenizer=tokenizer,
             requires_padding=True,
diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
index ae2b8089..6c562980 100644
--- a/server/text_generation_server/models/idefics_causal_lm.py
+++ b/server/text_generation_server/models/idefics_causal_lm.py
@@ -1,45 +1,28 @@
-import torch
-import inspect
-import re
 from io import BytesIO
-import base64
 from PIL import Image
-import re
+import torch
+import time
 
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase, ProcessorMixin
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+)
 from typing import Optional, Tuple, List, Type, Dict
 
 from text_generation_server.models import Model
 from text_generation_server.models.types import (
     Batch,
-    PrefillTokens,
+    Tokens,
     Generation,
     GeneratedText,
 )
 from text_generation_server.pb import generate_pb2
 from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling
 
-import re
-
-IMAGES = re.compile(r'!\[[^\]]*\]\((.*?)\s*(\"(?:.*[^\"])\")?\s*\)')
-
-def split(string):
-    parts = []
-    cursor = 0
-    for pattern in IMAGES.finditer(string):
-        start = pattern.start()
-        if start != cursor:
-            parts.append(string[cursor:start])
-
-        parts.append(pattern.group(1))
-        cursor = pattern.end()
-
-    if cursor != len(string):
-        parts.append(string[cursor:])
-
-    return parts
 
 tracer = trace.get_tracer(__name__)
 
@@ -94,7 +77,18 @@ class IdeficsCausalLMBatch(Batch):
         cls,
         pb: generate_pb2.Batch,
         tokenizer: PreTrainedTokenizerBase,
-        processor: ProcessorMixin, # Hack
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> "IdeficsCausalLMBatch":
+        raise NotImplementedError
+
+    @classmethod
+    def from_pb_processor(
+        cls,
+        pb: generate_pb2.Batch,
+        tokenizer: PreTrainedTokenizerBase,
+        processor: ProcessorMixin,  # Hack
+        config,
         dtype: torch.dtype,
         device: torch.device,
     ) -> "IdeficsCausalLMBatch":
@@ -111,8 +105,10 @@ class IdeficsCausalLMBatch(Batch):
         max_decode_tokens = 0
         for i, r in enumerate(pb.requests):
             requests_idx_mapping[r.id] = i
-            inputs.append(r.inputs)
-            next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
+            inputs.append(r.input_chunks.chunks)
+            next_token_choosers.append(
+                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
+            )
             stopping_criteria = StoppingCriteria.from_pb(
                 r.stopping_parameters, tokenizer
             )
@@ -123,10 +119,21 @@ class IdeficsCausalLMBatch(Batch):
                 padding_right_offset, stopping_criteria.max_new_tokens
             )
 
+        # TODO Check impact on idefics
         prompts = []
         for inp in inputs:
             # Each input is encoded into a list, where each element of this input list is either a string or a URL
-            prompts.append(split(inp))
+            prompt = []
+            for chunk in inp:
+                chunk_type = chunk.WhichOneof("chunk")
+                if chunk_type == "text":
+                    prompt.append(chunk.text)
+                elif chunk_type == "image":
+                    image = Image.open(BytesIO(chunk.image.data))
+                    prompt.append(image)
+                else:
+                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
+            prompts.append(prompt)
 
         # The processor replaces the call to tokenizer, and
         # a/ takes care of fetching images from the URL
@@ -137,18 +144,23 @@ class IdeficsCausalLMBatch(Batch):
             padding=True,
             truncation=True,
             max_length=max_truncation,
-            add_end_of_utterance_token=False, # Already taken care of inside the prompts, so bypassing the processor's handling of this token
+            # TODO Check impact on idefics
+            # add_end_of_utterance_token=False,  # Already taken care of inside the prompts, so bypassing the processor's handling of this token
         ).to(device)
         for _ in pb.requests:
             input_len = tokenized_inputs["input_ids"].shape[1]
-            prefix_offsets.append(input_len - 5) # To decode without potential fallbacks errors
-            read_offsets.append(input_len) # To decode without potential fallbacks errors
+            prefix_offsets.append(
+                input_len - 5
+            )  # To decode without potential fallback errors
+            read_offsets.append(
+                input_len
+            )  # To decode without potential fallback errors
 
         input_lengths = tokenized_inputs["attention_mask"].sum(1)
         max_input_length = input_lengths.max()
 
         input_ids = tokenized_inputs["input_ids"]
-        pixel_values = tokenized_inputs["pixel_values"]
+        pixel_values = tokenized_inputs.get("pixel_values", None)
         image_hidden_states = None
         # Allocate maximum attention_mask
         attention_mask = input_ids.new_zeros(
@@ -157,15 +169,25 @@ class IdeficsCausalLMBatch(Batch):
         # Copy tokenizer attention_mask into fully allocated attention_mask
         attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"]
         # Do the same for image_attention_mask
-        image_attention_mask = input_ids.new_zeros(
-            (pb.size, max_input_length + padding_right_offset, tokenized_inputs["pixel_values"].size(1))
-        )
-        image_attention_mask[:, :max_input_length, :] = tokenized_inputs["image_attention_mask"]
-
+        if pixel_values is None:
+            image_attention_mask = None
+        else:
+            image_attention_mask = input_ids.new_zeros(
+                (
+                    pb.size,
+                    max_input_length + padding_right_offset,
+                    pixel_values.size(1),
+                )
+            )
+            image_attention_mask[:, :max_input_length, :] = tokenized_inputs[
+                "image_attention_mask"
+            ]
 
         position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1
         position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)
-        all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1) # It's input_ids but splitted into a tuple of tensors where each tensor is (seq_len, 1) size. It is then transformed into a list
+        all_input_ids = tokenized_inputs["input_ids"].T.split(
+            1, dim=1
+        )  # It's input_ids but split into a tuple of tensors where each tensor is (seq_len, 1) size. It is then transformed into a list
 
         max_tokens = len(inputs) * (max_input_length + max_decode_tokens)
 
@@ -259,7 +281,7 @@ class IdeficsCausalLMBatch(Batch):
                 self.image_attention_mask.shape[1] - self.padding_right_offset
             )
             + new_padding_right_offset,
-            :
+            :,
         ]
         if self.image_hidden_states is None:
             image_hidden_states = None
@@ -308,7 +330,9 @@ class IdeficsCausalLMBatch(Batch):
 
     @classmethod
     @tracer.start_as_current_span("concatenate")
-    def concatenate(cls, batches: List["IdeficsCausalLMBatch"]) -> "IdeficsCausalLMBatch":
+    def concatenate(
+        cls, batches: List["IdeficsCausalLMBatch"]
+    ) -> "IdeficsCausalLMBatch":
         # It adds new requests to the batch
         # Used for padding
         total_batch_size = 0
@@ -383,12 +407,20 @@ class IdeficsCausalLMBatch(Batch):
 
             curr_batch_max_num_images = batch.pixel_values.size(1)
             if pixel_values is None:
-                pixel_values = batch.pixel_values.new_zeros((total_batch_size, max_num_images, 3, 224, 224))
-            pixel_values[start_index:end_index, :curr_batch_max_num_images] = batch.pixel_values
+                pixel_values = batch.pixel_values.new_zeros(
+                    (total_batch_size, max_num_images, 3, 224, 224)
+                )
+            pixel_values[start_index:end_index, :curr_batch_max_num_images] = (
+                batch.pixel_values
+            )
 
             if image_attention_mask is None:
                 image_attention_mask = batch.image_attention_mask.new_zeros(
-                    (total_batch_size, max_input_length + padding_right_offset, max_num_images)
+                    (
+                        total_batch_size,
+                        max_input_length + padding_right_offset,
+                        max_num_images,
+                    )
                 )
 
             # We need to slice the attention mask to remove padding from previous steps
@@ -409,11 +441,9 @@ class IdeficsCausalLMBatch(Batch):
             image_attention_mask[
                 start_index:end_index,
                 left_offset:-padding_right_offset,
-                :curr_batch_max_num_images
+                :curr_batch_max_num_images,
             ] = batch.image_attention_mask[
-                :,
-                batch_left_offset : - batch.padding_right_offset,
-                :
+                :, batch_left_offset : -batch.padding_right_offset, :
             ]
 
             # Create empty tensor
@@ -479,14 +509,14 @@ class IdeficsCausalLMBatch(Batch):
                 # We slice the keys to remove the padding from previous batches
                 past_seq_len = batch.max_input_length - 1
                 if batch.keys_head_dim_last:
-                    padded_past_keys[
-                        start_index:end_index, :, -past_seq_len:, :
-                    ] = past_keys[:, :, -past_seq_len:, :]
+                    padded_past_keys[start_index:end_index, :, -past_seq_len:, :] = (
+                        past_keys[:, :, -past_seq_len:, :]
+                    )
                 else:
                     # BLOOM case
-                    padded_past_keys[
-                        start_index:end_index, :, :, -past_seq_len:
-                    ] = past_keys[:, :, :, -past_seq_len:]
+                    padded_past_keys[start_index:end_index, :, :, -past_seq_len:] = (
+                        past_keys[:, :, :, -past_seq_len:]
+                    )
                 del past_keys
 
                 start_index = end_index
@@ -504,9 +534,9 @@ class IdeficsCausalLMBatch(Batch):
                 end_index = start_index + len(batch)
                 # We slice the past values to remove the padding from previous batches
                 past_seq_len = batch.max_input_length - 1
-                padded_past_values[
-                    start_index:end_index, :, -past_seq_len:, :
-                ] = past_values[:, :, -past_seq_len:, :]
+                padded_past_values[start_index:end_index, :, -past_seq_len:, :] = (
+                    past_values[:, :, -past_seq_len:, :]
+                )
                 del past_values
 
                 # Update values
@@ -550,17 +580,19 @@ class IdeficsCausalLM(Model):
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
-        from text_generation_server.models.custom_modeling.idefics_modeling import IdeficsForVisionText2Text
+        from text_generation_server.models.custom_modeling.idefics_modeling import (
+            IdeficsForVisionText2Text,
+        )
 
         if torch.cuda.is_available():
             device = torch.device("cuda")
-            dtype = torch.float16 if dtype is None else dtype
+            dtype = torch.bfloat16 if dtype is None else dtype
         else:
             if quantize:
                 raise ValueError("quantization is not available on CPU")
 
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
@@ -580,9 +612,11 @@ class IdeficsCausalLM(Model):
             model_id,
             revision=revision,
             torch_dtype=dtype,
-            device_map="auto"
-            if torch.cuda.is_available() and torch.cuda.device_count() > 1
-            else None,
+            device_map=(
+                "auto"
+                if torch.cuda.is_available() and torch.cuda.device_count() > 1
+                else None
+            ),
             load_in_8bit=quantize == "bitsandbytes",
             trust_remote_code=trust_remote_code,
         )
@@ -600,6 +634,7 @@ class IdeficsCausalLM(Model):
                 tokenizer.add_special_tokens({"pad_token": "<unk>"})
 
         super(IdeficsCausalLM, self).__init__(
+            model_id=model_id,
             model=model,
             tokenizer=tokenizer,
             requires_padding=True,
@@ -611,11 +646,6 @@ class IdeficsCausalLM(Model):
     def batch_type(self) -> Type[IdeficsCausalLMBatch]:
         return IdeficsCausalLMBatch
 
-    def decode(self, generated_ids: List[int]) -> str:
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-
     def forward(
         self,
         input_ids,
@@ -640,26 +670,39 @@ class IdeficsCausalLM(Model):
         if self.has_position_ids:
             kwargs["position_ids"] = position_ids
 
-        outputs = self.model.forward(**kwargs)
-        return outputs.logits, outputs.past_key_values, outputs.image_hidden_states
+        outputs, speculative_logits = self.model.forward(**kwargs)
+        return (
+            outputs.logits,
+            speculative_logits,
+            outputs.past_key_values,
+            outputs.image_hidden_states,
+        )
 
     @tracer.start_as_current_span("generate_token")
     def generate_token(
         self, batch: IdeficsCausalLMBatch
-    ) -> Tuple[List[Generation], Optional[IdeficsCausalLMBatch]]:
+    ) -> Tuple[List[Generation], Optional[IdeficsCausalLMBatch], Tuple[int, int]]:
+        start = time.time_ns()
         # slice the attention mask to the correct shape
         attention_mask = batch.attention_mask[:, : -batch.padding_right_offset]
-        if batch.input_ids.size(1) == 1:
-            # THIS is a hack: when calling idefics.generate, the first time, we need the whole image_attention_mask (size bs x max_seq_len x max_num_images),
-            # but the subsequent times, we only need the last attention mask along the `max_seq_len` dimension
-            # this is due to the nature IDEFICS: it's an encoder decoder, and so when decoding, only the currently generated
-            # token need to attend to the encoder hidden states (i.e. the vision encoder)
-            # Also see seq2seq_lm.Seq2SeqLM.generate_token which has roughly the same logic
-            image_attention_mask = batch.image_attention_mask[:, -(batch.padding_right_offset+1)].unsqueeze(1)
+        if batch.image_attention_mask is None:
+            image_attention_mask = None
         else:
-            image_attention_mask = batch.image_attention_mask[:, : -batch.padding_right_offset]
+            if batch.input_ids.size(1) == 1:
+                # THIS is a hack: when calling idefics.generate, the first time, we need the whole image_attention_mask (size bs x max_seq_len x max_num_images),
+                # but the subsequent times, we only need the last attention mask along the `max_seq_len` dimension
+                # this is due to the nature of IDEFICS: it's an encoder decoder, and so when decoding, only the currently generated
+                # token needs to attend to the encoder hidden states (i.e. the vision encoder)
+                # Also see seq2seq_lm.Seq2SeqLM.generate_token which has roughly the same logic
+                image_attention_mask = batch.image_attention_mask[
+                    :, -(batch.padding_right_offset + 1)
+                ].unsqueeze(1)
+            else:
+                image_attention_mask = batch.image_attention_mask[
+                    :, : -batch.padding_right_offset
+                ]
 
-        logits, past, image_hidden_states = self.forward(
+        logits, speculative_logits, past, image_hidden_states = self.forward(
             input_ids=batch.input_ids,
             attention_mask=attention_mask,
             position_ids=batch.position_ids,
@@ -671,6 +714,8 @@ class IdeficsCausalLM(Model):
         # Hardcoded remove image tokens
         logits[:, 32000:32001] = torch.finfo(logits.dtype).min
 
+        start_decode = time.time_ns()
+
         # Results
         generations: List[Generation] = []
         stopped = True
@@ -728,8 +773,14 @@ class IdeficsCausalLM(Model):
             if i % self.world_size == self.rank:
                 if stop:
                     # Decode generated tokens
-                    output_text = self.decode(
-                        all_input_ids[-stopping_criteria.current_tokens :, 0]
+                    output_text, _, _ = self.decode_token(
+                        all_input_ids[:, 0],
+                        prefix_offset=len(all_input_ids)
+                        - stopping_criteria.current_tokens
+                        - 1,
+                        read_offset=len(all_input_ids)
+                        - stopping_criteria.current_tokens,
+                        skip_special_tokens=True,
                     )
                     # Get seed
                     if isinstance(next_token_chooser.choice, Sampling):
@@ -757,21 +808,26 @@ class IdeficsCausalLM(Model):
                         clean_up_tokenization_spaces=False,
                         skip_special_tokens=False,
                     )
-                    prefill_tokens = PrefillTokens(
-                        prefill_token_ids, prefill_logprobs, prefill_texts
+                    prefill_tokens = Tokens(
+                        prefill_token_ids,
+                        prefill_logprobs,
+                        prefill_texts,
+                        is_special=[],
                     )
                 else:
                     prefill_tokens = None
 
-                top_tokens=None
+                top_tokens = None
 
                 generation = Generation(
                     request.id,
                     prefill_tokens,
-                    next_token_id_squeezed,
-                    next_token_logprob,
-                    next_token_text,
-                    next_token_id_squeezed.item() in self.all_special_ids,
+                    Tokens(
+                        [next_token_id_squeezed],
+                        [next_token_logprob],
+                        [next_token_text],
+                        [next_token_id_squeezed.item() in self.all_special_ids],
+                    ),
                     generated_text,
                     top_tokens,
                 )
@@ -779,6 +835,9 @@ class IdeficsCausalLM(Model):
                 generations.append(generation)
 
             # Update values
+            batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar(
+                next_token_id_squeezed.item()
+            )
             batch.input_ids[i, 0] = next_token_id
             batch.all_input_ids[i] = all_input_ids
             batch.input_lengths[i] = new_input_length
@@ -788,14 +847,18 @@ class IdeficsCausalLM(Model):
 
         # We finished all generations in the batch; there is no next batch
         if stopped:
-            return generations, None
+            forward_ns = start_decode - start
+            decode_ns = time.time_ns() - start_decode
+            return generations, None, (forward_ns, decode_ns)
 
         # Slice unused values from prefill
         batch.input_ids = batch.input_ids[:, :1]
 
         # Update attention_mask as we added a new token to input_ids
         batch.attention_mask[:, -batch.padding_right_offset] = 1
-        batch.image_attention_mask[:, -batch.padding_right_offset, :] = batch.image_attention_mask[:, -(batch.padding_right_offset+1), :]
+        batch.image_attention_mask[:, -batch.padding_right_offset, :] = (
+            batch.image_attention_mask[:, -(batch.padding_right_offset + 1), :]
+        )
         # Decrease right offset
         batch.padding_right_offset -= 1
 
@@ -806,4 +869,6 @@ class IdeficsCausalLM(Model):
         batch.past_key_values = past
         batch.image_hidden_states = image_hidden_states
 
-        return generations, batch
+        forward_ns = start_decode - start
+        decode_ns = time.time_ns() - start_decode
+        return generations, batch, (forward_ns, decode_ns)
diff --git a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py
new file mode 100644
index 00000000..9189b45c
--- /dev/null
+++ b/server/text_generation_server/models/mamba.py
@@ -0,0 +1,805 @@
+import torch
+import torch.distributed
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+from typing import Optional
+import os
+from text_generation_server.models.custom_modeling.mamba_modeling import (
+    MambaConfig,
+)
+from loguru import logger
+from text_generation_server.pb import generate_pb2
+from text_generation_server.utils import (
+    initialize_torch_distributed,
+    weight_files,
+    Weights,
+)
+from text_generation_server.models.globals import CUDA_GRAPHS, MEM_POOL
+import time
+from text_generation_server.models.custom_modeling.mamba_modeling import (
+    MambaModel,
+    InferenceParams,
+)
+from text_generation_server.models import Model
+from typing import Any, List, Optional, Tuple, Type, Dict
+from text_generation_server.models.types import (
+    Batch,
+    Tokens,
+    Generation,
+    GeneratedText,
+)
+from text_generation_server.utils.chunks import concat_text_chunks
+from text_generation_server.utils.tokens import batch_top_tokens, Sampling
+from dataclasses import dataclass
+from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling
+
+
+def new_inference_params(
+    n_blocks: int,
+    batch_size: int,
+    d_inner: int,
+    d_conv: int,
+    d_state: int,
+    seqlen_offset: int,
+    dtype: torch.dtype,
+    device: torch.device,
+):
+    """Create an `InferenceParams` holding zero-filled state buffers.
+
+    Two tensors are allocated on `device` with `dtype`:
+      - conv_states of shape (n_blocks, batch_size, d_inner, d_conv)
+      - ssm_states of shape (n_blocks, batch_size, d_inner, d_state)
+
+    `max_seqlen` is always passed as 0, `batch_size` is forwarded as
+    `max_batch_size` and `seqlen_offset` is forwarded unchanged.
+    """
+
+    def _zero_state(last_dim: int) -> torch.Tensor:
+        # Both buffers share their first three dimensions; only the last
+        # one differs (d_conv vs. d_state).
+        return torch.zeros(
+            (n_blocks, batch_size, d_inner, last_dim),
+            device=device,
+            dtype=dtype,
+        )
+
+    conv_states = _zero_state(d_conv)
+    ssm_states = _zero_state(d_state)
+
+    return InferenceParams(
+        max_seqlen=0,
+        max_batch_size=batch_size,
+        seqlen_offset=seqlen_offset,
+        conv_states=conv_states,
+        ssm_states=ssm_states,
+    )
+
+
+@dataclass
+class MambaBatch(Batch):
+    batch_id: int
+    requests: List[generate_pb2.Request]
+    requests_idx_mapping: Dict[int, int]
+
+    # Decoder values
+    input_ids: torch.Tensor
+
+    # All tokens
+    all_input_ids: List[torch.Tensor]
+
+    # Lengths of all generations present in the batch
+    input_lengths: List[int]
+    prefix_offsets: List[int]
+    read_offsets: List[int]
+
+    # Generation helpers
+    next_token_choosers: List[NextTokenChooser]
+    stopping_criterias: List[StoppingCriteria]
+    top_n_tokens: List[int]
+    top_n_tokens_tensor: torch.Tensor
+
+    # Metadata used for padding
+    max_input_length: int
+    padding_right_offset: int
+
+    # Maximum number of tokens this batch will grow to
+    max_tokens: int
+
+    # Past metadata
+    keys_head_dim_last: bool = True
+
+    # Inference params
+    inference_params: Optional[Dict[str, Any]] = None
+
+    def to_pb(self) -> generate_pb2.CachedBatch:
+        """Describe this batch as a `generate_pb2.CachedBatch` message."""
+        request_ids = [request.id for request in self.requests]
+        fields = dict(id=self.batch_id, request_ids=request_ids)
+        # `size` comes from `len(self)`, so the batch class must define `__len__`.
+        fields.update(size=len(self), max_tokens=self.max_tokens)
+        return generate_pb2.CachedBatch(**fields)
+
+    @classmethod
+    def from_pb(
+        cls,
+        pb: generate_pb2.Batch,
+        tokenizer: PreTrainedTokenizerBase,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> "MambaBatch":
+        """Build a batch from its protobuf description.
+
+        `dtype` is accepted for interface compatibility but is unused here.
+        """
+        inputs = []
+        next_token_choosers = []
+        stopping_criterias = []
+        top_n_tokens = []
+        requests_idx_mapping = {}
+
+        max_truncation = 0
+        padding_right_offset = 0
+        max_decode_tokens = 0
+        for i, r in enumerate(pb.requests):
+            requests_idx_mapping[r.id] = i
+            inputs.append(concat_text_chunks(r.input_chunks.chunks))
+            next_token_choosers.append(
+                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
+            )
+            stopping_criteria = StoppingCriteria.from_pb(
+                r.stopping_parameters, tokenizer
+            )
+            stopping_criterias.append(stopping_criteria)
+            top_n_tokens.append(r.top_n_tokens)
+            max_truncation = max(max_truncation, r.truncate)
+            max_decode_tokens += stopping_criteria.max_new_tokens
+            padding_right_offset = max(
+                padding_right_offset, stopping_criteria.max_new_tokens
+            )
+
+        tokenized_inputs = tokenizer(
+            inputs,
+            return_tensors="pt",
+            padding=True,
+            return_token_type_ids=False,
+            truncation=True,
+            max_length=max_truncation,
+        ).to(device)
+        input_ids = tokenized_inputs["input_ids"]
+        # Every row is padded to the same width, so the offsets are shared.
+        input_len = input_ids.shape[1]
+        prefix_offsets = [input_len - 5] * len(pb.requests)
+        read_offsets = [input_len] * len(pb.requests)
+        input_lengths = tokenized_inputs["attention_mask"].sum(1)
+        # `.item()` here keeps `max_tokens` a Python int, as the field declares.
+        max_input_length = input_lengths.max().item()
+        all_input_ids = input_ids.T.split(1, dim=1)
+        top_n_tokens_tensor = torch.tensor(
+            top_n_tokens, device=device, dtype=torch.int64
+        )
+        max_tokens = len(inputs) * (max_input_length + max_decode_tokens)
+        return cls(
+            batch_id=pb.id,
+            requests=pb.requests,
+            requests_idx_mapping=requests_idx_mapping,
+            input_ids=input_ids,
+            all_input_ids=list(all_input_ids),
+            input_lengths=input_lengths.tolist(),
+            prefix_offsets=prefix_offsets,
+            read_offsets=read_offsets,
+            next_token_choosers=next_token_choosers,
+            stopping_criterias=stopping_criterias,
+            top_n_tokens=top_n_tokens,
+            top_n_tokens_tensor=top_n_tokens_tensor,
+            max_input_length=max_input_length,
+            padding_right_offset=padding_right_offset,
+            max_tokens=max_tokens,
+        )
+
+    def filter(self, request_ids: List[int]) -> Optional["MambaBatch"]:
+        """Shrink the batch in place to `request_ids`, in the given order.
+
+        Raises `ValueError` on empty input; untouched if the lengths already match.
+        """
+        if len(request_ids) == 0:
+            raise ValueError("Batch must have at least one request")
+        if len(request_ids) == len(self):
+            return self
+
+        keep_indices = []
+
+        # New values after filtering
+        requests_idx_mapping = {}
+        requests = []
+        input_lengths = []
+        prefix_offsets = []
+        read_offsets = []
+        all_input_ids = []
+        max_input_length = 0
+
+        next_token_choosers = []
+        stopping_criterias = []
+        top_n_tokens = []
+
+        total_remaining_decode_tokens = 0
+        new_padding_right_offset = 0
+
+        for i, request_id in enumerate(request_ids):
+            idx = self.requests_idx_mapping[request_id]
+            requests_idx_mapping[request_id] = i
+            keep_indices.append(idx)
+
+            requests.append(self.requests[idx])
+            prefix_offsets.append(self.prefix_offsets[idx])
+            read_offsets.append(self.read_offsets[idx])
+            all_input_ids.append(self.all_input_ids[idx])
+
+            request_input_length = self.input_lengths[idx]
+            input_lengths.append(request_input_length)
+            max_input_length = max(max_input_length, request_input_length)
+
+            next_token_choosers.append(self.next_token_choosers[idx])
+            stopping_criteria = self.stopping_criterias[idx]
+            stopping_criterias.append(stopping_criteria)
+            top_n_tokens.append(self.top_n_tokens[idx])
+            remaining_decode_tokens = (
+                stopping_criteria.max_new_tokens - stopping_criteria.current_tokens
+            )
+            total_remaining_decode_tokens += remaining_decode_tokens
+            new_padding_right_offset = max(
+                new_padding_right_offset, remaining_decode_tokens
+            )
+
+        # Apply indices to the batched tensors
+        input_ids = self.input_ids[keep_indices]
+
+        top_n_tokens_tensor = self.top_n_tokens_tensor[keep_indices]
+        max_tokens = len(request_ids) * max_input_length + total_remaining_decode_tokens
+
+        self.requests = requests
+        self.requests_idx_mapping = requests_idx_mapping
+        self.input_ids = input_ids
+        self.all_input_ids = all_input_ids
+        self.input_lengths = input_lengths
+        self.prefix_offsets = prefix_offsets
+        self.read_offsets = read_offsets
+        self.next_token_choosers = next_token_choosers
+        self.stopping_criterias = stopping_criterias
+        self.top_n_tokens = top_n_tokens
+        self.top_n_tokens_tensor = top_n_tokens_tensor
+        self.max_input_length = max_input_length
+        self.padding_right_offset = new_padding_right_offset
+        self.max_tokens = max_tokens
+
+        # TODO: only the states are filtered; other CPU values may need updating too.
+        params = self.inference_params
+        params.conv_states = params.conv_states[:, keep_indices]
+        params.ssm_states = params.ssm_states[:, keep_indices]
+        return self
+
+    @classmethod
+    def concatenate(cls, batches: List["MambaBatch"]) -> "MambaBatch":
+        # Merge `batches` into a single batch. This first pass gathers the sizes used for padding
+        total_batch_size = 0
+        max_input_length = 0
+        padding_right_offset = 0
+        for batch in batches:
+            total_batch_size += len(batch)
+            max_input_length = max(max_input_length, batch.max_input_length)
+            padding_right_offset = max(padding_right_offset, batch.padding_right_offset)
+
+        # Batch attributes (per-request lists, extended batch by batch in the loop below)
+        requests = []
+        requests_idx_mapping = {}
+        input_lengths = []
+        prefix_offsets = []
+        read_offsets = []
+        all_input_ids = []
+        next_token_choosers = []
+        stopping_criterias = []
+        top_n_tokens = []
+        max_tokens = 0
+        max_seqlen = 0  # NOTE(review): never read in this method
+        seqlen_offset = 0  # starting value only; raised to the max over batches in the loop
+
+        (n_blocks, _, d_inner, d_conv) = batches[0].inference_params.conv_states.shape
+        (_, _, _, d_state) = batches[0].inference_params.ssm_states.shape
+        dtype = batches[0].inference_params.conv_states.dtype
+        device = batches[0].inference_params.conv_states.device
+        inference_params = new_inference_params(
+            n_blocks=n_blocks,
+            batch_size=total_batch_size,
+            d_state=d_state,
+            d_conv=d_conv,
+            d_inner=d_inner,
+            seqlen_offset=seqlen_offset,
+            device=device,
+            dtype=dtype,
+        )
+
+        # Batch tensors (allocated on the first loop iteration)
+        input_ids = None
+        top_n_tokens_tensor = None
+
+        # Used for slicing correctly inside the tensors
+        # Equivalent to a cumsum on batch sizes
+        start_index = 0
+        for i, batch in enumerate(batches):
+            requests.extend(batch.requests)
+            input_lengths.extend(batch.input_lengths)
+            prefix_offsets.extend(batch.prefix_offsets)
+            read_offsets.extend(batch.read_offsets)
+            all_input_ids.extend(batch.all_input_ids)
+            next_token_choosers.extend(batch.next_token_choosers)
+            stopping_criterias.extend(batch.stopping_criterias)
+            top_n_tokens.extend(batch.top_n_tokens)
+
+            if i == 0:
+                requests_idx_mapping = batch.requests_idx_mapping  # same dict object as batches[0]'s; mutated below
+            else:
+                # We need to offset the mapping for each batch by the cumulative batch size
+                for k, v in batch.requests_idx_mapping.items():
+                    requests_idx_mapping[k] = v + start_index
+
+            # Slicing end index for this batch
+            end_index = start_index + len(batch)
+
+            # Create empty tensor
+            # input_ids is always of shape [batch_size, 1]
+            # We do not need to pad it
+            if input_ids is None:
+                input_ids = batch.input_ids.new_empty((total_batch_size, 1))
+            # Copy to correct indices
+            input_ids[start_index:end_index] = batch.input_ids
+
+            if top_n_tokens_tensor is None:
+                top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros(
+                    total_batch_size,
+                )
+            top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor
+
+            # Count the extra padding tokens introduced by aligning every batch to max_input_length
+            max_tokens += batch.max_tokens + (
+                max_input_length - batch.max_input_length
+            ) * len(batch)
+
+            inference_params.max_seqlen = max(
+                inference_params.max_seqlen, batch.inference_params.max_seqlen
+            )
+            assert batch.inference_params.seqlen_offset != 0, "Invalid seqlen offset"
+            inference_params.seqlen_offset = max(
+                inference_params.seqlen_offset, batch.inference_params.seqlen_offset
+            )
+
+            inference_params.conv_states[:, start_index:end_index] = (  # dim 1 is sliced per batch
+                batch.inference_params.conv_states
+            )
+            inference_params.ssm_states[:, start_index:end_index] = (
+                batch.inference_params.ssm_states
+            )
+
+            start_index = end_index
+
+        return cls(
+            batch_id=batches[0].batch_id,
+            requests=requests,
+            requests_idx_mapping=requests_idx_mapping,
+            input_ids=input_ids,
+            all_input_ids=all_input_ids,
+            input_lengths=input_lengths,
+            prefix_offsets=prefix_offsets,
+            read_offsets=read_offsets,
+            next_token_choosers=next_token_choosers,
+            stopping_criterias=stopping_criterias,
+            top_n_tokens=top_n_tokens,
+            top_n_tokens_tensor=top_n_tokens_tensor,
+            max_input_length=max_input_length,
+            padding_right_offset=padding_right_offset,
+            keys_head_dim_last=batches[0].keys_head_dim_last,
+            max_tokens=max_tokens,
+            inference_params=inference_params,
+        )
+
+    def __len__(self):  # batch size, i.e. the number of requests held
+        return len(self.requests)
+
+
+class Mamba(Model):
+    def __init__(  # load tokenizer, config and weights, then build the MambaModel
+        self,
+        model_id: str,
+        revision: Optional[str] = None,
+        quantize: Optional[str] = None,
+        speculator: Optional[str] = None,
+        dtype: Optional[torch.dtype] = None,
+        trust_remote_code: bool = False,
+    ):
+        self.process_group, _rank, world_size = initialize_torch_distributed()
+        if world_size > 1:
+            raise RuntimeError("Mamba does not support Tensor Parallelism (TP)")
+        self.cuda_graphs = {}  # batch size -> captured graph entry, filled by cuda_graph_warmup
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+            # Bf16 is important. In f16, accumulations in the matmul cause
+            # differences while the server is under load.
+            # This is detectable by the integration load test
+            dtype = torch.bfloat16 if dtype is None else dtype
+        else:
+            if quantize:
+                raise ValueError("quantization is not available on CPU")
+
+            device = torch.device("cpu")
+            dtype = torch.float32 if dtype is None else dtype
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            "EleutherAI/gpt-neox-20b",  # tokenizer repo is hard-coded, not derived from model_id
+            revision=revision,  # NOTE(review): model revision applied to the tokenizer repo — confirm intended
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        config = MambaConfig.from_pretrained(
+            model_id, revision=revision, trust_remote_code=trust_remote_code
+        )
+
+        tokenizer.bos_token_id = config.bos_token_id
+        tokenizer.eos_token_id = config.eos_token_id
+        tokenizer.pad_token = tokenizer.eos_token  # pad with the EOS token
+
+        config.quantize = quantize
+        config.speculator = speculator
+        torch.distributed.barrier(group=self.process_group)
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+        weights = Weights(filenames, device, dtype, process_group=self.process_group)
+        model = MambaModel(config, weights)
+        torch.distributed.barrier(group=self.process_group)
+        super(Mamba, self).__init__(
+            model_id=model_id,
+            model=model,
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+        )
+
+    @property
+    def batch_type(self) -> Type[MambaBatch]:  # batch class used with this model
+        return MambaBatch
+
+    def warmup(self, batch) -> Optional[int]:  # `batch` is not used; always returns None
+        # Only captures decode cuda graphs. TODO: implement a real warmup for Mamba if needed
+        if CUDA_GRAPHS:
+            if self.speculate is None or self.speculate == 0:  # no graphs when speculating
+                try:
+                    logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}")
+                    # Warmup cuda graphs
+                    for bs in CUDA_GRAPHS:
+                        self.cuda_graph_warmup(bs)
+                except Exception:  # best effort: log the failure and carry on
+                    logger.exception(f"Decode cuda graph warmup failed")
+        else:
+            logger.info(f"Cuda Graphs are disabled (CUDA_GRAPHS={CUDA_GRAPHS}).")
+
+        return None
+
+    def cuda_graph_warmup(self, batch_size: int):  # capture one decode-step graph for this size
+        input_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=self.device)
+        n_blocks = len(self.model.blocks)
+
+        d_state = self.model.config.d_state
+        d_conv = self.model.config.d_conv
+        # Inner takes the expand multiplication
+        d_inner = self.model.config.d_inner
+
+        # Important seqlen_offset to go through the update mechanism with the state
+        seqlen_offset = 1
+        inference_params = new_inference_params(
+            n_blocks=n_blocks,
+            batch_size=batch_size,
+            d_state=d_state,
+            d_conv=d_conv,
+            d_inner=d_inner,
+            seqlen_offset=seqlen_offset,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        graph = torch.cuda.CUDAGraph()
+
+        torch.cuda.synchronize()
+        # Run once outside the capture to warm up
+        self.model.forward(input_ids=input_ids, inference_params=inference_params)
+        torch.cuda.synchronize()
+
+        with torch.cuda.graph(graph, pool=MEM_POOL):
+            logits, speculative_logits = self.model.forward(
+                input_ids=input_ids, inference_params=inference_params
+            )
+        torch.cuda.synchronize()
+        graph_dict = {  # static tensors plus the graph; forward() copies into and out of these
+            "input_ids": input_ids,
+            "inference_params": inference_params,
+            "graph": graph,
+            "logits": logits,
+            "speculative_logits": speculative_logits,
+        }
+        self.cuda_graphs[batch_size] = graph_dict
+
+    def tunableop_warmup(self, seqlen: int):  # one dummy decode-style forward pass with `seqlen` rows
+        input_ids = torch.zeros((seqlen, 1), dtype=torch.int64, device=self.device)
+        n_blocks = len(self.model.blocks)
+
+        d_state = self.model.config.d_state
+        d_conv = self.model.config.d_conv
+        # Inner takes the expand multiplication
+        d_inner = self.model.config.d_inner
+
+        # Important seqlen_offset to go through the update mechanism with the state
+        seqlen_offset = 1
+        inference_params = new_inference_params(
+            n_blocks=n_blocks,
+            batch_size=seqlen,  # state rows must line up with the rows of input_ids
+            d_state=d_state,
+            d_conv=d_conv,
+            d_inner=d_inner,
+            seqlen_offset=seqlen_offset,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        self.model.forward(input_ids=input_ids, inference_params=inference_params)
+
+    def forward(  # replay a captured cuda graph when one fits, otherwise call the model directly
+        self, input_ids: torch.Tensor, inference_params: Any
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        bs = input_ids.shape[0]
+        padded_bs = bs  # bs < 3 is looked up unchanged
+        if bs == 3:
+            padded_bs = 4
+        elif 3 < bs <= 8:
+            padded_bs = 8
+        elif bs > 8:
+            padded_bs = (bs + 7) // 8 * 8  # round up to the next multiple of 8
+
+        # Try to find an associated cuda graph
+        cuda_graph = self.cuda_graphs.get(padded_bs, None)
+        is_prefill = inference_params is None or inference_params.seqlen_offset == 0
+
+        if is_prefill or cuda_graph is None:  # eager path: plain model call
+            return self.model(
+                input_ids,
+                inference_params=inference_params,
+            )
+
+        # Copy inputs to the static inputs of the cuda graph
+        # Static inputs are potentially padded
+        cuda_graph["input_ids"][:bs] = input_ids
+        cuda_graph["inference_params"].conv_states[
+            :, :bs
+        ] = inference_params.conv_states
+        cuda_graph["inference_params"].ssm_states[:, :bs] = inference_params.ssm_states
+
+        # Replay the graph
+        cuda_graph["graph"].replay()
+
+        inference_params.conv_states.copy_(  # write the updated states back into the caller's tensors
+            cuda_graph["inference_params"].conv_states[:, :bs]
+        )
+        inference_params.ssm_states.copy_(
+            cuda_graph["inference_params"].ssm_states[:, :bs]
+        )
+        # Slice output to the correct shape
+        speculative_logits = (
+            cuda_graph["speculative_logits"][:bs]
+            if cuda_graph["speculative_logits"] is not None
+            else None
+        )
+        logits = cuda_graph["logits"][:bs]
+        return logits, speculative_logits
+
+    def generate_token(self, batch) -> Tuple[List[Any], Optional[Any], Tuple[int, int]]:
+        start = time.time_ns()
+        input_ids = (
+            batch.input_ids
+        )  # batch.past_input_ids if batch.past_input_ids is not None else batch.input_ids
+
+        batch_size, max_seqlen = input_ids.shape  # max_seqlen is not used below
+        # Inference params
+
+        if batch.inference_params is None:  # no state attached to the batch yet
+            # 0 is important here: forward() treats seqlen_offset == 0 as prefill (no cuda graph)
+            seqlen_offset = 0
+            n_blocks = len(self.model.blocks)
+            d_state = self.model.config.d_state
+            d_conv = self.model.config.d_conv
+            d_inner = self.model.config.d_inner
+            inference_params = new_inference_params(
+                n_blocks=n_blocks,
+                batch_size=batch_size,
+                d_state=d_state,
+                d_conv=d_conv,
+                d_inner=d_inner,
+                seqlen_offset=seqlen_offset,
+                device=self.device,
+                dtype=self.dtype,
+            )
+            batch.inference_params = inference_params
+
+        # Forward pass
+        logits, speculative_logits = self.forward(
+            input_ids, inference_params=batch.inference_params
+        )
+
+        # batch.inference_params = new_inference_params
+        # Results
+        generations: List[Generation] = []
+        stopped = True  # set to False as soon as one request has not finished
+
+        # Speculation is not active for causal
+        accepted_ids = torch.ones_like(batch.input_ids)[:, 0]
+        batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
+            batch.top_n_tokens,
+            batch.top_n_tokens_tensor,
+            torch.log_softmax(logits[:, -1], -1),
+            accepted_ids,
+        )
+
+        start_decode = time.time_ns()
+
+        # Zipped iterator
+        iterator = zip(
+            batch.requests,
+            batch.input_lengths,
+            batch.prefix_offsets,
+            batch.read_offsets,
+            logits,
+            batch.next_token_choosers,
+            batch.stopping_criterias,
+            batch.all_input_ids,
+            batch.top_n_tokens,
+            batch_top_token_ids,
+            batch_top_token_logprobs,
+        )
+
+        # For each member of the batch
+        for i, (
+            request,
+            input_length,
+            prefix_offset,
+            read_offset,
+            logits,
+            next_token_chooser,
+            stopping_criteria,
+            all_input_ids,
+            top_n_tokens,
+            top_token_ids,
+            top_token_logprobs,
+        ) in enumerate(iterator):
+            # Select next token
+            next_token_id, logprobs = next_token_chooser(
+                all_input_ids.view(1, -1), logits[-1:, :]
+            )
+
+            # Append next token to all tokens
+            all_input_ids = torch.cat([all_input_ids, next_token_id])
+            new_input_length = input_length + 1
+
+            # Generated token
+            next_token_logprob = logprobs[-1, next_token_id]
+            next_token_id_squeezed = next_token_id.squeeze()
+            next_token_text, prefix_offset, read_offset = self.decode_token(
+                all_input_ids[:, 0], prefix_offset, read_offset
+            )
+
+            # Evaluate stopping criteria
+            stop, reason = stopping_criteria(
+                next_token_id_squeezed,
+                next_token_text,
+            )
+
+            if not stop:
+                stopped = False
+
+            # Shard generations
+            # All generations will be appended in the rust sharded client
+            if i % self.world_size == self.rank:  # NOTE(review): the batch updates below also sit under this check
+                if stop:
+                    # Decode generated tokens
+                    output_text, _, _ = self.decode_token(
+                        all_input_ids[:, 0],
+                        prefix_offset=len(all_input_ids)
+                        - stopping_criteria.current_tokens
+                        - 1,
+                        read_offset=len(all_input_ids)
+                        - stopping_criteria.current_tokens,
+                        skip_special_tokens=True,
+                    )
+                    # Get seed
+                    if isinstance(next_token_chooser.choice, Sampling):
+                        seed = next_token_chooser.choice.seed
+                    else:
+                        seed = None
+
+                    generated_text = GeneratedText(
+                        output_text, stopping_criteria.current_tokens, reason, seed
+                    )
+                else:
+                    generated_text = None
+
+                if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
+                    # Remove generated token to only have prefill and add nan for first prompt token
+                    prefill_logprobs = [float("nan")] + torch.log_softmax(
+                        logits, -1
+                    ).gather(1, all_input_ids[1:]).squeeze(1)[
+                        -new_input_length:-1
+                    ].tolist()
+                    prefill_token_ids = all_input_ids[-new_input_length:-1]
+                    prefill_texts = self.tokenizer.batch_decode(
+                        prefill_token_ids,
+                        clean_up_tokenization_spaces=False,
+                        skip_special_tokens=False,
+                    )
+                    prefill_tokens = Tokens(
+                        prefill_token_ids,
+                        prefill_logprobs,
+                        prefill_texts,
+                        is_special=[],
+                    )
+                else:
+                    prefill_tokens = None
+
+                if top_n_tokens > 0:
+                    toptoken_texts = self.tokenizer.batch_decode(
+                        top_token_ids,
+                        clean_up_tokenization_spaces=False,
+                        skip_special_tokens=False,
+                    )
+                    special_toptokens = [
+                        token_id in self.all_special_ids for token_id in top_token_ids
+                    ]
+                    top_tokens = Tokens(
+                        top_token_ids,
+                        top_token_logprobs,
+                        toptoken_texts,
+                        special_toptokens,
+                    )
+                else:
+                    top_tokens = None
+
+                generation = Generation(
+                    request.id,
+                    prefill_tokens,
+                    Tokens(
+                        [next_token_id_squeezed],
+                        [next_token_logprob],
+                        [next_token_text],
+                        [next_token_id_squeezed.item() in self.all_special_ids],
+                    ),
+                    generated_text,
+                    top_tokens,
+                )
+
+                generations.append(generation)
+
+                # Update the batch in place so the next call decodes from the new token
+                batch.next_token_choosers[i] = batch.next_token_choosers[
+                    i
+                ].advance_grammar(next_token_id_squeezed.item())
+                batch.input_ids[i, 0] = next_token_id
+                batch.all_input_ids[i] = all_input_ids
+                batch.input_lengths[i] = new_input_length
+                batch.prefix_offsets[i] = prefix_offset
+                batch.read_offsets[i] = read_offset
+                batch.max_input_length = max(batch.max_input_length, new_input_length)
+
+        # We finished all generations in the batch; there is no next batch
+        if stopped:
+            forward_ns = start_decode - start
+            decode_ns = time.time_ns() - start_decode
+            return generations, None, (forward_ns, decode_ns)
+
+        # Slice unused values from prefill
+        batch.input_ids = batch.input_ids[:, :1]
+
+        forward_ns = start_decode - start
+        decode_ns = time.time_ns() - start_decode
+        return generations, batch, (forward_ns, decode_ns)
diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
index 806e9833..09130b85 100644
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@@ -2,11 +2,24 @@ import inspect
 import torch
 
 from abc import ABC, abstractmethod
-from typing import List, Tuple, Optional, TypeVar, Type
+from typing import List, Tuple, Optional, TypeVar, Type, Dict, DefaultDict
+from collections import defaultdict
 from transformers import PreTrainedTokenizerBase, PretrainedConfig
 
 from text_generation_server.models.types import Batch, Generation
+from text_generation_server.utils.speculate import get_speculate
 from text_generation_server.pb.generate_pb2 import InfoResponse
+from text_generation_server.adapters.weights import LayerAdapterWeights
+from text_generation_server.utils.adapter import (
+    load_and_merge_adapters,
+    AdapterParameters,
+    AdapterSource,
+)
+from loguru import logger
+
+
+BASE_MODEL_ADAPTER_ID = "__base_model__"
+
 
 B = TypeVar("B", bound=Batch)
 
@@ -14,6 +27,7 @@ B = TypeVar("B", bound=Batch)
 class Model(ABC):
     def __init__(
         self,
+        model_id: str,
         model: torch.nn.Module,
         tokenizer: PreTrainedTokenizerBase,
         requires_padding: bool,
@@ -21,15 +35,38 @@ class Model(ABC):
         device: torch.device,
         rank: int = 0,
         world_size: int = 1,
+        sliding_window: Optional[int] = None,
+        speculate: Optional[int] = None,
+        adapter_id: str = BASE_MODEL_ADAPTER_ID,
     ):
+        self.model_id = model_id
         self.model = model.eval()
         self.tokenizer = tokenizer
+
+        # all_special_ids is not set correctly if the rust tokenizer is unpacked
+        # TODO report this to transformers.
+        other_special_ids = {
+            id for id, token in tokenizer.added_tokens_decoder.items() if token.special
+        }
         self.all_special_ids = set(tokenizer.all_special_ids)
+        self.all_special_ids.update(other_special_ids)
         self.requires_padding = requires_padding
         self.dtype = dtype
         self.device = device
         self.rank = rank
         self.world_size = world_size
+        self.sliding_window = sliding_window if sliding_window != -1 else None
+
+        self.layer_to_adapter_weights: Dict[str, LayerAdapterWeights] = defaultdict(
+            LayerAdapterWeights
+        )
+        self.target_to_layer = None
+        self.loaded_adapters = set()
+        self.static_adapter_id = adapter_id
+
+        if speculate is None:
+            speculate = get_speculate()
+        self.speculate = speculate
 
         self.has_position_ids = (
             inspect.signature(model.forward).parameters.get("position_ids", None)
@@ -40,10 +77,15 @@ class Model(ABC):
 
     @property
     def info(self) -> InfoResponse:
+        if self.requires_padding and self.sliding_window is not None:
+            raise NotImplementedError("sliding_window is not implemented with padding")
+
         return InfoResponse(
             requires_padding=self.requires_padding,
             dtype=str(self.dtype),
             device_type=self.device.type,
+            window_size=self.sliding_window,
+            speculate=self.speculate,
         )
 
     @property
@@ -52,7 +94,9 @@ class Model(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def generate_token(self, batch: B) -> Tuple[List[Generation], Optional[B]]:
+    def generate_token(
+        self, batch: B
+    ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
         raise NotImplementedError
 
     def warmup(self, batch: B) -> Optional[int]:
@@ -64,16 +108,18 @@ class Model(ABC):
         all_input_ids: List[int],
         prefix_offset: int = 0,
         read_offset: int = 0,
+        skip_special_tokens: bool = False,
     ) -> Tuple[str, int, int]:
         """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
 
         # The prefix text is necessary only to defeat cleanup algorithms in the decode
         # which decide to add a space or not depending on the surrounding ids.
         prefix_text = self.tokenizer.decode(
-            all_input_ids[prefix_offset:read_offset], skip_special_tokens=False
+            all_input_ids[prefix_offset:read_offset],
+            skip_special_tokens=skip_special_tokens,
         )
         new_text = self.tokenizer.decode(
-            all_input_ids[prefix_offset:], skip_special_tokens=False
+            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
         )
 
         if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
@@ -95,3 +141,138 @@ class Model(ABC):
             raise RuntimeError(
                 f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
             )
+
+    @property
+    def supports_adapter_loading(self) -> bool:  # base default; load/offload raise while this is False
+        return False
+
+    def adapter_target_to_layer(self) -> Dict[str, Tuple[str, torch.Tensor]]:  # base default: no targets
+        return {}
+
+    @property
+    def adapter_layers(self) -> List[str]:  # layer names iterated by load_adapter / offload_adapter
+        return []
+
+    @property
+    def default_traced_adapter_layers(self) -> List[str]:  # base default: empty list
+        return []
+
+    def get_num_layers_for_type(self, layer_type: str) -> int:  # base default: 0 for every type
+        return 0
+
+    def is_row_parallel(self, layer_type: str) -> bool:  # base default: False for every type
+        return False
+
+    @property
+    def max_speculative_tokens(self) -> int:  # largest value over registered layers; 0 when there are none
+        return max(
+            [
+                weights.max_speculative_tokens
+                for weights in self.layer_to_adapter_weights.values()
+            ],
+            default=0,
+        )
+
+    def load_adapter(
+        self,
+        adapter_parameters: AdapterParameters,
+        adapter_source: AdapterSource,
+        adapter_index: int,
+        api_token: str,
+        dynamic: bool = True,
+    ):
+        """Loads adapter weights and registers them per layer under `adapter_index`.
+
+        No-op when `adapter_index` is already loaded. Raises ValueError if the model
+        does not support adapter loading, or if `dynamic` is requested while dynamic
+        loading is disabled. Weights are kept in `self.layer_to_adapter_weights`.
+        """
+        if self.target_to_layer is None:  # built lazily on first use
+            self.target_to_layer = self.adapter_target_to_layer()
+        if adapter_index in self.loaded_adapters:
+            # Adapter already loaded
+            return
+
+        if not self.supports_adapter_loading:
+            raise ValueError("This model does not support adapter loading.")
+
+        if dynamic and not self.dynamic_adapter_loading_enabled:  # NOTE(review): not set in the visible __init__ — confirm
+            raise ValueError(
+                f"This model was initialized with the adapter {self.static_adapter_id} "
+                f"and therefore does not support dynamic adapter loading. "
+                f"Please initialize a new model instance from the base model in "
+                f"order to use the dynamic adapter loading feature."
+            )
+
+        logger.info(
+            f"Loading adapter weights into model: {','.join(adapter_parameters.adapter_ids)}"
+        )
+        weight_names = tuple([v[0] for v in self.target_to_layer.values()])
+        (
+            module_map,
+            adapter_config,
+            adapter_weight_names,
+            adapter_tokenizer,
+        ) = load_and_merge_adapters(
+            self.model_id,
+            adapter_parameters,
+            adapter_source,
+            adapter_index,
+            weight_names,
+            api_token,
+            False,  # NOTE(review): positional flag; its meaning is not visible here
+        )
+
+        unused_weight_names = adapter_weight_names.copy()  # presumably pruned by the loader below — verify
+        for layer_name in self.adapter_layers:
+            adapter_weights = adapter_config.load_batched_adapter_weights(
+                self,
+                module_map,
+                layer_name,
+                unused_weight_names,
+                dynamic,
+            )
+
+            if adapter_weights is None:
+                continue
+
+            layer_weights = self.layer_to_adapter_weights[layer_name]
+            layer_weights.add_adapter(adapter_index, adapter_weights)
+
+        if len(unused_weight_names) > 0:
+            logger.warning(
+                f"{','.join(adapter_parameters.adapter_ids)} unused adapter weights: {unused_weight_names}"
+            )
+
+        if adapter_tokenizer is not None:
+            self.tokenizers.add_tokenizer(adapter_index, adapter_tokenizer)  # NOTE(review): `tokenizers` not set in the visible __init__
+
+        self.loaded_adapters.add(adapter_index)
+
+    def offload_adapter(
+        self,
+        adapter_parameters: AdapterParameters,  # not read in this body
+        adapter_source: AdapterSource,  # not read in this body
+        adapter_index: int,
+    ):
+        """Unregisters adapter `adapter_index` from every adapter layer (no-op if not loaded)."""
+        if adapter_index not in self.loaded_adapters:
+            # Adapter already offloaded
+            return
+
+        if not self.supports_adapter_loading:
+            raise ValueError("This model does not support adapter loading.")
+
+        if not self.dynamic_adapter_loading_enabled:  # NOTE(review): not set in the visible __init__ — confirm
+            raise ValueError(
+                f"This model was initialized with the adapter {self.static_adapter_id} "
+                f"and therefore does not support dynamic adapter loading. "
+                f"Please initialize a new model instance from the base model in "
+                f"order to use the dynamic adapter loading feature."
+            )
+
+        for layer_name in self.adapter_layers:
+            if layer_name in self.layer_to_adapter_weights:
+                self.layer_to_adapter_weights[layer_name].remove_adapter(adapter_index)
+
+        self.loaded_adapters.remove(adapter_index)
diff --git a/server/text_generation_server/models/mpt.py b/server/text_generation_server/models/mpt.py
deleted file mode 100644
index 909d9852..00000000
--- a/server/text_generation_server/models/mpt.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import torch
-import torch.distributed
-
-from pathlib import Path
-from typing import Optional, Type
-from opentelemetry import trace
-from transformers import AutoTokenizer, PretrainedConfig, PreTrainedTokenizerBase
-from huggingface_hub import hf_hub_download
-import json
-
-from text_generation_server.models import CausalLM
-from text_generation_server.models.causal_lm import CausalLMBatch
-from text_generation_server.pb import generate_pb2
-from text_generation_server.models.custom_modeling.mpt_modeling import (
-    MPTForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-tracer = trace.get_tracer(__name__)
-
-
-class MPTCausalLMBatch(CausalLMBatch):
-    @classmethod
-    def from_pb(
-        cls,
-        pb: generate_pb2.Batch,
-        tokenizer: PreTrainedTokenizerBase,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> "CausalLMBatch":
-        batch = super().from_pb(pb=pb, tokenizer=tokenizer, dtype=dtype, device=device)
-        batch.keys_head_dim_last = False
-        return batch
-
-
-class MPTSharded(CausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16
-        else:
-            raise NotImplementedError("MPTSharded is only available on GPU")
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        tokenizer.pad_token = tokenizer.eos_token
-
-        # If model_id is a local path, load the file directly
-        local_path = Path(model_id, "config.json")
-        if local_path.exists():
-            filename = str(local_path.resolve())
-        else:
-            filename = hf_hub_download(
-                model_id, revision=revision, filename="config.json"
-            )
-        with open(filename, "r") as f:
-            config = json.load(f)
-        config = PretrainedConfig(**config)
-        config.quantize = quantize
-
-        torch.distributed.barrier(group=self.process_group)
-
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        config.quantize = quantize
-        model = MPTForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(CausalLM, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=False,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
-
-    @property
-    def batch_type(self) -> Type[CausalLMBatch]:
-        return MPTCausalLMBatch
diff --git a/server/text_generation_server/models/opt.py b/server/text_generation_server/models/opt.py
deleted file mode 100644
index f3a23d07..00000000
--- a/server/text_generation_server/models/opt.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import torch
-import torch.distributed
-
-from typing import Optional
-
-from transformers import (
-    AutoTokenizer,
-    AutoConfig,
-)
-from text_generation_server.models.custom_modeling.opt_modeling import OPTForCausalLM
-from text_generation_server.models import CausalLM
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-
-class OPTSharded(CausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            device = torch.device("cpu")
-            dtype = torch.float32
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-
-        config = AutoConfig.from_pretrained(
-            model_id,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-        )
-        config.quantize = quantize
-        tokenizer.pad_token_id = config.pad_token_id
-
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames, device=device, dtype=dtype, process_group=self.process_group
-        )
-        if config.quantize == "gptq":
-            weights._set_gptq_params(model_id)
-
-        model = OPTForCausalLM(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(CausalLM, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=True,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
-
-    def forward(
-        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
-    ):
-        outputs = self.model.forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            use_cache=True,
-        )
-
-        return outputs.logits, outputs.past_key_values
diff --git a/server/text_generation_server/models/pali_gemma.py b/server/text_generation_server/models/pali_gemma.py
new file mode 100644
index 00000000..3994ac70
--- /dev/null
+++ b/server/text_generation_server/models/pali_gemma.py
@@ -0,0 +1,76 @@
+from io import BytesIO
+from PIL import Image
+import torch
+import torch.distributed
+from opentelemetry import trace
+from typing import Iterable, Optional, Tuple
+from text_generation_server.models.vlm_causal_lm import (
+    VlmCausalLM,
+    VlmCausalLMBatch,
+    image_text_replacement,
+)
+from text_generation_server.models.custom_modeling.flash_pali_gemma_modeling import (
+    PaliGemmaForConditionalGeneration,
+)
+from transformers import AutoProcessor, AutoConfig
+
+from text_generation_server.pb.generate_pb2 import Request
+
+tracer = trace.get_tracer(__name__)
+
+
+class PaliGemmaBatch(VlmCausalLMBatch):  # overrides only batch_tokenized_inputs
+    @classmethod
+    def batch_tokenized_inputs(
+        cls, requests: Iterable[Request], tokenizer, processor, config
+    ):  # returns (input_ids per request, dict of concatenated image tensors or None)
+        batch_inputs = []
+        image_inputs = []
+        max_truncation = 0  # largest r.truncate seen; used as the tokenizer max_length
+        for r in requests:
+            full_text = ""
+            image_id = 0  # NOTE(review): never incremented in this loop, so every image is passed id 0 -- confirm intended
+            for chunk in r.input_chunks.chunks:
+                chunk_type = chunk.WhichOneof("chunk")
+                if chunk_type == "text":
+                    full_text += "<bos>" + chunk.text + "\n"  # presumably added by hand because the tokenizer call below sets add_special_tokens=False
+                elif chunk_type == "image":
+                    image = Image.open(BytesIO(chunk.image.data))
+                    # TODO do_convert_RGB should be on by default ?
+                    image = image.convert("RGB")
+                    image_input = processor.image_processor(image, return_tensors="pt")
+                    full_text += image_text_replacement(
+                        processor, image_input, config, image_id
+                    )
+                    image_inputs.append(image_input)
+                else:
+                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
+
+            batch_inputs.append(full_text)
+            max_truncation = max(max_truncation, r.truncate)
+
+        batch_tokenized_inputs = tokenizer(
+            batch_inputs,
+            truncation=True,
+            max_length=max_truncation,  # one shared limit for the whole batch, not per request
+            add_special_tokens=False,
+        )["input_ids"]
+        if image_inputs:
+            image_input = image_inputs[0]  # the first image's keys decide which optional tensors are merged
+            new_image_inputs = {
+                "pixel_values": torch.cat(
+                    [img["pixel_values"] for img in image_inputs], dim=0
+                ),
+            }
+            if "pixel_attention_mask" in image_input:
+                new_image_inputs["pixel_attention_mask"] = torch.cat(
+                    [img["pixel_attention_mask"] for img in image_inputs], dim=0
+                )
+            if "image_sizes" in image_input:
+                new_image_inputs["image_sizes"] = torch.cat(
+                    [img["image_sizes"] for img in image_inputs], dim=0
+                )
+            image_inputs = new_image_inputs
+        else:
+            image_inputs = None  # no image chunk in any request
+        return batch_tokenized_inputs, image_inputs
diff --git a/server/text_generation_server/models/rw.py b/server/text_generation_server/models/rw.py
deleted file mode 100644
index d97c1c73..00000000
--- a/server/text_generation_server/models/rw.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import torch
-
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from typing import List, Optional, Tuple
-
-from text_generation_server.models import CausalLM
-
-
-class RW(CausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        if torch.cuda.is_available():
-            device = torch.device("cuda")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            if quantize:
-                raise ValueError("quantization is not available on CPU")
-
-            device = torch.device("cpu")
-            dtype = torch.float32
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            revision=revision,
-            torch_dtype=dtype,
-            device_map="auto"
-            if torch.cuda.is_available() and torch.cuda.device_count() > 1
-            else None,
-            load_in_8bit=quantize == "bitsandbytes",
-            trust_remote_code=trust_remote_code,
-        )
-        if torch.cuda.is_available() and torch.cuda.device_count() == 1:
-            model = model.cuda()
-
-        if tokenizer.pad_token_id is None:
-            if model.config.pad_token_id is not None:
-                tokenizer.pad_token_id = model.config.pad_token_id
-            elif model.config.eos_token_id is not None:
-                tokenizer.pad_token_id = model.config.eos_token_id
-            elif tokenizer.eos_token_id is not None:
-                tokenizer.pad_token_id = tokenizer.eos_token_id
-            else:
-                tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-
-        super(CausalLM, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=True,
-            dtype=dtype,
-            device=device,
-        )
-
-    def forward(
-        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
-    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
-        # Model Forward
-        outputs = self.model.forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-        )
-        return outputs.logits, outputs.past_key_values
diff --git a/server/text_generation_server/models/santacoder.py b/server/text_generation_server/models/santacoder.py
deleted file mode 100644
index 81928c1d..00000000
--- a/server/text_generation_server/models/santacoder.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import torch
-import torch.distributed
-
-from typing import Optional, List
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-from text_generation_server.models import CausalLM
-
-FIM_PREFIX = "<fim-prefix>"
-FIM_MIDDLE = "<fim-middle>"
-FIM_SUFFIX = "<fim-suffix>"
-FIM_PAD = "<fim-pad>"
-EOD = "<|endoftext|>"
-
-
-class SantaCoder(CausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        if torch.cuda.is_available():
-            device = torch.device("cuda")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            if quantize:
-                raise ValueError("quantization is not available on CPU")
-
-            device = torch.device("cpu")
-            dtype = torch.float32
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        tokenizer.add_special_tokens(
-            {
-                "additional_special_tokens": [
-                    EOD,
-                    FIM_PREFIX,
-                    FIM_MIDDLE,
-                    FIM_SUFFIX,
-                    FIM_PAD,
-                ],
-                "pad_token": EOD,
-            }
-        )
-        with device:
-            model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                revision=revision,
-                torch_dtype=dtype,
-                load_in_8bit=quantize == "bitsandbytes",
-                trust_remote_code=trust_remote_code,
-            )
-
-        super(CausalLM, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=True,
-            dtype=dtype,
-            device=device,
-        )
-
-    def decode(self, generated_ids: List[int]) -> str:
-        # Do not skip special tokens as they are used for custom parsing rules of the generated text
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
-        )
diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py
index 361453fb..dbaf1253 100644
--- a/server/text_generation_server/models/seq2seq_lm.py
+++ b/server/text_generation_server/models/seq2seq_lm.py
@@ -1,18 +1,30 @@
-from text_generation_server.utils.tokens import batch_top_tokens
 import torch
+import torch.distributed
+import time
 
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PreTrainedTokenizerBase
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    PreTrainedTokenizerBase,
+    AutoConfig,
+)
 from typing import Optional, Tuple, List, Type, Dict
 
+from text_generation_server.utils import (
+    initialize_torch_distributed,
+    weight_files,
+    Weights,
+)
+from text_generation_server.utils.chunks import concat_text_chunks
+from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.models import Model
 from text_generation_server.models.types import (
     GeneratedText,
     Batch,
     Generation,
-    PrefillTokens,
-    TopTokens,
+    Tokens,
 )
 from text_generation_server.pb import generate_pb2
 from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling
@@ -93,10 +105,12 @@ class Seq2SeqLMBatch(Batch):
         padding_right_offset = 0
         max_decode_tokens = 0
         for i, r in enumerate(pb.requests):
-            inputs.append(r.inputs)
+            inputs.append(concat_text_chunks(r.input_chunks.chunks))
             requests_idx_mapping[r.id] = i
             decoder_input_lengths.append(1)
-            next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device))
+            next_token_choosers.append(
+                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
+            )
             stopping_criteria = StoppingCriteria.from_pb(
                 r.stopping_parameters, tokenizer
             )
@@ -351,9 +365,9 @@ class Seq2SeqLMBatch(Batch):
                     (total_batch_size, max_input_length),
                 )
             # Copy to correct indices
-            attention_mask[
-                start_index:end_index, -batch.max_input_length :
-            ] = batch.attention_mask[:, -batch.max_input_length :]
+            attention_mask[start_index:end_index, -batch.max_input_length :] = (
+                batch.attention_mask[:, -batch.max_input_length :]
+            )
 
             # Create padded tensor
             if decoder_input_ids is None:
@@ -528,11 +542,89 @@ class Seq2SeqLM(Model):
     def __init__(
         self,
         model_id: str,
+        model_class,  # called below as model_class(config, weights) to build the model
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        speculator: Optional[str] = None,
+        dtype: Optional[torch.dtype] = None,
+        default_dtype=torch.float16,  # used on the CUDA and XPU paths when dtype is None
+        trust_remote_code: bool = False,
+        config_class=AutoConfig,
+        tokenizer_class=AutoTokenizer,
+        aliases=None,  # forwarded unchanged to Weights(aliases=...)
+    ):
+        self.process_group, rank, world_size = initialize_torch_distributed()
+        if torch.cuda.is_available():
+            device = torch.device(f"cuda:{rank}")
+            dtype = default_dtype if dtype is None else dtype
+        elif SYSTEM == "ipex":  # NOTE(review): no import of SYSTEM is visible in this file's import hunk -- confirm, else NameError on non-CUDA hosts
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+                dtype = default_dtype if dtype is None else dtype
+            else:
+                device = torch.device("cpu")
+                # Float16 doesn't exist on target.
+                dtype = torch.bfloat16 if dtype is None else dtype
+        else:
+            device = torch.device("cpu")
+            dtype = torch.float32 if dtype is None else dtype
+
+        config = config_class.from_pretrained(
+            model_id,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+        )
+        config.quantize = quantize  # stored on the config; also read below to decide on _set_gptq_params
+        config.speculator = speculator
+
+        tokenizer = tokenizer_class.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        tokenizer.bos_token_id = config.decoder_start_token_id  # the decoder start token doubles as the tokenizer's bos
+
+        torch.distributed.barrier(group=self.process_group)  # synchronize the process group before resolving weight files
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+        weights = Weights(
+            filenames,
+            device=device,
+            dtype=dtype,
+            process_group=self.process_group,
+            aliases=aliases,
+        )
+        if config.quantize in ["awq", "exl2", "gptq", "marlin"]:
+            weights._set_gptq_params(model_id, revision)
+
+        model = model_class(config, weights)
+
+        torch.distributed.barrier(group=self.process_group)  # synchronize again once the model has been built
+        super().__init__(
+            model_id=model_id,
+            model=model,
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            rank=rank,
+            world_size=world_size,
+        )
+
+    @classmethod
+    def fallback(
+        cls,
+        model_id: str,
+        revision: Optional[str] = None,
+        quantize: Optional[str] = None,
+        speculator: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
+        if speculator:
+            raise RuntimeError("Speculator decoding is not enabled for AutoModel")
+
         if torch.cuda.is_available():
             device = torch.device("cuda")
             dtype = torch.float16 if dtype is None else dtype
@@ -541,15 +633,17 @@ class Seq2SeqLM(Model):
                 raise ValueError("quantization is not available on CPU")
 
             device = torch.device("cpu")
-            dtype = torch.float32
+            dtype = torch.float32 if dtype is None else dtype
 
         model = AutoModelForSeq2SeqLM.from_pretrained(
             model_id,
             revision=revision,
             torch_dtype=dtype,
-            device_map="auto"
-            if torch.cuda.is_available() and torch.cuda.device_count() > 1
-            else None,
+            device_map=(
+                "auto"
+                if torch.cuda.is_available() and torch.cuda.device_count() > 1
+                else None
+            ),
             load_in_8bit=quantize == "bitsandbytes",
             trust_remote_code=trust_remote_code,
         )
@@ -565,23 +659,24 @@ class Seq2SeqLM(Model):
         )
         tokenizer.bos_token_id = model.config.decoder_start_token_id
 
-        super(Seq2SeqLM, self).__init__(
+        self = cls.__new__(
+            cls,
+        )
+        super().__init__(
+            self,
+            model_id=model_id,
             model=model,
             tokenizer=tokenizer,
             requires_padding=True,
             dtype=dtype,
             device=device,
         )
+        return self
 
     @property
     def batch_type(self) -> Type[Seq2SeqLMBatch]:
         return Seq2SeqLMBatch
 
-    def decode(self, decoder_ids: List[int]) -> str:
-        return self.tokenizer.decode(
-            decoder_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-
     def forward(
         self,
         input_ids,
@@ -592,6 +687,7 @@ class Seq2SeqLM(Model):
         past_key_values: Optional = None,
     ) -> Tuple[
         torch.Tensor,
+        Optional[torch.Tensor],
         torch.Tensor,
         List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
     ]:
@@ -605,8 +701,15 @@ class Seq2SeqLM(Model):
             past_key_values=past_key_values,
             use_cache=True,
         )
+        if isinstance(outputs, tuple):
+            # Our custom models
+            outputs, speculative_logits = outputs
+        else:
+            # Generic transformers models
+            speculative_logits = None
         return (
             outputs.logits,
+            speculative_logits,
             outputs.encoder_last_hidden_state,
             outputs.past_key_values,
         )
@@ -614,7 +717,8 @@ class Seq2SeqLM(Model):
     @tracer.start_as_current_span("generate_token")
     def generate_token(
         self, batch: Seq2SeqLMBatch
-    ) -> Tuple[List[Generation], Optional[Seq2SeqLMBatch]]:
+    ) -> Tuple[List[Generation], Optional[Seq2SeqLMBatch], Tuple[int, int]]:
+        start = time.time_ns()
         if batch.decoder_attention_mask is not None:
             # slice to the correct shape
             decoder_attention_mask = batch.decoder_attention_mask[
@@ -630,7 +734,7 @@ class Seq2SeqLM(Model):
         else:
             encoder_last_hidden_state = None
 
-        logits, encoder_last_hidden_state, past = self.forward(
+        logits, speculative_logits, encoder_last_hidden_state, past = self.forward(
             batch.input_ids,
             batch.attention_mask,
             batch.decoder_input_ids,
@@ -639,12 +743,17 @@ class Seq2SeqLM(Model):
             batch.past_key_values,
         )
 
+        # Speculation is not active for seq2seq
+        accepted_ids = torch.ones_like(batch.decoder_input_ids)[:, 0]
         batch_top_token_ids, batch_top_token_logprobs = batch_top_tokens(
             batch.top_n_tokens,
             batch.top_n_tokens_tensor,
-            torch.softmax(logits[:, -1], -1),
+            torch.log_softmax(logits[:, -1], -1),
+            accepted_ids,
         )
 
+        start_decode = time.time_ns()
+
         # Finished requests
         generations: List[Generation] = []
         stopped = True
@@ -710,8 +819,13 @@ class Seq2SeqLM(Model):
                 if stop:
                     # Slice with decoder_input_length to remove padding
                     # Decode all tokens
-                    output_text = self.decode(
-                        all_decoder_input_ids[-decoder_input_length:]
+                    output_text, _, _ = self.decode_token(
+                        all_decoder_input_ids,
+                        prefix_offset=len(all_decoder_input_ids)
+                        - decoder_input_length
+                        - 1,
+                        read_offset=len(all_decoder_input_ids) - decoder_input_length,
+                        skip_special_tokens=True,
                     )
 
                     # Get seed
@@ -728,39 +842,49 @@ class Seq2SeqLM(Model):
 
                 # Prefill
                 if stopping_criteria.current_tokens == 1 and request.prefill_logprobs:
-                    prefill_tokens = PrefillTokens(
+                    prefill_tokens = Tokens(
                         [self.tokenizer.bos_token_id],
                         [float("nan")],
                         [self.tokenizer.bos_token],
+                        [False],
                     )
                 else:
                     prefill_tokens = None
 
                 if top_n_tokens > 0:
-                    toptoken_texts = self.tokenizer.batch_decode(
-                        top_token_ids,
-                        clean_up_tokenization_spaces=False,
-                        skip_special_tokens=False,
-                    )
-                    special_toptokens = [
-                        token_id in self.all_special_ids for token_id in top_token_ids
-                    ]
-                    top_tokens = TopTokens(
-                        top_token_ids,
-                        top_token_logprobs,
-                        toptoken_texts,
-                        special_toptokens,
-                    )
+                    all_top_tokens = []
+                    for top_token_ids, top_token_logprobs in zip(
+                        top_token_ids, top_token_logprobs
+                    ):
+                        toptoken_texts = self.tokenizer.batch_decode(
+                            top_token_ids,
+                            clean_up_tokenization_spaces=False,
+                            skip_special_tokens=False,
+                        )
+                        special_toptokens = [
+                            token_id in self.all_special_ids
+                            for token_id in top_token_ids
+                        ]
+                        top_tokens = Tokens(
+                            top_token_ids,
+                            top_token_logprobs,
+                            toptoken_texts,
+                            special_toptokens,
+                        )
+                        all_top_tokens.append(top_tokens)
+                    top_tokens = all_top_tokens
                 else:
                     top_tokens = None
 
                 generation = Generation(
                     request.id,
                     prefill_tokens,
-                    next_token_id_squeezed,
-                    next_token_logprob,
-                    next_token_text,
-                    next_token_id_squeezed.item() in self.all_special_ids,
+                    Tokens(
+                        [next_token_id_squeezed],
+                        [next_token_logprob],
+                        [next_token_text],
+                        [next_token_id_squeezed.item() in self.all_special_ids],
+                    ),
                     generated_text,
                     top_tokens,
                 )
@@ -768,6 +892,9 @@ class Seq2SeqLM(Model):
                 generations.append(generation)
 
             # Update values
+            batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar(
+                next_token_id_squeezed.item()
+            )
             batch.decoder_input_ids[i] = next_token_id
             batch.all_decoder_input_ids[i] = all_decoder_input_ids
             batch.input_lengths[i] = input_length
@@ -781,7 +908,9 @@ class Seq2SeqLM(Model):
 
         # We finished all generations in the batch; there is no next batch
         if stopped:
-            return generations, None
+            forward_ns = start_decode - start
+            decode_ns = time.time_ns() - start_decode
+            return generations, None, (forward_ns, decode_ns)
 
         # We don't need input_ids after the prefill forward
         batch.input_ids = None
@@ -792,4 +921,6 @@ class Seq2SeqLM(Model):
             batch.decoder_attention_mask[:, -batch.padding_right_offset] = 1
         batch.padding_right_offset -= 1
 
-        return generations, batch
+        forward_ns = start_decode - start
+        decode_ns = time.time_ns() - start_decode
+        return generations, batch, (forward_ns, decode_ns)
diff --git a/server/text_generation_server/models/t5.py b/server/text_generation_server/models/t5.py
deleted file mode 100644
index 133aafd8..00000000
--- a/server/text_generation_server/models/t5.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import torch
-import torch.distributed
-
-from typing import List, Optional, Tuple
-
-from transformers import (
-    AutoTokenizer,
-    AutoConfig,
-)
-
-from text_generation_server.models import Seq2SeqLM
-from text_generation_server.models.custom_modeling.t5_modeling import (
-    T5ForConditionalGeneration,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-
-
-class T5Sharded(Seq2SeqLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            device = torch.device("cpu")
-            dtype = torch.float32
-
-        config = AutoConfig.from_pretrained(
-            model_id,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-        )
-        config.quantize = quantize
-
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        tokenizer.bos_token_id = config.decoder_start_token_id
-
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames,
-            device=device,
-            dtype=dtype,
-            process_group=self.process_group,
-            aliases={
-                "shared.weight": [
-                    "encoder.embed_tokens.weight",
-                    "decoder.embed_tokens.weight",
-                ]
-            },
-        )
-
-        model = T5ForConditionalGeneration(config, weights)
-
-        torch.distributed.barrier(group=self.process_group)
-        super(Seq2SeqLM, self).__init__(
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=True,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
-
-    def forward(
-        self,
-        input_ids,
-        attention_mask,
-        decoder_input_ids,
-        decoder_attention_mask: Optional,
-        encoder_last_hidden_state: Optional,
-        past_key_values: Optional = None,
-    ) -> Tuple[
-        torch.Tensor,
-        torch.Tensor,
-        List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
-    ]:
-        # Model Forward
-        outputs = self.model.forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            encoder_outputs=encoder_last_hidden_state,
-            past_key_values=past_key_values,
-            use_cache=True,
-        )
-
-        return (
-            outputs.logits,
-            outputs.encoder_last_hidden_state,
-            outputs.past_key_values,
-        )
diff --git a/server/text_generation_server/models/types.py b/server/text_generation_server/models/types.py
index 0e27680d..339b733b 100644
--- a/server/text_generation_server/models/types.py
+++ b/server/text_generation_server/models/types.py
@@ -58,29 +58,14 @@ class GeneratedText:
 
 
 @dataclass
-class PrefillTokens:
-    token_ids: List[int]
-    logprobs: List[float]
-    texts: List[str]
-
-    def to_pb(self) -> generate_pb2.PrefillTokens:
-        return generate_pb2.PrefillTokens(
-            ids=self.token_ids, logprobs=self.logprobs, texts=self.texts
-        )
-
-    def __len__(self):
-        return len(self.token_ids)
-
-
-@dataclass
-class TopTokens:
+class Tokens:
     token_ids: List[int]
     logprobs: List[float]
     texts: List[str]
     is_special: List[bool]
 
-    def to_pb(self) -> generate_pb2.TopTokens:
-        return generate_pb2.TopTokens(
+    def to_pb(self) -> generate_pb2.Tokens:
+        return generate_pb2.Tokens(
             ids=self.token_ids,
             logprobs=self.logprobs,
             texts=self.texts,
@@ -94,27 +79,25 @@ class TopTokens:
 @dataclass
 class Generation:
     request_id: int
-    prefill_tokens: Optional[PrefillTokens]
-    token_id: int
-    token_logprob: float
-    token_text: str
-    token_is_special: bool
+    prefill_tokens: Optional[Tokens]
+    tokens: Tokens  # generated token(s): ids, logprobs, texts and special flags as parallel lists
     generated_text: Optional[GeneratedText]
     # Optional for now, since it's not yet supported for every model.
-    top_tokens: Optional[TopTokens]
+    top_tokens: Optional[List[Tokens]]  # each entry is serialized with Tokens.to_pb(); None is passed through
 
     def to_pb(self) -> generate_pb2.Generation:
         return generate_pb2.Generation(
             request_id=self.request_id,
-            prefill_tokens=self.prefill_tokens.to_pb()
-            if self.prefill_tokens is not None
-            else None,
-            token_id=self.token_id,
-            token_logprob=self.token_logprob,
-            token_text=self.token_text,
-            token_is_special=self.token_is_special,
-            generated_text=self.generated_text.to_pb()
-            if self.generated_text is not None
-            else None,
-            top_tokens=self.top_tokens.to_pb() if self.top_tokens is not None else None,
+            prefill_tokens=(
+                self.prefill_tokens.to_pb() if self.prefill_tokens is not None else None
+            ),
+            tokens=self.tokens.to_pb(),
+            generated_text=(
+                self.generated_text.to_pb() if self.generated_text is not None else None
+            ),
+            top_tokens=(
+                [top_tokens.to_pb() for top_tokens in self.top_tokens]
+                if self.top_tokens is not None
+                else None
+            ),
         )
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
new file mode 100644
index 00000000..ace48805
--- /dev/null
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -0,0 +1,392 @@
+from itertools import repeat
+import torch
+from PIL import Image
+from io import BytesIO
+
+from opentelemetry import trace
+from typing import Iterable, Optional, Tuple, List, Type, Dict
+
+from transformers import PreTrainedTokenizerBase
+from transformers.image_processing_utils import select_best_resolution
+from text_generation_server.pb import generate_pb2
+from text_generation_server.models.flash_causal_lm import (
+    FlashCausalLMBatch,
+    FlashCausalLM,
+)
+from transformers import AutoProcessor
+
+tracer = trace.get_tracer(__name__)
+
+IDEFICS2_FAKE_TOKEN = "<fake_token_around_image>"
+IDEFICS2_IMAGE_TOKEN = "<image>"
+
+
+def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
+    """
+    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+    Args:
+        image_size (`tuple`):
+            The size of the input image in the format (height, width).
+        grid_pinpoints (`List`):
+            A list containing possible resolutions. Each item in the list should be a tuple or list
+            of the form `(height, width)`.
+        patch_size (`int`):
+            The size of each image patch.
+
+    Returns:
+        tuple: The shape of the image patch grid in the format (height, width).
+    """
+    if not isinstance(grid_pinpoints, list):
+        raise ValueError("grid_pinpoints should be a list of tuples or lists")
+
+    height, width = select_best_resolution(image_size, grid_pinpoints)
+    return height // patch_size, width // patch_size
+
+
+def image_text_replacement(processor, image_input, config, image_id: int) -> str:
+    if config.model_type == "idefics2":
+        image_seq_len = 64
+        image_str = f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_IMAGE_TOKEN * image_seq_len}{IDEFICS2_FAKE_TOKEN}"
+        if processor.image_processor.do_image_splitting:
+            image_str *= 5
+        return image_str
+    elif config.model_type == "llava_next":
+        height, width = image_input["image_sizes"][image_id]
+        num_features = get_number_of_features(height, width, config)
+        from loguru import logger
+
+        logger.info(
+            f"Found {num_features} features in image of resolution {height}x{width}"
+        )
+        return "<image>" * num_features
+
+    elif config.model_type == "paligemma":
+        return "<image>" * config.text_config.num_image_tokens
+    else:
+        raise RuntimeError(f"Unknown config {config.model_type} for multimodal")
+
+
+def image_text_replacement_fixup(config, text: str) -> str:
+    if config.model_type == "idefics2":
+        return text.replace(
+            f"{IDEFICS2_FAKE_TOKEN}{IDEFICS2_FAKE_TOKEN}", IDEFICS2_FAKE_TOKEN
+        )
+    return text
+
+
+def get_unpadded_features(
+    original_height: int,
+    original_width: int,
+    npatches: int,
+    num_patch_height: int,
+    num_patch_width: int,
+) -> Tuple[int, int]:
+    current_height = npatches * num_patch_height
+    current_width = npatches * num_patch_width
+
+    aspect_ratio: float = original_width / original_height
+    current_aspect_ratio: float = current_width / current_height
+
+    if aspect_ratio > current_aspect_ratio:
+        new_height = (original_height * current_width) // original_width
+        padding = (current_height - new_height) // 2
+        current_height = current_height - (2 * padding)
+    else:
+        new_width = (original_width * current_height) // original_height
+        padding = (current_width - new_width) // 2
+        current_width = current_width - (2 * padding)
+
+    unpadded_features = current_height * current_width
+    newline_features = current_height
+    return (unpadded_features, newline_features)
+
+
+def get_number_of_features(height: int, width: int, config) -> int:
+    # From config
+    # Hardcoded for CLIP for now
+    # image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
+    image_grid_pinpoints = config.image_grid_pinpoints
+    image_size = config.vision_config.image_size
+    patch_size = config.vision_config.patch_size
+
+    assert image_size % patch_size == 0
+
+    npatches = image_size // patch_size
+
+    # Dimensions are intentionally swapped to be bug-compatible with
+    # upstream: https://github.com/LLaVA-VL/LLaVA-NeXT/issues/59
+    num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+        [height, width],
+        image_grid_pinpoints,
+        image_size,
+    )
+    unpadded_features, newline_features = get_unpadded_features(
+        height, width, npatches, num_patch_height, num_patch_width
+    )
+    # The base patch covers the entire image
+    base_features = npatches**2
+    return unpadded_features + newline_features + base_features
+
+
+class VlmCausalLMBatch(FlashCausalLMBatch):
+    pixel_values: Optional[List[torch.Tensor]]
+    pixel_attention_mask: Optional[List[torch.Tensor]]
+    image_sizes: Optional[List[Tuple[int, int]]]
+
+    @classmethod
+    @tracer.start_as_current_span("concatenate")
+    def concatenate(cls, batches):
+        batch = super(VlmCausalLMBatch, cls).concatenate(batches)
+        batch.pixel_values = None
+        batch.pixel_attention_mask = None
+        batch.image_sizes = None
+        return batch
+
+    @tracer.start_as_current_span("filter")
+    def filter(self, request_ids: List[int]):
+        batch = super().filter(request_ids)
+        batch.pixel_values = None
+        batch.pixel_attention_mask = None
+        batch.image_sizes = None
+        return batch
+
+    @classmethod
+    def batch_tokenized_inputs(
+        cls, requests: Iterable[generate_pb2.Request], tokenizer, processor, config
+    ):
+        # Process images first. We need all of them so that the processor
+        # can make the image splits the same size. And we need the final
+        # sizes to insert the correct number of image tokens.
+        images = []
+        for r in requests:
+            for chunk in r.input_chunks.chunks:
+                chunk_type = chunk.WhichOneof("chunk")
+                if chunk_type == "text":
+                    pass
+                elif chunk_type == "image":
+                    image = Image.open(BytesIO(chunk.image.data))
+                    if config.model_type == "llava_next":
+                        images.append(image)
+                    else:
+                        images.append([image])
+                else:
+                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
+
+        if images:
+            image_inputs = processor.image_processor(images, return_tensors="pt")
+        else:
+            image_inputs = None
+
+        batch_inputs = []
+        max_truncation = 0
+        image_id = 0
+        for r in requests:
+            full_text = ""
+            for chunk in r.input_chunks.chunks:
+                chunk_type = chunk.WhichOneof("chunk")
+                if chunk_type == "text":
+                    full_text += chunk.text
+                elif chunk_type == "image":
+                    full_text += image_text_replacement(
+                        processor, image_inputs, config, image_id
+                    )
+                    image_id += 1
+
+            full_text = image_text_replacement_fixup(config, full_text)
+
+            batch_inputs.append(full_text)
+            max_truncation = max(max_truncation, r.truncate)
+
+        batch_tokenized_inputs = tokenizer(
+            batch_inputs,
+            truncation=True,
+            max_length=max_truncation,
+            add_special_tokens=not config.model_type == "paligemma",
+        )["input_ids"]
+
+        return batch_tokenized_inputs, image_inputs
+
+    @classmethod
+    def from_pb_processor(
+        cls,
+        pb: generate_pb2.Batch,
+        tokenizer: PreTrainedTokenizerBase,
+        processor,
+        config,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> "VlmCausalLMBatch":
+        batch_tokenized_inputs, image_inputs = cls.batch_tokenized_inputs(
+            pb.requests, tokenizer, processor, config
+        )
+        batch = cls.from_tokenized(pb, tokenizer, batch_tokenized_inputs, dtype, device)
+        if image_inputs is not None:
+            batch.pixel_values = image_inputs["pixel_values"].to(device=device)
+            if "pixel_attention_mask" in image_inputs:
+                batch.pixel_attention_mask = image_inputs["pixel_attention_mask"].to(
+                    device=device
+                )
+            else:
+                batch.pixel_attention_mask = None
+            if "image_sizes" in image_inputs:
+                batch.image_sizes = image_inputs["image_sizes"].to(device=device)
+            else:
+                batch.image_sizes = None
+        else:
+            batch.pixel_values = None
+            batch.pixel_attention_mask = None
+            batch.image_sizes = None
+        return batch
+
+
+class VlmCausalLM(FlashCausalLM):
+    def __init__(
+        self,
+        model_id: str,
+        *,
+        processor_class=AutoProcessor,
+        processor_kwargs=None,
+        batch_class=VlmCausalLMBatch,
+        revision,
+        trust_remote_code: bool,
+        **kwargs,
+    ):
+        if processor_kwargs is None:
+            processor_kwargs = {}
+        self.processor = processor_class.from_pretrained(
+            model_id,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            **processor_kwargs,
+        )
+        self.batch_class = batch_class
+        super().__init__(model_id=model_id, **kwargs)
+
+    @property
+    def batch_type(self) -> Type[VlmCausalLMBatch]:
+        return self.batch_class
+
+    def max_past(self) -> Optional[int]:
+        return getattr(self.model.text_model, "max_past", None)
+
+    def forward(
+        self,
+        batch: VlmCausalLMBatch,
+        adapter_data: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        # Model Forward
+        if batch.speculative_ids is not None:
+            input_ids = batch.input_ids
+            position_ids = batch.position_ids
+            cu_seqlen_prefill = batch.cu_seqlen_prefill
+            kv_cache = self.kv_cache
+            block_tables = batch.block_tables_tensor
+            slots = batch.slots[batch.slot_indices]
+            input_lengths = batch.input_lengths_tensor
+            max_s = batch.max_seqlen
+            lm_head_indices = batch.prefill_head_indices
+
+            speculative_ids = batch.speculative_ids
+
+            B, speculative_length = speculative_ids.shape
+            new_length = speculative_length + 1
+            new_input_ids = torch.cat(
+                [input_ids.unsqueeze(-1), speculative_ids], dim=1
+            ).reshape(-1)
+            arange = torch.arange(new_length, device=position_ids.device).unsqueeze(0)
+            arange_int = arange.to(dtype=torch.int32)
+            new_position_ids = (
+                position_ids.unsqueeze(-1).expand(B, new_length) + arange
+            ).view(-1)
+            slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1)
+            input_lengths = (
+                input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
+            ).view(-1)
+
+            # Copy the block tables for all members
+            block_tables = (
+                block_tables.unsqueeze(1)
+                .expand(B, new_length, -1)
+                .reshape(B * new_length, -1)
+                .contiguous()
+            )
+            max_s = max_s + speculative_length
+
+            input_ids = new_input_ids
+            position_ids = new_position_ids
+        else:
+            input_ids = batch.input_ids
+            position_ids = batch.position_ids
+            cu_seqlen_prefill = batch.cu_seqlen_prefill
+            kv_cache = self.kv_cache
+            block_tables = batch.block_tables_tensor
+            slots = batch.slots[batch.slot_indices]
+            input_lengths = batch.input_lengths_tensor
+            max_s = batch.max_seqlen
+            lm_head_indices = batch.prefill_head_indices
+
+        if cu_seqlen_prefill is None and self.max_past() is not None:
+            # In decode, not prefill, we're actually overwriting the KV-cache
+            # in a circular buffer mode.
+            # This makes sure the max_s for the decode pass is correct.
+            max_s = min(self.max_past(), max_s)
+
+        bs = input_ids.shape[0]
+        # Try to find an associated cuda graph
+        bs = input_ids.shape[0]
+        sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs])
+        if sorted_padded_bs:
+            # Get associated cuda graph
+            cuda_graph = self.cuda_graphs[sorted_padded_bs[0]]
+        else:
+            cuda_graph = None
+        if cu_seqlen_prefill is not None or cuda_graph is None:
+            logits, speculative_logits = self.model.forward(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                cu_seqlen_prefill=cu_seqlen_prefill,
+                kv_cache=kv_cache,
+                block_tables=block_tables,
+                slots=slots,
+                input_lengths=input_lengths,
+                max_s=max_s,
+                prefill_cache_indices=batch.prefill_cache_indices,
+                lm_head_indices=lm_head_indices,
+                pixel_values=batch.pixel_values,
+                pixel_attention_mask=batch.pixel_attention_mask,
+                image_sizes=batch.image_sizes,
+            )
+            if batch.prefill_cache_indices is not None:
+                batch.prefill_cache_indices = None
+            if batch.pixel_values is not None:
+                batch.pixel_values = None
+            if batch.pixel_attention_mask is not None:
+                batch.pixel_attention_mask = None
+            if batch.image_sizes is not None:
+                batch.image_sizes = None
+            return logits, speculative_logits
+
+        # Copy inputs to the static inputs of the cuda graph
+        # Static inputs are potentially padded
+        cuda_graph["input_ids"][: input_ids.shape[0]] = input_ids
+        cuda_graph["position_ids"][: position_ids.shape[0]] = position_ids
+        cuda_graph["block_tables"][
+            : block_tables.shape[0], : block_tables.shape[1]
+        ] = block_tables
+        cuda_graph["slots"].fill_(-1)
+        cuda_graph["slots"][: slots.shape[0]] = slots
+        cuda_graph["input_lengths"].zero_()
+        cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths
+
+        # Replay the graph
+        cuda_graph["graph"].replay()
+
+        # Slice output to the correct shape
+        speculative_logits = (
+            cuda_graph["speculative_logits"][:bs]
+            if cuda_graph["speculative_logits"] is not None
+            else None
+        )
+        logits = cuda_graph["logits"][:bs]
+        return logits, speculative_logits
diff --git a/server/text_generation_server/pb/.gitignore b/server/text_generation_server/pb/.gitignore
index 2621a190..5a68d631 100644
--- a/server/text_generation_server/pb/.gitignore
+++ b/server/text_generation_server/pb/.gitignore
@@ -1,3 +1,3 @@
 *.py
 *.pyi
-*.py-e
\ No newline at end of file
+*.py-e
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
index 67137aaa..aee287c6 100644
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@@ -1,6 +1,8 @@
 import asyncio
 import os
 import torch
+import time
+import signal
 
 from grpc import aio
 from loguru import logger
@@ -12,21 +14,56 @@ from typing import List, Optional
 from text_generation_server.cache import Cache
 from text_generation_server.interceptor import ExceptionInterceptor
 from text_generation_server.models import Model, get_model
+
+try:
+    from text_generation_server.models.pali_gemma import PaliGemmaBatch
+    from text_generation_server.models.vlm_causal_lm import (
+        VlmCausalLMBatch,
+    )
+    from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch
+
+    VLM_BATCH_TYPES = {PaliGemmaBatch, VlmCausalLMBatch, IdeficsCausalLMBatch}
+except (ImportError, NotImplementedError):
+    # These imports can fail on CPU/Non flash.
+    VLM_BATCH_TYPES = set()
+
 from text_generation_server.pb import generate_pb2_grpc, generate_pb2
 from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor
-from text_generation_server.models.idefics_causal_lm import IdeficsCausalLMBatch
+from text_generation_server.models.globals import set_model_id, set_adapter_to_index
+from text_generation_server.utils.adapter import (
+    AdapterParameters,
+)
+
+
+class SignalHandler:
+    KEEP_PROCESSING = True
+
+    def __init__(self):
+        signal.signal(signal.SIGINT, self.exit_gracefully)
+        signal.signal(signal.SIGTERM, self.exit_gracefully)
+
+    def exit_gracefully(self, signum, frame):
+        print(f"Exiting gracefully: Signal {signum}")
+        self.KEEP_PROCESSING = False
+
 
 class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
-    def __init__(self, model: Model, cache: Cache, server_urls: List[str]):
+    def __init__(
+        self,
+        model: Model,
+        cache: Cache,
+        quantize: Optional[str],
+        server_urls: List[str],
+    ):
         self.cache = cache
         self.model = model
+        self.quantize = quantize
         self.server_urls = server_urls
         # For some reason, inference_mode does not work well with GLOO which we use on CPU
         if model.device.type == "cuda":
             # Force inference mode for the lifetime of TextGenerationService
             self._inference_mode_raii_guard = torch._C._InferenceMode(True)
 
-
     async def Info(self, request, context):
         return self.model.info
 
@@ -55,9 +92,31 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb())
 
     async def Warmup(self, request, context):
-        if self.model.batch_type == IdeficsCausalLMBatch: #Hack, i would rather use kwargs in the `from_pb` call
-            batch = self.model.batch_type.from_pb(
-                request.batch, self.model.tokenizer, self.model.processor, self.model.dtype, self.model.device
+        if self.quantize in {"exl2", "gptq"}:
+            try:
+                # When using GPTQ or EXL2, Exllama kernels need some global buffers
+                # for which we have the final shapes only after the model has loaded.
+                # This will allocate those buffers.
+                from text_generation_server.layers.gptq import (
+                    create_exllama_buffers,
+                    set_device,
+                )
+
+                set_device(self.model.device)
+                create_exllama_buffers(request.max_prefill_tokens)
+            except ImportError:
+                pass
+
+        if (
+            self.model.batch_type in VLM_BATCH_TYPES
+        ):  # Hack, i would rather use kwargs in the `from_pb` call
+            batch = self.model.batch_type.from_pb_processor(
+                request.batch,
+                self.model.tokenizer,
+                self.model.processor,
+                self.model.model.config,
+                self.model.dtype,
+                self.model.device,
             )
         else:
             batch = self.model.batch_type.from_pb(
@@ -70,24 +129,36 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         )
 
     async def Prefill(self, request, context):
-        if self.model.batch_type == IdeficsCausalLMBatch: #Hack, i would rather use kwargs in the `from_pb` call
-            batch = self.model.batch_type.from_pb(
-                request.batch, self.model.tokenizer, self.model.processor, self.model.dtype, self.model.device
+        start = time.time_ns()
+        if (
+            self.model.batch_type in VLM_BATCH_TYPES
+        ):  # Hack, i would rather use kwargs in the `from_pb` call
+            batch = self.model.batch_type.from_pb_processor(
+                request.batch,
+                self.model.tokenizer,
+                self.model.processor,
+                self.model.model.config,
+                self.model.dtype,
+                self.model.device,
             )
         else:
             batch = self.model.batch_type.from_pb(
                 request.batch, self.model.tokenizer, self.model.dtype, self.model.device
             )
 
-        generations, next_batch = self.model.generate_token(batch)
+        generations, next_batch, timings = self.model.generate_token(batch)
         self.cache.set(next_batch)
 
         return generate_pb2.PrefillResponse(
             generations=[generation.to_pb() for generation in generations],
             batch=next_batch.to_pb() if next_batch else None,
+            forward_ns=timings[0],
+            decode_ns=timings[1],
+            total_ns=time.time_ns() - start,
         )
 
     async def Decode(self, request, context):
+        start = time.time_ns()
         if len(request.batches) == 0:
             raise ValueError("Must provide at least one batch")
 
@@ -102,37 +173,50 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
             raise ValueError("All batches are empty")
 
         if len(batches) > 1:
+            start_concat = time.time_ns()
             batch = self.model.batch_type.concatenate(batches)
+            concat_ns = time.time_ns() - start_concat
         else:
             batch = batches[0]
+            concat_ns = None
 
-        generations, next_batch = self.model.generate_token(batch)
+        generations, next_batch, timings = self.model.generate_token(batch)
         self.cache.set(next_batch)
 
         return generate_pb2.DecodeResponse(
             generations=[generation.to_pb() for generation in generations],
             batch=next_batch.to_pb() if next_batch else None,
+            concat_ns=concat_ns,
+            forward_ns=timings[0],
+            decode_ns=timings[1],
+            total_ns=time.time_ns() - start,
         )
 
 
 def serve(
     model_id: str,
+    lora_adapter_ids: Optional[List[str]],
     revision: Optional[str],
     sharded: bool,
     quantize: Optional[str],
+    speculate: Optional[int],
     dtype: Optional[str],
     trust_remote_code: bool,
     uds_path: Path,
+    max_input_tokens: int,
 ):
     async def serve_inner(
         model_id: str,
+        lora_adapter_ids: Optional[List[str]],
         revision: Optional[str],
         sharded: bool = False,
         quantize: Optional[str] = None,
+        speculate: Optional[int] = None,
         dtype: Optional[str] = None,
         trust_remote_code: bool = False,
     ):
         unix_socket_template = "unix://{}-{}"
+        adapter_to_index = {}
         if sharded:
             server_urls = [
                 unix_socket_template.format(uds_path, rank)
@@ -145,35 +229,55 @@ def serve(
 
         try:
             model = get_model(
-                model_id, revision, sharded, quantize, dtype, trust_remote_code
+                model_id,
+                lora_adapter_ids,
+                revision,
+                sharded,
+                quantize,
+                speculate,
+                dtype,
+                trust_remote_code,
+                max_input_tokens,
             )
+
+            if len(lora_adapter_ids) > 0:
+                for index, adapter_id in enumerate(lora_adapter_ids):
+                    # TODO: improve non merged adapter loading and long term
+                    # improve adapter loading as a whole
+                    adapter_parameters = AdapterParameters(
+                        adapter_ids=[adapter_id],
+                        weights=None,  #  will be set to 1
+                        merge_strategy=0,
+                        density=1.0,
+                        majority_sign_method=0,
+                    )
+                    adapter_index = index + 1
+                    adapter_to_index[adapter_id] = adapter_index
+                    model.load_adapter(
+                        adapter_parameters,
+                        None,  # adapter_source
+                        adapter_index,
+                        None,  # api_token
+                        False,  # dynamic
+                    )
+
         except Exception:
             logger.exception("Error when initializing model")
             raise
 
-        if quantize == "gptq":
-            try:
-                # When using GPTQ, Exllama kernels need some global kernels
-                # For which we have the finale shapes only after the model has loaded
-                # This will allocate those buffers.
-                from text_generation_server.utils.gptq.exllama import (
-                    create_exllama_buffers,
-                    set_device,
-                )
-
-                set_device(model.device)
-                create_exllama_buffers()
-            except ImportError:
-                pass
-
+        set_adapter_to_index(adapter_to_index)
         server = aio.server(
             interceptors=[
                 ExceptionInterceptor(),
                 UDSOpenTelemetryAioServerInterceptor(),
-            ]
+            ],
+            options=[
+                # Set the maximum possible message length: i32::MAX
+                ("grpc.max_receive_message_length", (1 << 31) - 1)
+            ],
         )
         generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
-            TextGenerationService(model, Cache(), server_urls), server
+            TextGenerationService(model, Cache(), quantize, server_urls), server
         )
         SERVICE_NAMES = (
             generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
@@ -185,13 +289,20 @@ def serve(
         await server.start()
 
         logger.info("Server started at {}".format(local_url))
+        signal_handler = SignalHandler()
+        while signal_handler.KEEP_PROCESSING:
+            await asyncio.sleep(0.5)
 
-        try:
-            await server.wait_for_termination()
-        except KeyboardInterrupt:
-            logger.info("Signal received. Shutting down")
-            await server.stop(0)
-
+    set_model_id(model_id)
     asyncio.run(
-        serve_inner(model_id, revision, sharded, quantize, dtype, trust_remote_code)
+        serve_inner(
+            model_id,
+            lora_adapter_ids,
+            revision,
+            sharded,
+            quantize,
+            speculate,
+            dtype,
+            trust_remote_code,
+        )
     )
diff --git a/server/text_generation_server/tracing.py b/server/text_generation_server/tracing.py
index bf03c379..bc7a04ee 100644
--- a/server/text_generation_server/tracing.py
+++ b/server/text_generation_server/tracing.py
@@ -54,10 +54,8 @@ class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor):
         )
 
 
-def setup_tracing(shard: int, otlp_endpoint: str):
-    resource = Resource.create(
-        attributes={"service.name": f"text-generation-inference.server-{shard}"}
-    )
+def setup_tracing(otlp_service_name: str, otlp_endpoint: str):
+    resource = Resource.create(attributes={"service.name": otlp_service_name})
     span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
     span_processor = BatchSpanProcessor(span_exporter)
 
diff --git a/server/text_generation_server/utils/adapter.py b/server/text_generation_server/utils/adapter.py
new file mode 100644
index 00000000..4e2492de
--- /dev/null
+++ b/server/text_generation_server/utils/adapter.py
@@ -0,0 +1,196 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/utils/adapter.py
+# License:  Apache License Version 2.0, January 2004
+
+import warnings
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Set, Tuple
+
+from safetensors.torch import load_file
+from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
+
+from text_generation_server.pb import generate_pb2
+from text_generation_server.utils.merges.strategies import merge_adapters
+
+from text_generation_server.utils import hub
+from text_generation_server.adapters.lora import LoraConfig
+
+
+if TYPE_CHECKING:
+    from text_generation_server.adapters.config import AdapterConfig, ModuleMap
+
+
+BASE_MODEL_ADAPTER_ID = "__base_model__"
+
+
+@dataclass
+class AdapterParameters:
+    adapter_ids: Tuple[str]
+    weights: Tuple[float]
+    merge_strategy: NotImplemented
+    density: float
+    majority_sign_method: NotImplemented
+
+
+@dataclass
+class AdapterSource:
+    adapter_id: str
+    model_id: str
+    revision: str
+
+
+def load_and_merge_adapters(
+    model_id: str,
+    adapter_parameters: AdapterParameters,
+    adapter_source: str,
+    adapter_index: int,
+    weight_names: Tuple[str],
+    api_token: str,
+    trust_remote_code: bool = False,
+) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
+    if len(adapter_parameters.adapter_ids) == 1:
+        return load_module_map(
+            model_id,
+            adapter_parameters.adapter_ids[0],
+            adapter_source,
+            weight_names,
+            api_token,
+            trust_remote_code,
+        )
+
+    adapter_params = AdapterParametersContainer(
+        adapter_parameters, adapter_source, adapter_index
+    )
+    return _load_and_merge(
+        model_id, adapter_params, weight_names, api_token, trust_remote_code
+    )
+
+
+@dataclass
+class AdapterParametersContainer:
+    adapter_parameters: AdapterParameters
+    adapter_source: str
+    adapter_index: int
+
+    def __hash__(self) -> int:
+        return self.adapter_index
+
+
+@lru_cache(maxsize=32)
+def _load_and_merge(
+    model_id: str,
+    adapter_params: AdapterParametersContainer,
+    weight_names: Tuple[str],
+    api_token: str,
+    trust_remote_code: bool = False,
+) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
+    params = adapter_params.adapter_parameters
+
+    adapters_to_merge = []
+    merged_weight_names = set()
+    tokenizer = None
+    for adapter_id in params.adapter_ids:
+        if adapter_id == BASE_MODEL_ADAPTER_ID:
+            raise ValueError("Base model adapter cannot be merged.")
+
+        module_map, adapter_config, adapter_weight_names, adapter_tokenizer = (
+            load_module_map(
+                model_id,
+                adapter_id,
+                adapter_params.adapter_source,
+                weight_names,
+                api_token,
+                trust_remote_code,
+            )
+        )
+
+        adapters_to_merge.append((module_map, adapter_config))
+        merged_weight_names = merged_weight_names.union(adapter_weight_names)
+        if tokenizer is None:
+            tokenizer = adapter_tokenizer
+
+    if len(adapters_to_merge) == 0:
+        raise ValueError("No adapters to merge.")
+
+    module_map, adapter_config = merge_adapters(adapters_to_merge, params)
+    return module_map, adapter_config, merged_weight_names, tokenizer
+
+
+def check_architectures(
+    model_id: str,
+    adapter_id: str,
+    adapter_config: "AdapterConfig",
+    trust_remote_code: bool = False,
+):
+    try:
+        if not adapter_config.base_model_name_or_path:
+            # Avoid execution latency caused by the network connection retrying for AutoConfig.from_pretrained(None)
+            return
+
+        expected_config = AutoConfig.from_pretrained(
+            model_id, trust_remote_code=trust_remote_code
+        )
+        model_config = AutoConfig.from_pretrained(
+            adapter_config.base_model_name_or_path, trust_remote_code=trust_remote_code
+        )
+    except Exception as e:
+        warnings.warn(
+            f"Unable to check architecture compatibility for adapter '{adapter_id}' "
+            f"against model '{model_id}'. Assuming they are compatible. Error: {e}"
+        )
+        return
+
+    if model_config.architectures == expected_config.architectures:
+        warnings.warn(
+            f"Adapter '{adapter_id}' was not trained on base model '{model_id}'. "
+            f"If you encounter issues, use --model-id '{adapter_config.base_model_name_or_path}' instead."
+        )
+    else:
+        # TODO(travis): revisit this when we support classification heads which will not use CausalLM
+        raise ValueError(
+            f"Adapter '{adapter_id}' is not compatible with model '{model_id}'. "
+            f"Architectures differ: {model_config.architectures} != {expected_config.architectures}. "
+            f"Use --model-id '{adapter_config.base_model_name_or_path}' instead."
+        )
+
+
+@lru_cache(maxsize=128)
+def load_module_map(
+    model_id: str,
+    adapter_id: str,
+    adapter_source: str,
+    weight_names: Tuple[str],
+    api_token: str,
+    trust_remote_code: bool = False,
+) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
+    revision = "main"
+
+    adapter_config = LoraConfig.load(adapter_id, api_token)
+    if adapter_config.base_model_name_or_path != model_id:
+        check_architectures(model_id, adapter_id, adapter_config, trust_remote_code)
+
+    adapter_filenames = hub._cached_adapter_weight_files(
+        adapter_id, revision=revision, extension=".safetensors"
+    )
+
+    try:
+        adapter_tokenizer = AutoTokenizer.from_pretrained(
+            adapter_config.config_path,
+            token=api_token,
+            trust_remote_code=trust_remote_code,
+        )
+    except Exception:
+        # Adapter does not have a tokenizer, so fall back to the base model tokenizer
+        adapter_tokenizer = None
+
+    # load adapter weights from all shards (should have relatively small memory footprint)
+    adapter_weights = {}
+    for filename in adapter_filenames:
+        adapter_weights.update(load_file(filename))
+
+    # map the model weights to the relevant adapter weights (LoRA A and B matrices)
+    module_map, adapter_weight_names = adapter_config.map_weights_for_model(
+        adapter_weights, weight_names
+    )
+    return module_map, adapter_config, adapter_weight_names, adapter_tokenizer
diff --git a/server/text_generation_server/utils/chunks.py b/server/text_generation_server/utils/chunks.py
new file mode 100644
index 00000000..73962ea3
--- /dev/null
+++ b/server/text_generation_server/utils/chunks.py
@@ -0,0 +1,27 @@
+from typing import Iterable
+
+from loguru import logger
+
+from text_generation_server.pb import generate_pb2
+
+
+def concat_text_chunks(chunks: Iterable[generate_pb2.InputChunk]) -> str:
+    """
+    Return the text of the single text chunk in `chunks`; other chunks are dropped.
+
+    Raises NotImplementedError when there is no text chunk or more than one.
+    """
+    text = None
+    for chunk in chunks:
+        kind = chunk.WhichOneof("chunk")
+        if kind != "text":
+            # We cannot reject this, e.g. warmup sends an image chunk.
+            logger.debug(f"Encountered non-text chunk type {kind}")
+            continue
+        if text is not None:
+            raise NotImplementedError("Request contained more than one text chunk")
+        text = chunk.text
+
+    if text is None:
+        raise NotImplementedError("Request without a text chunk")
+    return text
diff --git a/server/text_generation_server/utils/convert.py b/server/text_generation_server/utils/convert.py
index 8d414eca..d9c3276b 100644
--- a/server/text_generation_server/utils/convert.py
+++ b/server/text_generation_server/utils/convert.py
@@ -29,9 +29,15 @@ def _remove_duplicate_names(
             [name for name in shared if _is_complete(state_dict[name])]
         )
         if not complete_names:
-            raise RuntimeError(
-                f"Error while trying to find names to remove to save state dict, but found no suitable name to keep for saving amongst: {shared}. None is covering the entire storage.Refusing to save/load the model since you could be storing much more memory than needed. Please refer to https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an issue."
-            )
+            if len(shared) == 1:
+                # Force contiguous
+                name = list(shared)[0]
+                state_dict[name] = state_dict[name].clone()
+                complete_names = {name}
+            else:
+                raise RuntimeError(
+                    f"Error while trying to find names to remove to save state dict, but found no suitable name to keep for saving amongst: {shared}. None is covering the entire storage.Refusing to save/load the model since you could be storing much more memory than needed. Please refer to https://huggingface.co/docs/safetensors/torch_shared_tensors for more information. Or open an issue."
+                )
 
         keep_name = sorted(list(complete_names))[0]
 
@@ -62,7 +68,7 @@ def convert_file(pt_file: Path, sf_file: Path, discard_names: List[str]):
     Forcing us to check for potentially different keys during load when looking
     for specific tensors (making tensor sharing explicit).
     """
-    loaded = torch.load(pt_file, map_location="cpu")
+    loaded = torch.load(pt_file, map_location="cpu", weights_only=True)
     if "state_dict" in loaded:
         loaded = loaded["state_dict"]
     to_removes = _remove_duplicate_names(loaded, discard_names=discard_names)
diff --git a/server/text_generation_server/utils/dist.py b/server/text_generation_server/utils/dist.py
index d02bfc5b..36d63e86 100644
--- a/server/text_generation_server/utils/dist.py
+++ b/server/text_generation_server/utils/dist.py
@@ -3,6 +3,7 @@ import torch
 
 from datetime import timedelta
 from loguru import logger
+from text_generation_server.utils.import_utils import SYSTEM
 
 # Tensor Parallelism settings
 RANK = int(os.getenv("RANK", "0"))
@@ -68,13 +69,24 @@ def initialize_torch_distributed():
 
         if not torch.distributed.is_initialized():
             # Call the init process.
-            torch.distributed.init_process_group(
-                backend=backend,
-                world_size=WORLD_SIZE,
-                rank=RANK,
-                timeout=timedelta(seconds=60),
-                pg_options=options,
-            )
+            if SYSTEM == "ipex":
+                import intel_extension_for_pytorch as ipex
+
+                ipex.distributed.init_process_group(
+                    backend="ccl",
+                    world_size=WORLD_SIZE,
+                    rank=RANK,
+                    timeout=timedelta(seconds=60),
+                    pg_options=options,
+                )
+            else:
+                torch.distributed.init_process_group(
+                    backend=backend,
+                    world_size=WORLD_SIZE,
+                    rank=RANK,
+                    timeout=timedelta(seconds=60),
+                    pg_options=options,
+                )
         else:
             logger.warning("torch.distributed is already initialized.")
 
diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py
deleted file mode 100644
index c472d1fc..00000000
--- a/server/text_generation_server/utils/flash_attn.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import os
-import torch
-
-from loguru import logger
-
-if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
-    raise ImportError("`USE_FLASH_ATTENTION` is false.")
-
-if not torch.cuda.is_available():
-    raise ImportError("CUDA is not available")
-
-major, minor = torch.cuda.get_device_capability()
-is_sm75 = major == 7 and minor == 5
-is_sm8x = major == 8 and minor >= 0
-is_sm90 = major == 9 and minor == 0
-
-HAS_FLASH_ATTN = False
-HAS_FLASH_ATTN_V2 = False
-try:
-    try:
-        import flash_attn_2_cuda
-    except ImportError:
-        raise ImportError(
-            "Flash Attention V2 is not installed.\n"
-            "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
-            "or install flash attention v2 with `cd server && make install install-flash-attention-v2`"
-        )
-    if not (is_sm8x or is_sm90):
-        raise ImportError(
-            f"GPU with CUDA capability {major} {minor} is not supported for "
-            "Flash Attention V2"
-        )
-    HAS_FLASH_ATTN_V2 = True
-except ImportError as e:
-    try:
-        import flash_attn_cuda
-    except ImportError:
-        raise ImportError(
-            "Flash Attention is not installed.\n"
-            "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
-            "or install flash attention with `cd server && make install install-flash-attention`"
-        ) from e
-
-    if not (is_sm75 or is_sm8x or is_sm90):
-        raise ImportError(
-            f"GPU with CUDA capability {major} {minor} is not supported"
-        ) from e
-    logger.warning(f"Unable to use Flash Attention V2: {e}")
-    HAS_FLASH_ATTN = True
-
-
-def attention(
-    q,
-    k,
-    v,
-    out,
-    cu_seqlens,
-    max_s,
-    softmax_scale,
-):
-    if HAS_FLASH_ATTN_V2:
-        return flash_attn_2_cuda.varlen_fwd(
-            q,
-            k,
-            v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            max_s,
-            max_s,
-            0.0,
-            softmax_scale,
-            False,
-            True,
-            False,
-            None,
-        )
-
-    if HAS_FLASH_ATTN:
-        # Flash attention v1 requires q, k and v to have the same number of heads
-        if k.shape[1] != q.shape[1]:
-            # MQA expand
-            if k.shape[1] == 1:
-                k = k.expand(-1, q.shape[1], -1)
-            # Grouped attention reshape
-            else:
-                original_shape = k.shape
-                k = (
-                    k.unsqueeze(2)
-                    .expand(-1, -1, q.shape[1] // k.shape[1], -1)
-                    .reshape(original_shape[0], -1, original_shape[2])
-                )
-        if v.shape[1] != q.shape[1]:
-            # MQA expand
-            if v.shape[1] == 1:
-                v = v.expand(-1, q.shape[1], -1)
-            # Grouped attention reshape
-            else:
-                original_shape = v.shape
-                v = (
-                    v.unsqueeze(2)
-                    .expand(-1, -1, q.shape[1] // v.shape[1], -1)
-                    .reshape(original_shape[0], -1, original_shape[2])
-                )
-
-        return flash_attn_cuda.fwd(
-            q,
-            k,
-            v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            max_s,
-            max_s,
-            0.0,
-            softmax_scale,
-            False,
-            True,
-            False,
-            0,
-            None,
-        )
-
-    raise NotImplementedError("flash attention is not installed")
diff --git a/server/text_generation_server/utils/gptq/quant_linear.py b/server/text_generation_server/utils/gptq/quant_linear.py
deleted file mode 100644
index bfc91c00..00000000
--- a/server/text_generation_server/utils/gptq/quant_linear.py
+++ /dev/null
@@ -1,359 +0,0 @@
-import math
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.cuda.amp import custom_bwd, custom_fwd
-
-try:
-    import triton
-    import triton.language as tl
-    from . import custom_autotune
-
-    # code based https://github.com/fpgaminer/GPTQ-triton
-    @custom_autotune.autotune(
-        configs=[
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 256,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=2,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 64,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=3,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 32,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 128,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=2,
-                num_warps=4,
-            ),
-        ],
-        key=["M", "N", "K"],
-        nearest_power_of_two=True,
-        prune_configs_by={
-            "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
-            "perf_model": None,
-            "top_k": None,
-        },
-    )
-    @triton.jit
-    def matmul_248_kernel(
-        a_ptr,
-        b_ptr,
-        c_ptr,
-        scales_ptr,
-        zeros_ptr,
-        g_ptr,
-        M,
-        N,
-        K,
-        bits,
-        maxq,
-        stride_am,
-        stride_ak,
-        stride_bk,
-        stride_bn,
-        stride_cm,
-        stride_cn,
-        stride_scales,
-        stride_zeros,
-        BLOCK_SIZE_M: tl.constexpr,
-        BLOCK_SIZE_N: tl.constexpr,
-        BLOCK_SIZE_K: tl.constexpr,
-        GROUP_SIZE_M: tl.constexpr,
-    ):
-        """
-        Compute the matrix multiplication C = A x B.
-        A is of shape (M, K) float16
-        B is of shape (K//8, N) int32
-        C is of shape (M, N) float16
-        scales is of shape (G, N) float16
-        zeros is of shape (G, N) float16
-        g_ptr is of shape (K) int32
-        """
-        infearure_per_bits = 32 // bits
-
-        pid = tl.program_id(axis=0)
-        num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-        num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-        num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-        num_pid_in_group = GROUP_SIZE_M * num_pid_n
-        group_id = pid // num_pid_in_group
-        first_pid_m = group_id * GROUP_SIZE_M
-        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-        pid_m = first_pid_m + (pid % group_size_m)
-        pid_n = (pid % num_pid_in_group) // group_size_m
-
-        offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-        offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-        offs_k = tl.arange(0, BLOCK_SIZE_K)
-        a_ptrs = a_ptr + (
-            offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak
-        )  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-        a_mask = offs_am[:, None] < M
-        # b_ptrs is set up such that it repeats elements along the K axis 8 times
-        b_ptrs = b_ptr + (
-            (offs_k[:, None] // infearure_per_bits) * stride_bk
-            + offs_bn[None, :] * stride_bn
-        )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
-        g_ptrs = g_ptr + offs_k
-        # shifter is used to extract the N bits of each element in the 32-bit word from B
-        scales_ptrs = scales_ptr + offs_bn[None, :]
-        zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
-
-        shifter = (offs_k % infearure_per_bits) * bits
-        zeros_shifter = (offs_bn % infearure_per_bits) * bits
-        accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-
-        for k in range(0, num_pid_k):
-            g_idx = tl.load(g_ptrs)
-
-            # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
-            scales = tl.load(
-                scales_ptrs + g_idx[:, None] * stride_scales
-            )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-            zeros = tl.load(
-                zeros_ptrs + g_idx[:, None] * stride_zeros
-            )  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-
-            zeros = (zeros >> zeros_shifter[None, :]) & maxq
-            zeros = zeros + 1
-
-            a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-            b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
-
-            # Now we need to unpack b (which is N-bit values) into 32-bit values
-            b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
-            b = (b - zeros) * scales  # Scale and shift
-
-            accumulator += tl.dot(a, b)
-            a_ptrs += BLOCK_SIZE_K
-            b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
-            g_ptrs += BLOCK_SIZE_K
-
-        c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
-        c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
-        tl.store(c_ptrs, accumulator, mask=c_mask)
-
-except:
-    print("triton not installed.")
-
-
-def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
-    with torch.cuda.device(input.device):
-        output = torch.empty(
-            (input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16
-        )
-        grid = lambda META: (
-            triton.cdiv(input.shape[0], META["BLOCK_SIZE_M"])
-            * triton.cdiv(qweight.shape[1], META["BLOCK_SIZE_N"]),
-        )
-        matmul_248_kernel[grid](
-            input,
-            qweight,
-            output,
-            scales,
-            qzeros,
-            g_idx,
-            input.shape[0],
-            qweight.shape[1],
-            input.shape[1],
-            bits,
-            maxq,
-            input.stride(0),
-            input.stride(1),
-            qweight.stride(0),
-            qweight.stride(1),
-            output.stride(0),
-            output.stride(1),
-            scales.stride(0),
-            qzeros.stride(0),
-        )
-        return output
-
-
-class QuantLinearFunction(torch.autograd.Function):
-    @staticmethod
-    @custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
-        output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
-        return output
-
-
-class QuantLinear(nn.Module):
-    def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize):
-        super().__init__()
-        self.register_buffer("qweight", qweight)
-        self.register_buffer("qzeros", qzeros)
-        self.register_buffer("scales", scales)
-        self.register_buffer("g_idx", g_idx)
-        if bias is not None:
-            self.register_buffer("bias", bias)
-        else:
-            self.bias = None
-        if bits not in [2, 4, 8]:
-            raise NotImplementedError("Only 2,4,8 bits are supported.")
-        self.bits = bits
-        self.maxq = 2**self.bits - 1
-        self.groupsize = groupsize
-
-        self.outfeatures = qweight.shape[1]
-        self.infeatures = qweight.shape[0] * 32 // bits
-
-    @classmethod
-    def new(cls, bits, groupsize, infeatures, outfeatures, bias):
-        if bits not in [2, 4, 8]:
-            raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32)
-        qzeros = torch.zeros(
-            (math.ceil(infeatures / groupsize), outfeatures // 32 * bits),
-            dtype=torch.int32,
-        )
-        scales = torch.zeros(
-            (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16
-        )
-        g_idx = torch.tensor(
-            [i // groupsize for i in range(infeatures)], dtype=torch.int32
-        )
-        if bias:
-            bias = torch.zeros((outfeatures), dtype=torch.float16)
-        else:
-            bias = None
-        return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
-
-    def pack(self, linear, scales, zeros, g_idx=None):
-        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
-
-        scales = scales.t().contiguous()
-        zeros = zeros.t().contiguous()
-        scale_zeros = zeros * scales
-        self.scales = scales.clone().half()
-        if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-
-        intweight = []
-        for idx in range(self.infeatures):
-            intweight.append(
-                torch.round(
-                    (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]])
-                    / self.scales[self.g_idx[idx]]
-                ).to(torch.int)[:, None]
-            )
-        intweight = torch.cat(intweight, dim=1)
-        intweight = intweight.t().contiguous()
-        intweight = intweight.numpy().astype(np.uint32)
-        qweight = np.zeros(
-            (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32
-        )
-        i = 0
-        row = 0
-        while row < qweight.shape[0]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qweight[row] |= intweight[j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                row += 1
-            else:
-                raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qweight = qweight.astype(np.int32)
-        self.qweight = torch.from_numpy(qweight)
-
-        zeros -= 1
-        zeros = zeros.numpy().astype(np.uint32)
-        qzeros = np.zeros(
-            (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32
-        )
-        i = 0
-        col = 0
-        while col < qzeros.shape[1]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                col += 1
-            else:
-                raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qzeros = qzeros.astype(np.int32)
-        self.qzeros = torch.from_numpy(qzeros)
-
-    def forward(self, x):
-        out_shape = x.shape[:-1] + (self.outfeatures,)
-        out = QuantLinearFunction.apply(
-            x.reshape(-1, x.shape[-1]),
-            self.qweight,
-            self.scales,
-            self.qzeros,
-            self.g_idx,
-            self.bits,
-            self.maxq,
-        )
-        out = out + self.bias if self.bias is not None else out
-        return out.reshape(out_shape)
diff --git a/server/text_generation_server/utils/hub.py b/server/text_generation_server/utils/hub.py
index 23743c9b..db412aeb 100644
--- a/server/text_generation_server/utils/hub.py
+++ b/server/text_generation_server/utils/hub.py
@@ -6,24 +6,44 @@ from loguru import logger
 from pathlib import Path
 from typing import Optional, List
 
-from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub import file_download, hf_api, HfApi, hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from huggingface_hub.utils import (
     LocalEntryNotFoundError,
     EntryNotFoundError,
-    RevisionNotFoundError,  # Import here to ease try/except in other part of the lib
+    RevisionNotFoundError,  # noqa # Import here to ease try/except in other part of the lib
 )
 
 WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None)
+HF_HUB_OFFLINE = os.environ.get("HF_HUB_OFFLINE", "0").lower() in ["true", "1", "yes"]
 
 
-def weight_hub_files(
-    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
+def _cached_adapter_weight_files(
+    adapter_id: str, revision: Optional[str], extension: str
 ) -> List[str]:
-    """Get the weights filenames on the hub"""
-    api = HfApi()
-    info = api.model_info(model_id, revision=revision)
-    filenames = [
+    """Guess weight files from the cached revision snapshot directory"""
+    d = _get_cached_revision_directory(adapter_id, revision)
+    if not d:
+        return []
+    filenames = _adapter_weight_files_from_dir(d, extension)
+    return filenames
+
+
+def _cached_weight_files(
+    model_id: str, revision: Optional[str], extension: str
+) -> List[str]:
+    """List weight files found in the locally cached snapshot of `revision`.
+
+    Returns an empty list when no cached snapshot directory is found.
+    """
+    snapshot_dir = _get_cached_revision_directory(model_id, revision)
+    return _weight_files_from_dir(snapshot_dir, extension) if snapshot_dir else []
+
+
+def _weight_hub_files_from_model_info(
+    info: hf_api.ModelInfo, extension: str
+) -> List[str]:
+    return [
         s.rfilename
         for s in info.siblings
         if s.rfilename.endswith(extension)
@@ -33,24 +53,60 @@ def weight_hub_files(
         and "training" not in s.rfilename
     ]
 
-    if not filenames:
-        raise EntryNotFoundError(
-            f"No {extension} weights found for model {model_id} and revision {revision}.",
-            None,
-        )
 
+def _weight_files_from_dir(d: Path, extension: str) -> List[str]:
+    """List the weight files sitting directly inside `d` (no recursion).
+
+    Keeps names ending with `extension` that contain none of "arguments", "args",
+    "adapter" or "training". Returns [] when `d` cannot be walked.
+    """
+    # Only the first os.walk entry is used (depth 1); the default avoids StopIteration.
+    root, _, files = next(os.walk(str(d)), (str(d), [], []))
+    excluded = ("arguments", "args", "adapter", "training")
+    filenames = [
+        os.path.join(root, f)
+        for f in files
+        if f.endswith(extension) and not any(word in f for word in excluded)
+    ]
     return filenames
 
 
-def try_to_load_from_cache(
-    model_id: str, revision: Optional[str], filename: str
+def _adapter_weight_files_from_dir(d: Path, extension: str) -> List[str]:
+    """List adapter weight files directly inside `d` (depth 1, no recursion).
+
+    Unlike `_weight_files_from_dir`, names containing "adapter" are kept.
+    Returns [] when `d` cannot be walked instead of leaking StopIteration.
+    """
+    root, _, files = next(os.walk(str(d)), (str(d), [], []))
+    excluded = ("arguments", "args", "training")
+    return [
+        os.path.join(root, f)
+        for f in files
+        if f.endswith(extension) and not any(word in f for word in excluded)
+    ]
+
+
+def _adapter_config_files_from_dir(d: Path) -> List[str]:
+    """List `.json` files directly inside `d` whose names contain neither
+    "arguments" nor "args". Returns [] when `d` cannot be walked."""
+    # Depth-1 scan: only the first os.walk entry is read.
+    root, _, files = next(os.walk(str(d)), (str(d), [], []))
+    return [
+        os.path.join(root, f)
+        for f in files
+        if f.endswith(".json") and "arguments" not in f and "args" not in f
+    ]
+
+
+def _get_cached_revision_directory(
+    model_id: str, revision: Optional[str]
 ) -> Optional[Path]:
-    """Try to load a file from the Hugging Face cache"""
     if revision is None:
         revision = "main"
 
-    object_id = model_id.replace("/", "--")
-    repo_cache = Path(HUGGINGFACE_HUB_CACHE) / f"models--{object_id}"
+    repo_cache = Path(HUGGINGFACE_HUB_CACHE) / Path(
+        file_download.repo_folder_name(repo_id=model_id, repo_type="model")
+    )
 
     if not repo_cache.is_dir():
         # No cache for this model
@@ -74,8 +130,42 @@ def try_to_load_from_cache(
         # No cache for this revision and we won't try to return a random revision
         return None
 
+    return snapshots_dir / revision
+
+
+def weight_hub_files(
+    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
+) -> List[str]:
+    """Return the weight filenames available for `model_id` at `revision`.
+
+    Read from the cached snapshot when HF_HUB_OFFLINE is set, else from the Hub's
+    model info. Raises EntryNotFoundError when nothing matches `extension`.
+    """
+    hub_client = HfApi()
+    if not HF_HUB_OFFLINE:
+        model_info = hub_client.model_info(model_id, revision=revision)
+        found = _weight_hub_files_from_model_info(model_info, extension)
+    else:
+        found = _cached_weight_files(model_id, revision, extension)
+    if not found:
+        raise EntryNotFoundError(
+            f"No {extension} weights found for model {model_id} and revision {revision}.",
+            None,
+        )
+    return found
+
+
+def try_to_load_from_cache(
+    model_id: str, revision: Optional[str], filename: str
+) -> Optional[Path]:
+    """Try to load a file from the Hugging Face cache"""
+
+    d = _get_cached_revision_directory(model_id, revision)
+    if not d:
+        return None
+
     # Check if file exists in cache
-    cached_file = snapshots_dir / revision / filename
+    cached_file = d / filename
     return cached_file if cached_file.is_file() else None
 
 
@@ -84,13 +174,14 @@ def weight_files(
 ) -> List[Path]:
     """Get the local files"""
     # Local model
-    if Path(model_id).exists() and Path(model_id).is_dir():
-        local_files = list(Path(model_id).glob(f"*{extension}"))
+    d = Path(model_id)
+    if d.exists() and d.is_dir():
+        local_files = _weight_files_from_dir(d, extension)
         if not local_files:
             raise FileNotFoundError(
                 f"No local weights found in {model_id} with extension {extension}"
             )
-        return local_files
+        return [Path(f) for f in local_files]
 
     try:
         filenames = weight_hub_files(model_id, revision, extension)
@@ -138,33 +229,33 @@ def download_weights(
 ) -> List[Path]:
     """Download the safetensors files from the hub"""
 
-    def download_file(filename, tries=5, backoff: int = 5):
-        local_file = try_to_load_from_cache(model_id, revision, filename)
+    def download_file(fname, tries=5, backoff: int = 5):
+        local_file = try_to_load_from_cache(model_id, revision, fname)
         if local_file is not None:
-            logger.info(f"File {filename} already present in cache.")
+            logger.info(f"File {fname} already present in cache.")
             return Path(local_file)
 
-        for i in range(tries):
+        for idx in range(tries):
             try:
-                logger.info(f"Download file: {filename}")
-                start_time = time.time()
+                logger.info(f"Download file: {fname}")
+                stime = time.time()
                 local_file = hf_hub_download(
-                    filename=filename,
+                    filename=fname,
                     repo_id=model_id,
                     revision=revision,
-                    local_files_only=False,
+                    local_files_only=HF_HUB_OFFLINE,
                 )
                 logger.info(
-                    f"Downloaded {local_file} in {timedelta(seconds=int(time.time() - start_time))}."
+                    f"Downloaded {local_file} in {timedelta(seconds=int(time.time() - stime))}."
                 )
                 return Path(local_file)
             except Exception as e:
-                if i + 1 == tries:
+                if idx + 1 == tries:
                     raise e
                 logger.error(e)
                 logger.info(f"Retrying in {backoff} seconds")
                 time.sleep(backoff)
-                logger.info(f"Retry {i + 1}/{tries - 1}")
+                logger.info(f"Retry {idx + 1}/{tries - 1}")
 
     # We do this instead of using tqdm because we want to parse the logs with the launcher
     start_time = time.time()
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
new file mode 100644
index 00000000..011e0f63
--- /dev/null
+++ b/server/text_generation_server/utils/import_utils.py
@@ -0,0 +1,75 @@
+import torch
+from loguru import logger
+import subprocess
+import os
+
+
+def is_ipex_available():  # True when intel_extension_for_pytorch can be imported
+    try:
+        import intel_extension_for_pytorch  # probe import only; the name is not used here
+    except ImportError:
+        return False
+    return True
+
+
+def get_cuda_free_memory(device, memory_fraction):
+    """Return free memory on `device` after holding back a (1 - memory_fraction) share of total, floored at 0."""
+    currently_free, _ = torch.cuda.mem_get_info(device)
+    held_back = (1 - memory_fraction) * torch.cuda.get_device_properties(device).total_memory
+    return max(0, currently_free - held_back)
+
+
+def get_xpu_free_memory(device, memory_fraction):
+    """Estimate usable XPU memory: 90% of total memory scaled by the memory
+    fraction, minus what torch.xpu.memory_reserved reports, floored at 0.
+
+    XPU_MEMORY_FRACTION, when set, takes precedence over `memory_fraction`;
+    otherwise the argument is honoured instead of being silently discarded.
+    """
+    total_memory = torch.xpu.get_device_properties(device).total_memory
+    fraction = float(os.getenv("XPU_MEMORY_FRACTION", memory_fraction))
+    budget = int(total_memory * 0.9 * fraction - torch.xpu.memory_reserved(device.index))
+    return max(0, budget)
+
+
+def get_cpu_free_memory(device, memory_fraction):
+    """Return 95% of available system memory divided by WORLD_SIZE; both arguments are ignored."""
+    import psutil
+    from text_generation_server.utils.dist import WORLD_SIZE  # local: dist imports this module
+
+    available = psutil.virtual_memory().available
+    return int(available * 0.95 / WORLD_SIZE)
+
+
+def noop(*args, **kwargs):
+    """Accept any arguments and do nothing; used where a backend has no cache/sync hook."""
+
+
+# Detect the backend once at import time; the first matching branch wins:
+# ROCm build -> CUDA build with a usable device -> IPEX importable -> CPU.
+SYSTEM = None
+if torch.version.hip is not None:
+    SYSTEM = "rocm"
+    empty_cache, synchronize = torch.cuda.empty_cache, torch.cuda.synchronize
+    get_free_memory = get_cuda_free_memory
+elif torch.version.cuda is not None and torch.cuda.is_available():
+    SYSTEM = "cuda"
+    empty_cache, synchronize = torch.cuda.empty_cache, torch.cuda.synchronize
+    get_free_memory = get_cuda_free_memory
+elif is_ipex_available():
+    SYSTEM = "ipex"
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        # An XPU device is available: bind the torch.xpu hooks.
+        empty_cache, synchronize = torch.xpu.empty_cache, torch.xpu.synchronize
+        get_free_memory = get_xpu_free_memory
+    else:
+        # IPEX without an XPU: cache/sync hooks become no-ops.
+        empty_cache, synchronize = noop, noop
+        get_free_memory = get_cpu_free_memory
+else:
+    SYSTEM = "cpu"
+    empty_cache, synchronize = noop, noop
+    get_free_memory = get_cpu_free_memory
+
+# Log the detected backend once on import.
+logger.info(f"Detected system {SYSTEM}")
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
deleted file mode 100644
index 745c1d2e..00000000
--- a/server/text_generation_server/utils/layers.py
+++ /dev/null
@@ -1,583 +0,0 @@
-import os
-import torch
-import torch.distributed
-
-from torch import nn
-from torch.nn import functional as F
-from typing import List
-
-HAS_BITS_AND_BYTES = True
-try:
-    import bitsandbytes as bnb
-    from bitsandbytes.nn import Int8Params, Params4bit
-
-except ImportError:
-    HAS_BITS_AND_BYTES = False
-
-from accelerate import init_empty_weights
-
-from text_generation_server.utils.gptq.quant_linear import QuantLinear
-
-HAS_EXLLAMA = True
-if os.getenv("DISABLE_EXLLAMA") == "True":
-    HAS_EXLLAMA = False
-try:
-    from text_generation_server.utils.gptq.exllama import Ex4bitLinear
-except ImportError:
-    HAS_EXLLAMA = False
-
-from typing import Optional
-
-# Monkey patching
-@classmethod
-def load_layer_norm(cls, prefix, weights, eps):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    bias = weights.get_tensor(f"{prefix}.bias")
-    with init_empty_weights():
-        ln = cls(weight.shape, eps=eps)
-
-    ln.weight = nn.Parameter(weight)
-    ln.bias = nn.Parameter(bias)
-    return ln
-
-
-@classmethod
-def load_layer_norm_no_bias(cls, prefix, weights, eps):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    with init_empty_weights():
-        ln = cls(weight.shape, eps=eps)
-
-    ln.weight = nn.Parameter(weight)
-    ln.bias = None
-    return ln
-
-@classmethod
-def load_conv2d(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    bias = weights.get_tensor(f"{prefix}.bias")
-    with init_empty_weights():
-        conv2d = cls(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride)
-
-    conv2d.weight = nn.Parameter(weight)
-    conv2d.bias = nn.Parameter(bias)
-    return conv2d
-
-
-@classmethod
-def load_conv2d_no_bias(cls, prefix, weights, in_channels, out_channels, kernel_size, stride):
-    weight = weights.get_tensor(f"{prefix}.weight")
-    with init_empty_weights():
-        conv2d = cls(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride)
-
-    conv2d.weight = nn.Parameter(weight)
-    conv2d.bias = None
-    return conv2d
-
-
-torch.nn.Conv2d.load = load_conv2d
-torch.nn.Conv2d.load_no_bias = load_conv2d_no_bias
-torch.nn.LayerNorm.load = load_layer_norm
-torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
-
-
-class FastLinear(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-    ) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(weight)
-        if bias is not None:
-            self.bias = nn.Parameter(bias)
-        else:
-            self.bias = None
-
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        weight = weights.get_tensor(f"{prefix}.weight")
-        if bias:
-            bias = weights.get_tensor(f"{prefix}.bias")
-        else:
-            bias = None
-        return cls(weight, bias)
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        return F.linear(input, self.weight, self.bias)
-
-
-class Linear8bitLt(nn.Module):
-    def __init__(
-        self,
-        weight,
-        bias,
-        has_fp16_weights=True,
-        memory_efficient_backward=False,
-        threshold=0.0,
-        index=None,
-    ):
-        super().__init__()
-        assert (
-            not memory_efficient_backward
-        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
-        self.state = bnb.MatmulLtState()
-        self.index = index
-
-        # Necessary for stacked layers
-        self.state.threshold = threshold
-        self.state.has_fp16_weights = has_fp16_weights
-        self.state.memory_efficient_backward = memory_efficient_backward
-        if threshold > 0.0 and not has_fp16_weights:
-            self.state.use_pool = True
-
-        self.weight = Int8Params(
-            weight.data,
-            has_fp16_weights=has_fp16_weights,
-            requires_grad=has_fp16_weights,
-        )
-        self.weight.cuda(weight.device)
-        self.bias = bias
-
-    def init_8bit_state(self):
-        self.state.CB = self.weight.CB
-        self.state.SCB = self.weight.SCB
-        self.weight.CB = None
-        self.weight.SCB = None
-
-    def forward(self, x: torch.Tensor):
-        self.state.is_training = self.training
-        if self.weight.CB is not None:
-            self.init_8bit_state()
-
-        # weights are cast automatically as Int8Params, but the bias has to be cast manually
-        if self.bias is not None and self.bias.dtype != x.dtype:
-            self.bias.data = self.bias.data.to(x.dtype)
-
-        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
-
-        if not self.state.has_fp16_weights:
-            if self.state.CB is not None and self.state.CxB is not None:
-                # we converted 8-bit row major to turing/ampere format in the first inference pass
-                # we no longer need the row-major weight
-                del self.state.CB
-                self.weight.data = self.state.CxB
-        return out
-
-
-class Linear4bit(nn.Module):
-    def __init__(self, weight, bias, quant_type):
-        super().__init__()
-        self.weight = Params4bit(
-            weight.data, requires_grad=False, compress_statistics=True, quant_type=quant_type
-        )
-        self.compute_dtype = None
-        self.weight.cuda(weight.device)
-        self.bias = bias
-
-    def forward(self, x: torch.Tensor):
-        # weights are cast automatically as Int8Params, but the bias has to be cast manually
-        if self.bias is not None and self.bias.dtype != x.dtype:
-            self.bias.data = self.bias.data.to(x.dtype)
-
-        if getattr(self.weight, "quant_state", None) is None:
-            print(
-                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
-            )
-        inp_dtype = x.dtype
-        if self.compute_dtype is not None:
-            x = x.to(self.compute_dtype)
-
-        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
-        out = bnb.matmul_4bit(
-            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
-        )
-
-        out = out.to(inp_dtype)
-
-        return out
-
-
-def get_linear(weight, bias, quantize):
-    if quantize is None:
-        linear = FastLinear(weight, bias)
-    elif quantize == "bitsandbytes":
-        linear = Linear8bitLt(
-            weight,
-            bias,
-            has_fp16_weights=False,
-            threshold=6.0,
-        )
-        if bias is not None:
-            linear.bias = nn.Parameter(bias)
-    elif quantize == "bitsandbytes-fp4":
-        linear = Linear4bit(
-            weight,
-            bias,
-            quant_type="fp4",
-        )
-    elif quantize == "bitsandbytes-nf4":
-        linear = Linear4bit(
-            weight,
-            bias,
-            quant_type="nf4",
-        )
-    elif quantize == "gptq":
-        try:
-            qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama = weight
-        except Exception:
-            raise NotImplementedError(
-                f"The passed weight is not `gptq` compatible, loader needs to be updated."
-            )
-
-        if use_exllama:
-            linear = Ex4bitLinear(qweight, qzeros, scales, g_idx, bias, bits, groupsize)
-        else:
-            linear = QuantLinear(
-                qweight,
-                qzeros,
-                scales,
-                g_idx,
-                bias,
-                bits,
-                groupsize,
-            )
-    else:
-        raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.")
-    return linear
-
-
-class SuperLayer(nn.Module):
-    def __init__(self, linear):
-        super().__init__()
-        self.linear = linear
-
-    def forward(self, x):
-        return self.linear.forward(x)
-
-
-class TensorParallelHead(SuperLayer):
-    def __init__(self, linear, process_group, should_gather: bool):
-        super().__init__(linear)
-        self.process_group = process_group
-        self.should_gather = should_gather
-
-    @staticmethod
-    def load(config, prefix: str, weights):
-        if weights.process_group.size() > 1:
-            try:
-                weight = weights.get_sharded(f"{prefix}.weight", dim=0)
-                should_gather = True
-            except AssertionError:
-                # If the vocab size is not divisible by number of shards
-                # just load the entire thing.
-                weight = weights.get_tensor(f"{prefix}.weight")
-                should_gather = False
-        else:
-            weight = weights.get_tensor(f"{prefix}.weight")
-            should_gather = False
-
-        # GPTQ doesn't quantize heads (nor embeddings)
-        if config.quantize == "gptq":
-            quantize = None
-        else:
-            quantize = config.quantize
-        return TensorParallelHead(
-            get_linear(weight, bias=None, quantize=quantize),
-            process_group=weights.process_group,
-            should_gather=should_gather,
-        )
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        if not self.should_gather:
-            return super().forward(input)
-
-        world_size = self.process_group.size()
-        if len(input.shape) == 2 and isinstance(self.linear, FastLinear):
-            out_dim = self.linear.weight.shape[0]
-
-            if input.shape[0] == 1:
-                world_out = input.new_empty(1, out_dim * world_size)
-                local_out = input.new_empty(1, out_dim)
-                gather_input = local_out
-            else:
-                world_out = input.new_empty(out_dim * world_size, input.shape[0])
-                gather_input = input.new_empty(out_dim, input.shape[0])
-                local_out = gather_input.T
-
-            torch.mm(input, self.linear.weight.T, out=local_out)
-
-            torch.distributed.all_gather_into_tensor(
-                world_out, gather_input, group=self.process_group
-            )
-
-            if input.shape[0] == 1:
-                return world_out
-            return world_out.T
-
-        output = super().forward(input)
-        world_output = [
-            torch.empty_like(output) for _ in range(self.process_group.size())
-        ]
-        torch.distributed.all_gather(world_output, output, group=self.process_group)
-        world_output = torch.cat(world_output, dim=-1)
-        return world_output
-
-
-class TensorParallelColumnLinear(SuperLayer):
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        return cls.load_multi(config, [prefix], weights, bias, dim=0)
-
-    @classmethod
-    def load_multi(cls, config, prefixes: List[str], weights, bias: bool, dim: int):
-        weight = weights.get_multi_weights_col(
-            prefixes, quantize=config.quantize, dim=dim
-        )
-
-        if bias:
-            b = [weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes]
-            bias = torch.cat(b, dim=dim)
-        else:
-            bias = None
-        linear = get_linear(weight, bias, config.quantize)
-        return cls(linear)
-
-
-class TensorParallelRowLinear(SuperLayer):
-    def __init__(self, linear, process_group):
-        super().__init__(linear)
-        self.process_group = process_group
-
-    @classmethod
-    def load(cls, config, prefix: str, weights, bias: bool):
-        weight = weights.get_multi_weights_row(prefix, quantize=config.quantize)
-
-        if bias and weights.process_group.rank() == 0:
-            # Rank is only on the first rank process
-            bias = weights.get_tensor(f"{prefix}.bias")
-        else:
-            bias = None
-        return cls(
-            get_linear(weight, bias, config.quantize),
-            process_group=weights.process_group,
-        )
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        out = super().forward(input)
-        if self.process_group.size() > 1:
-            torch.distributed.all_reduce(out, group=self.process_group)
-        return out
-
-
-class TensorParallelEmbedding(nn.Module):
-    def __init__(self, prefix: str, weights, reduce=True):
-        super().__init__()
-        weight = weights.get_partial_sharded(f"{prefix}.weight", dim=0)
-        num_embeddings = weights.get_shape(f"{prefix}.weight")[0]
-
-        process_group = weights.process_group
-
-        world_size = process_group.size()
-        rank = process_group.rank()
-
-        block_size = num_embeddings // world_size
-        self.min_id = rank * block_size
-        self.max_id = min(num_embeddings, (rank + 1) * block_size)
-        self.null_idx = block_size
-        self.process_group = weights.process_group
-        self.reduce = reduce
-
-        """Additional 0 entry used for masking"""
-        self.weight = nn.Parameter(F.pad(weight, (0, 0, 0, 1)))
-
-    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        # default all out of bounds values to `self.null_idx` that will then be mapped to 0
-        # translate for [0, self.max_id - self.min_id[
-        input = torch.where(
-            (self.min_id > input) | (input >= self.max_id),
-            self.null_idx,
-            input - self.min_id,
-        )
-        out = torch.nn.functional.embedding(input, self.weight)
-        if self.reduce and self.process_group.size() > 1:
-            torch.distributed.all_reduce(out, group=self.process_group)
-        return out
-
-
-try:
-    import dropout_layer_norm
-
-    class FastLayerNorm(nn.LayerNorm):
-        def forward(self, hidden_states, residual=None):
-            if hidden_states.shape[-1] > 8192:
-                if residual is not None:
-                    hidden_states += residual
-                residual = hidden_states
-
-                return super(FastLayerNorm, self).forward(hidden_states), residual
-            else:
-                (
-                    normed_hidden_states,
-                    residual,
-                    *rest,
-                ) = dropout_layer_norm.dropout_add_ln_fwd(
-                    hidden_states,
-                    residual,
-                    self.weight,
-                    self.bias,
-                    None,
-                    None,
-                    None,
-                    None,
-                    0.0,
-                    self.eps,
-                    1.0,
-                    0,
-                    None,
-                    False,
-                    False,
-                )
-                if residual is None:
-                    residual = hidden_states
-
-                return normed_hidden_states, residual
-
-except ImportError:
-    pass
-
-
-try:
-    from flash_attn.layers.rotary import RotaryEmbedding
-    import rotary_emb
-
-    def _create_inv_freq(dim, base, device):
-        inv_freq = 1.0 / (
-            base
-            ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
-        )
-        return inv_freq
-
-    def _get_rope_config(config):
-        if os.getenv("ROPE_SCALING", None) is not None:
-            rope_scaling = {"type": os.environ["ROPE_SCALING"], "factor": float(os.environ["ROPE_FACTOR"])}
-            return rope_scaling
-        return getattr(config, "rope_scaling", None)
-
-    class PositionRotaryEmbedding(nn.Module):
-        def __init__(self, inv_freq, scaling_factor):
-            super().__init__()
-            self.inv_freq = inv_freq
-            self._seq_len_cached = 0
-            self._cos_cached = None
-            self._sin_cached = None
-            self._cos_k_cached = None
-            self._sin_k_cached = None
-            self.scaling_factor = scaling_factor
-            self.dynamic_args = None
-
-        @classmethod
-        def static(cls, config, dim, base, device):
-            inv_freq = _create_inv_freq(dim, base, device)
-            scaling_factor = None
-            rope_scaling = _get_rope_config(config)
-            if rope_scaling is not None:
-                scaling_factor = rope_scaling["factor"]
-                if rope_scaling["type"] == "linear":
-                    pass
-                elif rope_scaling["type"] == "dynamic":
-                    return DynamicPositionRotaryEmbedding(dim=dim, max_position_embeddings=config.max_position_embeddings, base=base, device=inv_freq.device, scaling_factor=scaling_factor)
-                else:
-                    raise NotImplementedError(f"rope scaling type {rope_scaling['type']} is not implemented or invalid")
-            return cls(inv_freq, scaling_factor)
-
-        @classmethod
-        def load(cls, config, prefix, weights):
-            # XXX: Always load this in float32 !
-            dtype = weights.dtype
-            weights.dtype = torch.float32
-            inv_freq = weights.get_tensor(f"{prefix}.inv_freq")
-            weights.dtype = dtype
-
-            scaling_factor = None
-            rope_scaling = _get_rope_config(config)
-            if rope_scaling is not None:
-                scaling_factor = rope_scaling["factor"]
-                if rope_scaling["type"] == "linear":
-                    pass
-                elif rope_scaling["type"] == "dynamic":
-                    return DynamicPositionRotaryEmbedding(dim=2*inv_freq.shape[0], max_position_embeddings=config.max_position_embeddings, base=10000.0, device=inv_freq.device, scaling_factor=scaling_factor)
-                else:
-                    raise NotImplementedError(f"rope scaling type {rope_scaling['type']} is not implemented or invalid")
-            return cls(inv_freq, scaling_factor)
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                self._seq_len_cached = seqlen
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                if self.scaling_factor is not None:
-                    t /= self.scaling_factor
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
-                self._cos_cached = torch.cos(freqs).to(dtype)
-                self._sin_cached = torch.sin(freqs).to(dtype)
-
-        def get_cos_sin(
-            self, position_ids: torch.Tensor, max_s: int, dtype: torch.dtype
-        ):
-            """
-            Return cos and sin for the asked position ids
-            """
-
-            self._update_cos_sin_cache(dtype, position_ids.device, max_s)
-
-            cos = torch.index_select(self._cos_cached, 0, position_ids)
-            sin = torch.index_select(self._sin_cached, 0, position_ids)
-            return cos.unsqueeze(1), sin.unsqueeze(1)
-
-        def forward(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
-            rotary_dim = cos.shape[-1]
-            x1 = x[..., :rotary_dim]
-            x2 = x[..., rotary_dim : 2 * rotary_dim]
-
-            rotary_emb.apply_rotary(x1, x2, cos, sin, x1, x2, False)
-            return x
-
-    class DynamicPositionRotaryEmbedding(PositionRotaryEmbedding):
-        def __init__(self, dim, max_position_embeddings, base, device, scaling_factor):
-            inv_freq = _create_inv_freq(dim, base, device)
-            super().__init__(inv_freq, scaling_factor)
-            self.dim = dim
-            self.max_position_embeddings = max_position_embeddings
-            self.base = base
-
-        def _update_cos_sin_cache(self, dtype, device, seqlen):
-            # Reset the tables if the sequence length has changed,
-            # or if we're on a new device (possibly due to tracing for instance)
-            if (
-                seqlen > self._seq_len_cached
-                or self._cos_cached.device != device
-                or self._cos_cached.dtype != dtype
-            ):
-                if seqlen > self.max_position_embeddings:
-                    newbase = self.base * ((self.scaling_factor * seqlen / self.max_position_embeddings) - (self.scaling_factor - 1)) ** (self.dim / (self.dim - 2))
-                    self.inv_freq = _create_inv_freq(self.dim, newbase, self.inv_freq.device)
-                self._seq_len_cached = seqlen
-                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
-                # Don't do einsum, it converts fp32 to fp16
-                # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-
-                freqs = torch.outer(t, self.inv_freq.to(device=t.device))
-                self._cos_cached = torch.cos(freqs).to(dtype)
-                self._sin_cached = torch.sin(freqs).to(dtype)
-
-
-except ImportError:
-    pass
diff --git a/server/text_generation_server/utils/log.py b/server/text_generation_server/utils/log.py
new file mode 100644
index 00000000..b1456f1e
--- /dev/null
+++ b/server/text_generation_server/utils/log.py
@@ -0,0 +1,6 @@
+from functools import lru_cache
+
+
+@lru_cache(maxsize=10)
+def log_once(log, msg: str):
+    """Emit ``msg`` through ``log``, skipping repeats that are still cached.
+
+    Calls are memoised on the ``(log, msg)`` pair by a 10-entry LRU cache, so
+    a repeated pair is not emitted again while its entry remains cached. Both
+    arguments must be hashable.
+
+    Args:
+        log: Callable invoked as ``log(msg)``.
+        msg: The message to emit.
+    """
+    log(msg)
+    return None
diff --git a/server/text_generation_server/utils/logits_process.py b/server/text_generation_server/utils/logits_process.py
index f424eae4..6b915437 100644
--- a/server/text_generation_server/utils/logits_process.py
+++ b/server/text_generation_server/utils/logits_process.py
@@ -1,8 +1,15 @@
 import math
 import torch
 
+from loguru import logger
+from typing import Dict, Union
+from text_generation_server.pb.generate_pb2 import GrammarType
+
+from outlines.fsm.fsm import RegexFSM
+from outlines.fsm.json_schema import build_regex_from_schema
 from functools import lru_cache
-from typing import Optional, List, Dict, Union
+from typing import List, Optional, DefaultDict
+import time
 
 from transformers import (
     LogitsWarper,
@@ -118,6 +125,69 @@ class HeterogeneousRepetitionPenaltyLogitsProcessor(LogitsProcessor):
         return None
 
 
+class FrequencyPenaltyLogitsProcessor(LogitsProcessor):
+    r"""
+    Frequency penalty as defined by OpenAI
+
+    Args:
+        penalty (`float`):
+            The parameter for frequency penalty. 0.0 means no penalty.
+    """
+
+    def __init__(self, penalty: float):
+        self.penalty = penalty
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        """Penalise, in place, the logits of tokens listed in ``input_ids``.
+
+        Args:
+            input_ids: Token ids used as column indices into ``scores``
+                (gathered and scattered along dim 1).
+            scores: Logits; modified in place and also returned.
+
+        Returns:
+            ``scores`` after the penalty has been added.
+        """
+        # 0.0 is documented as "no penalty". Without this guard the division
+        # below would yield inf/nan for every non-negative gathered score and
+        # those values would be scattered into the logits.
+        if self.penalty == 0.0:
+            return scores
+
+        score = torch.gather(scores, 1, input_ids)
+        # if score < 0 then penalty has to be multiplied to reduce the previous token probability
+        score = -torch.where(score < 0, score * self.penalty, score / self.penalty)
+        # set score to 0 where input_ids is a padding token
+        # (id 0 is the value treated as padding here)
+        score *= input_ids.ne(0)
+
+        return scores.scatter_add_(1, input_ids, score)
+
+
+class HeterogeneousFrequencyPenaltyLogitsProcessor(LogitsProcessor):
+    r"""
+    Per-row frequency penalty, following the definition given by OpenAI in
+    https://platform.openai.com/docs/guides/text-generation/parameter-details
+
+    Args:
+        penalty (`List[float]`):
+            One frequency penalty per batch row. 0.0 means no penalty.
+        dtype (`torch.dtype`):
+            dtype of the tensor built from `penalty`.
+        device (`torch.device`):
+            Device of the tensor built from `penalty`.
+    """
+
+    def __init__(self, penalty: List[float], dtype: torch.dtype, device: torch.device):
+        self.penalty = penalty
+        # Column vector so that it broadcasts across the vocabulary dimension.
+        penalty_row = torch.tensor(penalty, dtype=dtype, device=device)
+        self.penalty_tensor = penalty_row.unsqueeze(1)
+
+    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
+        """Subtract ``relative frequency * penalty`` from ``scores`` in place."""
+        n_rows, n_seen = input_ids.size()
+        n_vocab = scores.size(1)
+
+        # Count the occurrences of every token id per row, then normalise by
+        # the number of ids in the row.
+        occurrences = torch.zeros(n_rows, n_vocab, device=input_ids.device)
+        ones = torch.ones_like(input_ids, dtype=torch.float)
+        occurrences.scatter_add_(1, input_ids, ones)
+        occurrences /= n_seen
+
+        scores -= occurrences * self.penalty_tensor
+        return scores
+
+    def filter(self, indices):
+        """Keep only the rows in ``indices``.
+
+        Returns ``self`` when at least one remaining penalty is non-zero,
+        otherwise ``None``. ``self.penalty`` is narrowed in both cases; the
+        tensor is only re-indexed when ``self`` is returned.
+        """
+        kept = [self.penalty[i] for i in indices]
+        self.penalty = kept
+        if not any([value != 0.0 for value in kept]):
+            return None
+        self.penalty_tensor = self.penalty_tensor[indices]
+        return self
+
+
 class HeterogeneousTemperatureLogitsWarper:
     r"""
     [`LogitsWarper`] for temperature (exponential scaling output probability distribution).
@@ -408,3 +478,133 @@ class HeterogeneousProcessorWrapper(LogitsProcessor):
             self.processors = new_processors
             return self
         return None
+
+
+class GrammarLogitProcessor(LogitsProcessor):
+    """Restrict logits to the tokens a compiled grammar FSM allows.
+
+    The FSM is compiled from the grammar with the adapted tokenizer; both the
+    tokenizer adaptation and the compilation go through ``lru_cache``.
+    """
+
+    fsm_state: DefaultDict[int, int]
+    fsm: RegexFSM
+
+    def __init__(self, tokenizer, device, grammar, grammar_type):
+        self.device = device
+        adapted = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer)
+        self.tokenizer = adapted
+        self.fsm = GrammarLogitProcessor._cached_compile_fsm(
+            grammar_type, grammar, adapted
+        )
+
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        fsm_grammar_state: int,
+    ):
+        """Return ``logits`` with ``-inf`` added to every disallowed token.
+
+        A state of ``-1`` or a missing FSM leaves ``logits`` untouched.
+        """
+        unconstrained = fsm_grammar_state == -1 or self.fsm is None
+        if unconstrained:
+            return logits
+        permitted = self.fsm.allowed_token_ids(fsm_grammar_state)
+        # Start fully masked, then open up the permitted columns.
+        bias = torch.full_like(logits, -math.inf)
+        bias[:, permitted] = 0
+        return logits + bias
+
+    def advance(self, next_token_id, fsm_grammar_state):
+        """Return the FSM state reached after emitting ``next_token_id``."""
+        return GrammarLogitProcessor._advance(
+            next_token_id, fsm_grammar_state, self.fsm
+        )
+
+    @staticmethod
+    def _advance(next_token_id, fsm_grammar_state, fsm):
+        """Step ``fsm`` once; the state ``-1`` is returned unchanged."""
+        if fsm_grammar_state != -1:
+            return fsm.next_state(fsm_grammar_state, next_token_id)
+        return fsm_grammar_state
+
+    # TODO: move grammar compilation into the router
+    @staticmethod
+    @lru_cache(maxsize=32, typed=True)
+    def _cached_compile_fsm(grammar_type, schema, tokenizer):
+        """Build (and cache) a ``RegexFSM`` for the given grammar."""
+        started = time.time()
+        # A JSON schema is first converted to a regex; a regex grammar is
+        # already in the form RegexFSM takes and is passed through as is.
+        if grammar_type == GrammarType.GRAMMAR_TYPE_JSON:
+            schema = build_regex_from_schema(schema)
+        compiled = RegexFSM(schema, tokenizer)
+        logger.debug(f"Compiled FSM in {time.time() - started:.2f}s")
+        return compiled
+
+    @staticmethod
+    @lru_cache(maxsize=32, typed=True)
+    def _cached_adapt_tokenizer(tokenizer):
+        """Patch ``tokenizer`` in place so it can be used to build the FSM.
+
+        Outlines expects a slightly different tokenizer API than
+        `transformers` provides, so ``vocabulary``, ``special_tokens`` and
+        ``convert_token_to_string`` are attached to the tokenizer. The
+        conversion helper also restores the leading space that HF's Llama
+        tokenizers drop, which is needed to compile FSMs for that model.
+
+        Returns the same tokenizer object that was passed in.
+        """
+        started = time.time()
+        tokenizer.vocabulary = tokenizer.get_vocab()
+        tokenizer.special_tokens = set(tokenizer.all_special_tokens)
+
+        def convert_token_to_string(token: str) -> str:
+            from transformers.file_utils import SPIECE_UNDERLINE
+
+            text = tokenizer.convert_tokens_to_string([token])
+
+            # A hack to handle missing spaces to HF's Llama tokenizers
+            needs_space = token.startswith(SPIECE_UNDERLINE) or token == "<0x20>"
+            return " " + text if needs_space else text
+
+        tokenizer.convert_token_to_string = convert_token_to_string
+        logger.debug(f"Adapted tokenizer in {time.time() - started:.2f}s")
+        return tokenizer
+
+
+class HeterogeneousGrammarLogitProcessor(LogitsProcessor):
+    """Batch variant of the grammar processor: one optional FSM per row.
+
+    ``self.fsms[i]`` is ``None`` for rows whose grammar is empty; such rows
+    are never masked and their state is never advanced.
+    """
+
+    def __init__(self, tokenizer, device, grammars, grammar_types):
+        self.device = device
+        self.tokenizer = GrammarLogitProcessor._cached_adapt_tokenizer(tokenizer)
+        self.fsms = []
+        for grammar, grammar_type in zip(grammars, grammar_types):
+            if len(grammar) == 0:
+                # No grammar for this row: keep a placeholder so that list
+                # positions still line up with batch rows.
+                self.fsms.append(None)
+                continue
+            fsm = GrammarLogitProcessor._cached_compile_fsm(
+                grammar_type, grammar, self.tokenizer
+            )
+            self.fsms.append(fsm)
+
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        fsm_grammar_states: List[int],
+    ):
+        """Mask, in place, the disallowed tokens of every constrained row.
+
+        Rows whose state is ``-1`` or whose FSM is ``None`` are left as they
+        are. Returns the same ``logits`` tensor.
+        """
+        mask = torch.full_like(logits, -math.inf)
+        for i in range(logits.shape[0]):
+            fsm = self.fsms[i]
+            if fsm_grammar_states[i] == -1 or fsm is None:
+                continue
+            allowed_tokens = fsm.allowed_token_ids(fsm_grammar_states[i])
+            mask[i, allowed_tokens] = 0
+            logits[i] += mask[i]
+        return logits
+
+    def advance_batch(self, next_token_ids, fsm_grammar_states):
+        """Return the next FSM state for every row of the batch.
+
+        Delegates to ``advance_at_index`` so that rows without an FSM keep
+        their state instead of calling ``next_state`` on ``None``.
+        """
+        return [
+            self.advance_at_index(next_token_ids[i], fsm_grammar_states[i], i)
+            for i in range(len(next_token_ids))
+        ]
+
+    def advance_at_index(self, next_token_id, fsm_grammar_state, index):
+        """Return the next FSM state for the row at ``index``.
+
+        The state is returned unchanged when that row has no FSM.
+        """
+        if self.fsms[index] is None:
+            return fsm_grammar_state
+        return GrammarLogitProcessor._advance(
+            next_token_id, fsm_grammar_state, self.fsms[index]
+        )
+
+    def filter(self, indices):
+        """Keep only the FSMs of the rows in ``indices``; returns ``self``."""
+        self.fsms = [self.fsms[i] for i in indices]
+        return self
diff --git a/server/text_generation_server/utils/merges/strategies.py b/server/text_generation_server/utils/merges/strategies.py
new file mode 100644
index 00000000..3b885313
--- /dev/null
+++ b/server/text_generation_server/utils/merges/strategies.py
@@ -0,0 +1,223 @@
+import copy
+from abc import ABC
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, List, Tuple, Type, Union
+
+import torch
+
+
+class AdapterParameters:
+    def __init__(
+        self, adapter_ids, weights, merge_strategy, density, majority_sign_method
+    ):
+        self.adapter_ids = adapter_ids
+        self.weights = weights
+        self.merge_strategy = merge_strategy
+        self.density = density
+        self.majority_sign_method = majority_sign_method
+
+
+from text_generation_server.utils.merges.utils import (
+    calculate_majority_sign_mask,
+    disjoint_merge,
+    prune,
+)
+
+if TYPE_CHECKING:
+    from text_generation_server.adapters.lora import LoraConfig
+    from text_generation_server.utils.adapter import ModuleMap
+
+
+def _apply_weights(
+    tensors: Union[torch.Tensor, List[torch.Tensor]], w: torch.Tensor
+) -> torch.Tensor:
+    """Multiply task tensors (stacked on dim 0 if given as a list) by weights `w`."""
+    stacked = (
+        tensors if isinstance(tensors, torch.Tensor) else torch.stack(tensors, dim=0)
+    )
+
+    # Append trailing singleton dims to `w` until it has as many dims as the
+    # stacked tensor, so the product broadcasts one weight per task tensor.
+    missing_dims = stacked.dim() - w.dim()
+    for _ in range(missing_dims):
+        w = w.unsqueeze(-1)
+    return stacked * w
+
+
+class MergeStrategy(ABC):
+    def merge(
+        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
+    ) -> torch.Tensor:
+        raise NotImplementedError()
+
+
+class LinearMerge(MergeStrategy):
+    def __init__(self, **kwargs):
+        pass
+
+    def merge(
+        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
+    ) -> torch.Tensor:
+        weighted_task_tensors = _apply_weights(task_tensors, weights)
+        return weighted_task_tensors.sum(dim=0)
+
+
+class TiesMerge(MergeStrategy):
+    def __init__(self, density: float, majority_sign_method: str = "total", **kwargs):
+        self.density = density
+        self.majority_sign_method = majority_sign_method
+
+    def merge(
+        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
+    ) -> torch.Tensor:
+        # sparsify
+        task_tensors = [
+            prune(tensor, self.density, method="magnitude") for tensor in task_tensors
+        ]
+        task_tensors = torch.stack(task_tensors, dim=0)
+
+        # elect sign before applying weights
+        majority_sign_mask = calculate_majority_sign_mask(
+            task_tensors, method=self.majority_sign_method
+        )
+        weighted_task_tensors = _apply_weights(task_tensors, weights)
+
+        # disjoint merge
+        return disjoint_merge(weighted_task_tensors, majority_sign_mask)
+
+
+class DareLinearMerge(MergeStrategy):
+    def __init__(self, density: float, **kwargs):
+        self.density = density
+
+    def merge(
+        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
+    ) -> torch.Tensor:
+        # sparsify
+        task_tensors = [
+            prune(tensor, self.density, method="random", rescale=True)
+            for tensor in task_tensors
+        ]
+        weighted_task_tensors = _apply_weights(task_tensors, weights)
+        return weighted_task_tensors.sum(dim=0)
+
+
+class DareTiesMerge(MergeStrategy):
+    def __init__(self, density: float, majority_sign_method: str = "total", **kwargs):
+        self.density = density
+        self.majority_sign_method = majority_sign_method
+
+    def merge(
+        self, task_tensors: List[torch.Tensor], weights: torch.Tensor
+    ) -> torch.Tensor:
+        # sparsify
+        task_tensors = [
+            prune(tensor, self.density, method="random", rescale=True)
+            for tensor in task_tensors
+        ]
+        task_tensors = torch.stack(task_tensors, dim=0)
+
+        # elect sign before applying weights
+        majority_sign_mask = calculate_majority_sign_mask(
+            task_tensors, method=self.majority_sign_method
+        )
+        weighted_task_tensors = _apply_weights(task_tensors, weights)
+
+        # disjoint merge
+        mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask)
+        return mixed_task_tensors
+
+
+strategy_registry: Dict[str, Type[MergeStrategy]] = {
+    "linear": LinearMerge,
+    "ties": TiesMerge,
+    "dare_linear": DareLinearMerge,
+    "dare_ties": DareTiesMerge,
+}
+
+
+def merge_adapters(
+    adapters: List[Tuple["ModuleMap", "LoraConfig"]],
+    merge_params: AdapterParameters,
+) -> Tuple["ModuleMap", "LoraConfig"]:
+    # strategy_name = MergeStrategyEnum.Name(merge_params.merge_strategy).lower()
+    strategy_name = "linear"
+
+    weights = merge_params.weights
+    if not weights:
+        weights = torch.ones(len(adapters))
+    else:
+        weights = torch.tensor(weights)
+
+    merge_config = {
+        "density": merge_params.density,
+        # "majority_sign_method": MajoritySignMethodEnum.Name(
+        #     merge_params.majority_sign_method
+        # ).lower(),
+        "majority_sign_method": "total",
+    }
+    merge_strategy = strategy_registry[strategy_name](**merge_config)
+
+    module_maps: Dict[str, Dict[str, Dict[str, List[torch.Tensor]]]] = defaultdict(
+        lambda: defaultdict(lambda: defaultdict(list))
+    )
+    lora_configs = []
+    weight_name_to_adapter_idx = defaultdict(list)
+
+    # input is list of (module_map, lora_config) tuples
+    # convert into dict[k][param_name] -> list of tensors
+    for idx, (module_map, lora_config) in enumerate(adapters):
+        for weight_name, data in module_map.items():
+            weight_name_to_adapter_idx[weight_name].append(idx)
+            for k, (param_data, param_name) in data.items():
+                module_maps[weight_name][k][param_name].append(param_data)
+        lora_configs.append(lora_config)
+
+    # validate lora configs are compatible
+    _validate_lora_configs(lora_configs)
+
+    # merge tensors for each module such that we have a single ModuleMap:
+    # dict[k] -> merged tensor
+    merged_module_map: "ModuleMap" = defaultdict(dict)
+    for weight_name, data in module_maps.items():
+        indices = weight_name_to_adapter_idx[weight_name]
+        param_weights = weights[indices]
+        for k, param_data in data.items():
+            for param_name, tensors in param_data.items():
+                merged_tensor = merge_strategy.merge(tensors, param_weights)
+                merged_module_map[weight_name][k] = (merged_tensor, param_name)
+
+    # merge lora configs
+    merged_lora_config = _merge_lora_configs(lora_configs)
+
+    return merged_module_map, merged_lora_config
+
+
+def _validate_lora_configs(lora_configs: List["LoraConfig"]):
+    # every config must report the same rank `r`
+    ranks = {lora_config.r for lora_config in lora_configs}
+    if len(ranks) > 1:
+        raise ValueError(
+            f"unable to merge adapters, lora configs have different ranks: {ranks}"
+        )
+
+    if not any(len(lora_config.target_modules) for lora_config in lora_configs):
+        raise ValueError(
+            "unable to merge adapters, lora configs have no target modules"
+        )
+
+
+def _merge_lora_configs(lora_configs: List["LoraConfig"]) -> "LoraConfig":
+    merged_lora_config = copy.copy(lora_configs[0])
+
+    # merge target modules as a union operation
+    merged_target_modules = sorted(
+        set(
+            module
+            for lora_config in lora_configs
+            for module in lora_config.target_modules
+        )
+    )
+    merged_lora_config.target_modules = merged_target_modules
+
+    return merged_lora_config
diff --git a/server/text_generation_server/utils/merges/utils.py b/server/text_generation_server/utils/merges/utils.py
new file mode 100644
index 00000000..d9ad3278
--- /dev/null
+++ b/server/text_generation_server/utils/merges/utils.py
@@ -0,0 +1,108 @@
+# coding=utf-8
+# From: https://github.com/huggingface/peft/pull/1364
+# Copyright 2024-present the HuggingFace Inc. team.
+# Modifications by Predibase, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Literal
+
+import torch
+
+
+def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> torch.Tensor:
+    """
+    Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction
+    `density`.
+
+    Args:
+    tensor (`torch.Tensor`):The tensor to prune.
+    density (`float`):The fraction of values to preserve. Should be in [0,1].
+    """
+    mask = torch.zeros_like(tensor).reshape(-1)
+    k = int(density * tensor.reshape(-1).shape[0])
+    top_k = torch.topk(tensor.abs().reshape(-1), k=k, largest=True)
+    mask[top_k[1]] = 1
+    return tensor * mask.reshape(tensor.shape)
+
+
+def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor:
+    """
+    Prune random values of the tensor: each element is kept independently with probability `density`
+    (Bernoulli mask) and zeroed otherwise. Returns the pruned tensor, same shape as the input.
+
+    Args:
+    tensor (`torch.Tensor`):The tensor to prune.
+    density (`float`):The fraction of values to preserve. Should be in [0,1].
+    rescale (`bool`):Whether to divide the kept values by `density` to preserve the expected value of the tensor.
+    """
+    mask = torch.bernoulli(torch.full_like(input=tensor, fill_value=density))
+    pruned_tensor = tensor * mask
+    if rescale:
+        pruned_tensor = torch.div(input=pruned_tensor, other=density)  # out-of-place: keep the result
+    return pruned_tensor
+
+
+def prune(
+    tensor: torch.Tensor,
+    density: float,
+    method: Literal["magnitude", "random"],
+    rescale: bool = False,
+) -> torch.Tensor:
+    """
+    Prune the values of task tensors based on the `method`. `density >= 1` returns the tensor unchanged.
+
+    Args:
+    tensor (`torch.Tensor`):The tensor to prune.
+    density (`float`):The fraction of values to preserve. Should be in [0,1]; negative raises `ValueError`.
+    method (`str`):The method to use to prune. Should be one of ["magnitude", "random"], else `ValueError`.
+    rescale (`bool`):Only used by "random"; whether to rescale the result to preserve the expected value.
+    """
+    if density >= 1:
+        return tensor
+    elif density < 0:
+        raise ValueError(f"Density should be >= 0, got {density}")
+    if method == "magnitude":
+        return magnitude_based_pruning(tensor, density)
+    elif method == "random":
+        return random_pruning(tensor, density, rescale=rescale)
+    else:
+        raise ValueError(f"Unknown method {method}")
+
+
+def calculate_majority_sign_mask(
+    tensor: torch.Tensor, method: Literal["total", "frequency"] = "total"
+):
+    """
+    Get the mask of the majority sign across the task tensors. Task tensors are stacked on dimension 0.
+
+    Args:
+    tensor (`torch.Tensor`):The tensor to get the mask from.
+    method (`str`):The method to use to get the mask. Should be one of ["total", "frequency"].
+    """
+
+    sign = tensor.sign()
+    if method == "total":
+        sign_magnitude = (sign * tensor.abs()).sum(dim=0)
+    elif method == "frequency":
+        sign_magnitude = sign.sum(dim=0)
+    else:
+        raise RuntimeError(f'Unimplemented mask method "{method}"')
+    majority_sign = torch.where(sign_magnitude >= 0, 1, -1)
+    return sign == majority_sign
+
+
+def disjoint_merge(task_tensors, majority_sign_mask):
+    # Sum the masked task tensors over dim 0, then divide by the per-position mask count (at least 1).
+    masked_sum = torch.sum(task_tensors * majority_sign_mask, dim=0)
+    return masked_sum / torch.clamp(majority_sign_mask.sum(dim=0), min=1.0)
diff --git a/server/text_generation_server/utils/peft.py b/server/text_generation_server/utils/peft.py
index be1f9444..0ea89267 100644
--- a/server/text_generation_server/utils/peft.py
+++ b/server/text_generation_server/utils/peft.py
@@ -1,16 +1,16 @@
 import os
-import json
+from typing import Union
 from loguru import logger
 import torch
 
 from transformers import AutoTokenizer
 from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
 
+
 def download_and_unload_peft(model_id, revision, trust_remote_code):
     torch_dtype = torch.float16
 
-    logger.info("Peft model detected.")
-    logger.info("Loading the model it might take a while without feedback")
+    logger.info("Trying to load a Peft model. It might take a while without feedback")
     try:
         model = AutoPeftModelForCausalLM.from_pretrained(
             model_id,
@@ -27,20 +27,42 @@ def download_and_unload_peft(model_id, revision, trust_remote_code):
             trust_remote_code=trust_remote_code,
             low_cpu_mem_usage=True,
         )
-    logger.info(f"Loaded.")
+    logger.info("Peft model detected.")
     logger.info(f"Merging the lora weights.")
 
     base_model_id = model.peft_config["default"].base_model_name_or_path
 
     model = model.merge_and_unload()
-    
+
     os.makedirs(model_id, exist_ok=True)
     cache_dir = model_id
     logger.info(f"Saving the newly created merged model to {cache_dir}")
-    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+    tokenizer = AutoTokenizer.from_pretrained(
+        base_model_id, trust_remote_code=trust_remote_code
+    )
     model.save_pretrained(cache_dir, safe_serialization=True)
     model.config.save_pretrained(cache_dir)
     tokenizer.save_pretrained(cache_dir)
 
 
-
+def download_peft(
+    model_id: Union[str, os.PathLike], revision: str, trust_remote_code: bool
+):
+    torch_dtype = torch.float16
+    try:
+        _model = AutoPeftModelForCausalLM.from_pretrained(
+            model_id,
+            revision=revision,
+            torch_dtype=torch_dtype,
+            trust_remote_code=trust_remote_code,
+            low_cpu_mem_usage=True,
+        )
+    except Exception:
+        _model = AutoPeftModelForSeq2SeqLM.from_pretrained(
+            model_id,
+            revision=revision,
+            torch_dtype=torch_dtype,
+            trust_remote_code=trust_remote_code,
+            low_cpu_mem_usage=True,
+        )
+    logger.info("Peft model downloaded.")
diff --git a/server/text_generation_server/utils/segments.py b/server/text_generation_server/utils/segments.py
new file mode 100644
index 00000000..f5961102
--- /dev/null
+++ b/server/text_generation_server/utils/segments.py
@@ -0,0 +1,66 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/utils/segments.py
+# License:  Apache License Version 2.0, January 2004
+
+from typing import List, Tuple, Union
+
+import torch
+
+
+def find_segments(
+    adapter_indices: Union[torch.Tensor, List[int]]
+) -> Tuple[List[int], List[int]]:
+    segments = [0]
+    segment_indices = []
+
+    if isinstance(adapter_indices, torch.Tensor):
+        # Calling .item() repeatedly on CUDA tensor is very slow, so we move it to CPU first
+        adapter_indices = adapter_indices.cpu().tolist()
+
+    start_index = 0
+    for i in range(1, len(adapter_indices)):
+        if adapter_indices[i] != adapter_indices[i - 1]:
+            segments.append(i)
+            segment_indices.append(adapter_indices[i - 1])
+            start_index = i
+
+    # Handle the last segment
+    if start_index < len(adapter_indices):
+        segments.append(len(adapter_indices))
+        segment_indices.append(adapter_indices[-1])
+
+    return segments, segment_indices
+
+
+class SegmentConcatBuilder:
+    def __init__(self):
+        self.adapter_segment_indices = []
+        self.adapter_segment_tensors = []
+
+    def concat(self, adapter_segments: torch.Tensor, segment_indices: List[int]):
+        # Update adapter segments
+        if self.adapter_segment_tensors:
+            # Because we have already processed at least one batch, remove the 0 start index
+            # from this batch denoting the beginning of the segment, then offset all segment
+            # positions by the value of the last segment in the previous batch to account for
+            # the concatenation.
+            adapter_segments = (
+                adapter_segments[1:] + self.adapter_segment_tensors[-1][-1]
+            )
+
+        if (
+            self.adapter_segment_indices
+            and self.adapter_segment_indices[-1] == segment_indices[0]
+        ):
+            # If the last segment in the previous batch is the same as the first segment in this batch,
+            # then we merge them together into a single segment. In effect, this means removing it from
+            # the segment indices of this batch, and extending the segment span by removing the segment
+            # end index from the previous batch.
+            segment_indices = segment_indices[1:]
+            self.adapter_segment_tensors[-1] = self.adapter_segment_tensors[-1][:-1]
+
+        self.adapter_segment_indices.extend(segment_indices)
+        self.adapter_segment_tensors.append(adapter_segments)
+
+    def build(self) -> Tuple[torch.Tensor, List[int]]:
+        return torch.concat(self.adapter_segment_tensors), self.adapter_segment_indices
diff --git a/server/text_generation_server/utils/sgmv.py b/server/text_generation_server/utils/sgmv.py
new file mode 100644
index 00000000..e0aec25f
--- /dev/null
+++ b/server/text_generation_server/utils/sgmv.py
@@ -0,0 +1,248 @@
+# Origin:   https://github.com/predibase/lorax
+# Path:     lorax/server/lorax_server/utils/sgmv.py
+# License:  Apache License Version 2.0, January 2004
+
+import os
+import warnings
+from functools import lru_cache
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+
+try:
+    import punica_kernels as _kernels
+
+    HAS_SGMV = not bool(os.environ.get("DISABLE_SGMV", ""))
+except ImportError:
+    warnings.warn("Could not import SGMV kernel from Punica, falling back to loop.")
+    _kernels = None
+    HAS_SGMV = False
+
+
+MIN_SGMV_RANK = 8
+MIN_RANK_CUSTOM = 16
+MAX_RANK_CUSTOM = 128
+SGMV_BLOCK_SIZE = 16
+BGMV_MAX_RANK = 64
+
+
+def has_sgmv() -> bool:
+    return HAS_SGMV
+
+
+def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
+    """Zero-pad `t` along `dim` up to the minimum SGMV rank or, above that, the next multiple of the SGMV block size."""
+    if not has_sgmv():
+        return t
+
+    # tensor parallelism will result in effective rank being divided by world_size,
+    # so we need to scale the min rank to offset that effect
+    min_rank = MIN_SGMV_RANK * world_size
+
+    # if we're at or below the min rank, pad up to the min rank
+    # otherwise, pad to the nearest multiple of the block size
+    current_rank = t.size(dim)
+    target_rank = (
+        min_rank
+        if current_rank <= min_rank
+        else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE
+    )
+    if current_rank == target_rank:
+        return t
+
+    pad_size = target_rank - current_rank
+
+    # see complicated pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
+    pad = [0, 0] * t.dim()
+    pad[(t.dim() - dim - 1) * 2 + 1] = pad_size
+    pad = tuple(pad)
+
+    return F.pad(t, pad, mode="constant", value=0.0)
+
+
+def use_cutlass_shrink(lora_rank: int) -> bool:
+    return lora_rank < MIN_RANK_CUSTOM
+
+
+def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
+    if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM:
+        return t.transpose(0, 1)
+    return t
+
+
+# Source: https://github.com/punica-ai/punica/blob/master/src/punica/ops/__init__.py
+def add_lora_sgmv_cutlass(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    wa_ptr: torch.Tensor,
+    wb_ptr: torch.Tensor,
+    s_start: torch.Tensor,
+    s_end: torch.Tensor,
+    layer_idx: int,
+    lora_rank: int,
+):
+    """
+    Semantics:
+        y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])
+
+    Args:
+        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+        x: Shape: `[B, H1]`. Input vectors.
+        wa_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.\
+            Weight matrix shape: `[num_layers, R, H1]`.
+        wb_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.\
+            Weight matrix shape: `[num_layers, R, H2]`.
+        s_start: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices start indices.
+        s_end: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices end indices.
+        layer_idx: Layer index of the weight matrices.
+    """
+    if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM:
+        # Custom SGMV shrink only supports rank 16, 32, 64, 128
+        _add_lora_sgmv_cutlass_legacy(
+            y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank
+        )
+        return
+
+    tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device)
+    tmp2_size = _kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+    tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device)
+    v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+    _kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx)
+    _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx)
+
+
+def _add_lora_sgmv_cutlass_legacy(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    wa_ptr: torch.Tensor,
+    wb_ptr: torch.Tensor,
+    s_start: torch.IntTensor,
+    s_end: torch.IntTensor,
+    layer_idx: int,
+    lora_rank: int,
+):
+    tmp_size = _kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0))
+    tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device)
+    v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+    _kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+    _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+
+@lru_cache(maxsize=1)
+def get_tmp_tensor(device: torch.device) -> torch.Tensor:
+    return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)
+
+
+@lru_cache(maxsize=32)
+def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
+    tmp_size = _kernels.sgmv_cutlass_tmp_size(size)
+    return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
+
+
+def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor:
+    return torch.empty((size,), dtype=torch.uint8, device=device)
+
+
+def get_tmp_expand_size(size: int) -> int:
+    return _kernels.sgmv_cutlass_tmp_size(size)
+
+
+def get_tmp_tensors(
+    nsegments: int, lora_rank: int, device: torch.device
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if use_cutlass_shrink(lora_rank) and has_sgmv():
+        tmp = get_tmp_tensor_for_size(nsegments, device)
+        return tmp, tmp
+    else:
+        tmp_shrink = get_tmp_tensor(device)
+        tmp_expand = get_tmp_tensor_for_size_no_kernels(nsegments, device)
+        return tmp_shrink, tmp_expand
+
+
+def lora_a_sgmv_cutlass(
+    x: torch.Tensor,
+    tmp: torch.Tensor,
+    wa_ptr: torch.Tensor,
+    s_start: torch.IntTensor,
+    s_end: torch.IntTensor,
+    layer_idx: int,
+    lora_rank: int,
+) -> torch.Tensor:
+    v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)
+    if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM:
+        _kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+    else:
+        _kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
+    return v
+
+
+def lora_b_sgmv_cutlass(
+    y: torch.Tensor,
+    v: torch.Tensor,
+    tmp: torch.Tensor,
+    wb_ptr: torch.Tensor,
+    s_start: torch.IntTensor,
+    s_end: torch.IntTensor,
+    layer_idx: int,
+):
+    _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)
+
+
+"""
+Semantics:
+    y[i] += (
+        x[i].unsqueeze(0)
+        @ wa_T_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+        @ wb_T_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+        * scale
+    ).squeeze(0)
+
+Args:
+    y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+    v: Shape: `[B, R]`. Temporary vector.
+    x: Shape: `[B, H1]`. Input vectors.
+    wa_T_all: Shape: `[None, L, R, H1]`. All of the transposed LoRA A matrices.
+    wb_T_all: Shape: `[None, L, H2, R]`. All of the transposed LoRA B matrices.
+    indicies: Shape: `[B]`. Indices of the LoRA weights.
+    layer_idx: Layer index of LoRA weights.
+    scale: Scaling factor.
+"""
+
+
+def add_lora_a_bgmv(
+    v: torch.Tensor,
+    x: torch.Tensor,
+    wa_T_all: torch.Tensor,
+    indicies: torch.LongTensor,
+    layer_idx: int,
+):
+    _kernels.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0)
+
+
+def add_lora_b_bgmv(
+    y: torch.Tensor,
+    v: torch.Tensor,
+    wb_T_all: torch.Tensor,
+    indicies: torch.LongTensor,
+    layer_idx: int,
+):
+    _kernels.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0)
+
+
+def segmented_matmul(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    w: List[torch.Tensor],
+    b: List[torch.Tensor],
+    s_start: torch.IntTensor,
+    s_end: torch.IntTensor,
+):
+    for i in range(len(w)):
+        if s_end[i] - s_start[i] <= 0:
+            continue
+
+        xi = x[s_start[i] : s_end[i]]
+        wi = w[i]
+        bi = b[i]
+        y[s_start[i] : s_end[i]] = F.linear(xi, wi, bi)
diff --git a/server/text_generation_server/utils/speculate.py b/server/text_generation_server/utils/speculate.py
new file mode 100644
index 00000000..a1b37a34
--- /dev/null
+++ b/server/text_generation_server/utils/speculate.py
@@ -0,0 +1,11 @@
+SPECULATE = None
+
+
+def get_speculate() -> int:
+    global SPECULATE
+    return SPECULATE
+
+
+def set_speculate(speculate: int):
+    global SPECULATE
+    SPECULATE = speculate
diff --git a/server/text_generation_server/utils/tokens.py b/server/text_generation_server/utils/tokens.py
index 69177d56..22f86b60 100644
--- a/server/text_generation_server/utils/tokens.py
+++ b/server/text_generation_server/utils/tokens.py
@@ -1,16 +1,21 @@
 import re
-from typing import Callable, List, Optional, Tuple
+from typing import List, Optional, Tuple, Set, Union
 
+import math
 import torch
 from text_generation_server.pb import generate_pb2
-from text_generation_server.pb.generate_pb2 import FinishReason
+from text_generation_server.pb.generate_pb2 import FinishReason, GrammarType
 from text_generation_server.utils.logits_process import (
+    FrequencyPenaltyLogitsProcessor,
+    GrammarLogitProcessor,
     HeterogeneousProcessorWrapper,
     HeterogeneousRepetitionPenaltyLogitsProcessor,
+    HeterogeneousFrequencyPenaltyLogitsProcessor,
     HeterogeneousTemperatureLogitsWarper,
     HeterogeneousTopKLogitsWarper,
     HeterogeneousTopPLogitsWarper,
     HeterogeneousTypicalLogitsWarper,
+    HeterogeneousGrammarLogitProcessor,
     static_warper,
 )
 from text_generation_server.utils.watermark import WatermarkLogitsProcessor
@@ -20,24 +25,40 @@ from transformers import PreTrainedTokenizerBase, RepetitionPenaltyLogitsProcess
 class NextTokenChooser:
     def __init__(
         self,
-        watermark=False,
-        temperature=1.0,
-        repetition_penalty=1.0,
-        top_k=None,
-        top_p=None,
-        typical_p=None,
-        do_sample=False,
-        seed=0,
-        device="cpu",
+        watermark: bool = False,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.0,
+        frequency_penalty: float = 0.0,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        typical_p: Optional[float] = None,
+        do_sample: bool = False,
+        seed: int = 0,
+        device: str = "cpu",
+        tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        grammar: str = "",
+        grammar_type: GrammarType = GrammarType.GRAMMAR_TYPE_NONE,
+        fsm_grammar_state: int = 0,
     ):
         self.watermark_processor = (
             WatermarkLogitsProcessor(device=device) if watermark else None
         )
         self.repetition_processor = (
             RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)
-            if repetition_penalty
+            if repetition_penalty and repetition_penalty != 1.0
             else None
         )
+        self.frequency_processor = (
+            FrequencyPenaltyLogitsProcessor(penalty=frequency_penalty)
+            if frequency_penalty and frequency_penalty != 0.0
+            else None
+        )
+        self.grammar_processor = (
+            GrammarLogitProcessor(tokenizer, device, grammar, grammar_type)
+            if grammar != ""
+            else None
+        )
+        self.tokenizer = tokenizer
 
         has_warpers = (
             (temperature is not None and temperature != 1.0)
@@ -53,13 +74,20 @@ class NextTokenChooser:
             self.static_warper = None
 
         sampling = do_sample or has_warpers
+
         self.choice = Sampling(seed, device) if sampling else Greedy()
+        self.fsm_grammar_state = fsm_grammar_state
+        self.grammar = grammar
 
     def __call__(self, input_ids, scores):
         if self.watermark_processor is not None:
             scores = self.watermark_processor(input_ids, scores)
         if self.repetition_processor is not None:
             scores = self.repetition_processor(input_ids, scores)
+        if self.frequency_processor is not None:
+            scores = self.frequency_processor(input_ids, scores)
+        if self.grammar_processor is not None:
+            scores = self.grammar_processor(scores, self.fsm_grammar_state)
 
         if self.static_warper is None:
             next_logprob = torch.log_softmax(scores, -1)
@@ -70,29 +98,41 @@ class NextTokenChooser:
 
         return next_id, next_logprob
 
+    def advance_grammar(self, next_id: int):
+        if self.grammar_processor is not None:
+            self.fsm_grammar_state = self.grammar_processor.advance(
+                next_id, self.fsm_grammar_state
+            )
+        return self
+
     @classmethod
     def from_pb(
         cls,
         pb: generate_pb2.NextTokenChooserParameters,
         device: torch.device,
+        tokenizer: PreTrainedTokenizerBase,
     ) -> "NextTokenChooser":
         return NextTokenChooser(
             watermark=pb.watermark,
             temperature=pb.temperature,
             repetition_penalty=pb.repetition_penalty,
+            frequency_penalty=pb.frequency_penalty,
             top_k=pb.top_k,
             top_p=pb.top_p,
             typical_p=pb.typical_p,
             do_sample=pb.do_sample,
             seed=pb.seed,
             device=device,
+            tokenizer=tokenizer,
+            grammar=pb.grammar,
+            grammar_type=pb.grammar_type,
         )
 
 
 class StopSequenceCriteria:
     def __init__(self, stop_sequence: str):
         stop_sequence = re.escape(stop_sequence)
-        self.regex = re.compile(f".*{stop_sequence}$")
+        self.regex = re.compile(f"{stop_sequence}$")
 
     def __call__(self, output: str) -> bool:
         if self.regex.findall(output):
@@ -103,12 +143,22 @@ class StopSequenceCriteria:
 class StoppingCriteria:
     def __init__(
         self,
-        eos_token_id: int,
+        eos_token_ids: Optional[Union[Set[int], int]],
         stop_sequence_criterias: List[StopSequenceCriteria],
         max_new_tokens: int = 20,
         ignore_eos_token: bool = False,
     ):
-        self.eos_token_id = eos_token_id
+        if eos_token_ids is None:
+            eos_token_ids = set()
+        elif isinstance(eos_token_ids, int):
+            eos_token_ids = set([eos_token_ids])
+        elif isinstance(eos_token_ids, set):
+            eos_token_ids = eos_token_ids
+        else:
+            raise RuntimeError(
+                f"eos_token_ids is of invalid type {type(eos_token_ids)}, expected int, None or set[int]"
+            )
+        self.eos_token_ids = eos_token_ids
         self.stop_sequence_criterias = stop_sequence_criterias
         self.max_new_tokens = max_new_tokens
         self.current_tokens = 0
@@ -120,13 +170,21 @@ class StoppingCriteria:
         if self.current_tokens >= self.max_new_tokens:
             return True, FinishReason.FINISH_REASON_LENGTH
 
-        if not self.ignore_eos_token and last_token == self.eos_token_id:
+        if isinstance(last_token, torch.Tensor):
+            last_token = last_token.item()
+
+        if not self.ignore_eos_token and last_token in self.eos_token_ids:
             return True, FinishReason.FINISH_REASON_EOS_TOKEN
 
-        self.current_output += last_output
-        for stop_sequence_criteria in self.stop_sequence_criterias:
-            if stop_sequence_criteria(self.current_output):
-                return True, FinishReason.FINISH_REASON_STOP_SEQUENCE
+        if self.stop_sequence_criterias:
+            self.current_output += last_output
+            # There is no need to keep an output that is too long
+            if len(self.current_output) > 300:
+                # Slice to -200 to avoid doing it all the time
+                self.current_output = self.current_output[-200:]
+            for stop_sequence_criteria in self.stop_sequence_criterias:
+                if stop_sequence_criteria(self.current_output):
+                    return True, FinishReason.FINISH_REASON_STOP_SEQUENCE
 
         return False, None
 
@@ -139,14 +197,40 @@ class StoppingCriteria:
         stop_sequence_criterias = [
             StopSequenceCriteria(sequence) for sequence in pb.stop_sequences
         ]
+        # TODO Hack because eos_token_id cannot be what we want.
+        eos_token_id = getattr(tokenizer, "_eos_token_ids", tokenizer.eos_token_id)
         return StoppingCriteria(
-            tokenizer.eos_token_id,
+            eos_token_id,
             stop_sequence_criterias,
             pb.max_new_tokens,
             pb.ignore_eos_token,
         )
 
 
+def create_n_gram_speculation(
+    input_ids: torch.Tensor,
+    next_ids: torch.Tensor,
+    accepted_ids: torch.Tensor,
+    speculate: int,
+    verbose: bool,
+):
+    # Very simple approach: find the first match in the input ids.
+    # This is much less refined than actual n-gram matching but seems to work
+    # reasonably well in grounded mode, and it is far faster, with a much
+    # better worst-case complexity, since everything happens on device.
+    B = accepted_ids.shape[0]
+    device = input_ids.device
+    seeds = next_ids[accepted_ids.cumsum(dim=-1) - 1]
+    indices = (input_ids == seeds.unsqueeze(-1)).max(dim=1).indices + 1
+    all_indices = indices.unsqueeze(-1).expand(B, speculate) + torch.arange(
+        speculate, device=device
+    )
+    all_indices = torch.clamp(all_indices, max=input_ids.shape[1] - 1)
+
+    speculative_ids = input_ids.gather(dim=-1, index=all_indices)
+    return speculative_ids
+
+
 class HeterogeneousNextTokenChooser:
     def __init__(
         self,
@@ -155,11 +239,16 @@ class HeterogeneousNextTokenChooser:
         watermark: List[bool],
         temperature: List[float],
         repetition_penalty: List[float],
+        frequency_penalty: List[float],
         top_k: List[int],
         top_p: List[float],
         typical_p: List[float],
         do_sample: List[bool],
         seeds: List[int],
+        tokenizer: PreTrainedTokenizerBase,
+        grammars: List[str],
+        grammar_types: List[int],
+        fsm_grammar_states=List[int],
     ):
         warpers = []
 
@@ -183,7 +272,23 @@ class HeterogeneousNextTokenChooser:
             else None
         )
 
-        if any([x != 1.0 for x in temperature]):
+        self.frequency_processor = (
+            HeterogeneousFrequencyPenaltyLogitsProcessor(
+                frequency_penalty, dtype, device
+            )
+            if any([x != 0.0 for x in frequency_penalty])
+            else None
+        )
+
+        self.grammar_processor = (
+            HeterogeneousGrammarLogitProcessor(
+                tokenizer, device, grammars, grammar_types
+            )
+            if any([grammar != "" for grammar in grammars])
+            else None
+        )
+
+        if any(x != 1.0 for x in temperature):
             do_sample = [
                 sample or x != 1.0 for x, sample in zip(temperature, do_sample)
             ]
@@ -191,15 +296,15 @@ class HeterogeneousNextTokenChooser:
                 HeterogeneousTemperatureLogitsWarper(temperature, dtype, device)
             )
 
-        if any([x != 0 for x in top_k]):
+        if any(x != 0 for x in top_k):
             do_sample = [sample or x != 0 for x, sample in zip(top_k, do_sample)]
             warpers.append(HeterogeneousTopKLogitsWarper(top_k, device))
 
-        if any([x < 1.0 for x in top_p]):
+        if any(x < 1.0 for x in top_p):
             do_sample = [sample or x < 1.0 for x, sample in zip(top_p, do_sample)]
             warpers.append(HeterogeneousTopPLogitsWarper(top_p, dtype, device))
 
-        if any([x < 1.0 for x in typical_p]):
+        if any(x < 1.0 for x in typical_p):
             do_sample = [sample or x < 1.0 for x, sample in zip(typical_p, do_sample)]
             warpers.append(HeterogeneousTypicalLogitsWarper(typical_p, dtype, device))
 
@@ -214,21 +319,118 @@ class HeterogeneousNextTokenChooser:
         self.do_sample = do_sample
         self.dtype = dtype
         self.device = device
+        self.tokenizer = tokenizer
+        self.fsm_grammar_states = fsm_grammar_states
+        self.grammars = grammars
+        self.grammar_types = grammar_types
 
-    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor):
-        if self.watermark_processor is not None:
-            scores = self.watermark_processor(input_ids, scores)
-        if self.repetition_processor is not None:
-            scores = self.repetition_processor(input_ids, scores)
+    def __call__(
+        self,
+        input_ids: torch.Tensor,
+        scores: torch.Tensor,
+        speculate: int,
+        speculated_ids: Optional[torch.Tensor] = None,
+        speculative_scores: Optional[torch.Tensor] = None,
+        verbose=False,
+    ):
+        if speculated_ids is not None:
+            B = scores.shape[0] // (speculated_ids.shape[1] + 1)
+            S = speculated_ids.shape[1] + 1
+            scores = scores.view(B, S, -1)
+        else:
+            B = scores.shape[0]
+            S = 1
+            scores = scores.view(B, S, -1)
 
-        for warper in self.warpers:
-            scores = warper(input_ids, scores)
+        next_ids = torch.zeros((B, S), device=scores.device, dtype=torch.long)
+
+        for j in range(S):
+            _scores = scores[:, j]
+            if self.watermark_processor is not None:
+                _scores = self.watermark_processor(input_ids, _scores)
+            if self.repetition_processor is not None:
+                _scores = self.repetition_processor(input_ids, _scores)
+            if self.frequency_processor is not None:
+                _scores = self.frequency_processor(input_ids, _scores)
+            if self.grammar_processor is not None:
+                _scores = self.grammar_processor(_scores, self.fsm_grammar_states)
+            for warper in self.warpers:
+                _scores = warper(input_ids, _scores)
+            _next_ids = self.choice(_scores)
+            scores[:, j] = _scores
+            next_ids[:, j] = _next_ids
+        next_ids = next_ids.view(B * S)
+        allscores = scores.view(B * S, -1)
+        alllogprobs = torch.log_softmax(allscores, -1)
+
+        if speculated_ids is not None:
+            accepted_ids = []
+            B = next_ids.shape[0] // (speculated_ids.shape[1] + 1)
+            S = speculated_ids.shape[1] + 1
+            indices = []
+            for i in range(B):
+                _next_ids = next_ids[i * S : (i + 1) * S]
+                _speculated_ids = speculated_ids[i]
+                validate_speculative = _next_ids[:-1] == _speculated_ids
+                index = i * S
+                accepted = 1
+                # First is always valid
+                indices.append(index)
+                for valid in validate_speculative.tolist():
+                    if valid:
+                        index += 1
+                        accepted += 1
+                        indices.append(index)
+                    else:
+                        break
+                accepted_ids.append(accepted)
+
+            accepted_ids = torch.tensor(
+                accepted_ids, device=input_ids.device, dtype=input_ids.dtype
+            )
+            next_ids = next_ids[indices]
+            logprobs = alllogprobs[indices]
+            indices = torch.arange(B, device=input_ids.device) * S
+            if speculative_scores is not None:
+                speculative_scores = speculative_scores[indices + accepted_ids - 1]
+        else:
+            accepted_ids = torch.ones_like(next_ids)
+            logprobs = alllogprobs
 
-        next_ids = self.choice(scores)
-        logprobs = torch.log_softmax(scores, -1)
         next_logprobs = torch.gather(logprobs, 1, next_ids.view(-1, 1)).view(-1)
 
-        return next_ids, next_logprobs, logprobs
+        if speculate > 0:
+            if speculative_scores is not None:
+                # Medusa provided some scores
+                speculative_ids = Greedy()(speculative_scores)
+            else:
+                # n-gram
+                speculative_ids = create_n_gram_speculation(
+                    input_ids, next_ids, accepted_ids, speculate, verbose
+                )
+        else:
+            speculative_ids = None
+
+        return next_ids, next_logprobs, alllogprobs, accepted_ids, speculative_ids
+
+    def advance_grammar(self, next_ids: List[int]):
+        if self.grammar_processor is not None:
+            other_new_states = self.grammar_processor.advance_batch(
+                next_ids, self.fsm_grammar_states
+            )
+            self.fsm_grammar_states = other_new_states
+        return self
+
+    def advance_grammar_single(self, grammar_state_index: int, next_id: int):
+        if self.grammar_processor is not None:
+            self.fsm_grammar_states[grammar_state_index] = (
+                self.grammar_processor.advance_at_index(
+                    next_id,
+                    self.fsm_grammar_states[grammar_state_index],
+                    grammar_state_index,
+                )
+            )
+        return self
 
     def filter(self, indices):
         if self.watermark_processor is not None:
@@ -237,6 +439,12 @@ class HeterogeneousNextTokenChooser:
         if self.repetition_processor is not None:
             self.repetition_processor = self.repetition_processor.filter(indices)
 
+        if self.frequency_processor is not None:
+            self.frequency_processor = self.frequency_processor.filter(indices)
+
+        if self.grammar_processor is not None:
+            self.grammar_processor = self.grammar_processor.filter(indices)
+
         filtered_warpers = []
         for warper in self.warpers:
             filtered_warper = warper.filter(indices)
@@ -247,6 +455,18 @@ class HeterogeneousNextTokenChooser:
         self.seeds = [self.seeds[i] for i in indices]
         self.do_sample = [self.do_sample[i] for i in indices]
 
+        new_grammars = []
+        new_fsm_grammar_states = []
+        new_grammar_types = []
+        for i in indices:
+            new_grammars.append(self.grammars[i])
+            new_fsm_grammar_states.append(self.fsm_grammar_states[i])
+            new_grammar_types.append(self.grammar_types[i])
+
+        self.grammars = new_grammars
+        self.fsm_grammar_states = new_fsm_grammar_states
+        self.grammar_types = new_grammar_types
+
         if any(self.do_sample):
             self.choice.filter(indices)
         else:
@@ -260,11 +480,14 @@ class HeterogeneousNextTokenChooser:
         pb: List[generate_pb2.NextTokenChooserParameters],
         dtype: torch.dtype,
         device: torch.device,
+        tokenizer: PreTrainedTokenizerBase,
+        fsm_grammar_states: Optional[List[int]] = None,
     ) -> "HeterogeneousNextTokenChooser":
         return HeterogeneousNextTokenChooser(
             watermark=[pb_.watermark for pb_ in pb],
             temperature=[pb_.temperature for pb_ in pb],
             repetition_penalty=[pb_.repetition_penalty for pb_ in pb],
+            frequency_penalty=[pb_.frequency_penalty for pb_ in pb],
             top_k=[pb_.top_k for pb_ in pb],
             top_p=[pb_.top_p for pb_ in pb],
             typical_p=[pb_.typical_p for pb_ in pb],
@@ -272,6 +495,12 @@ class HeterogeneousNextTokenChooser:
             seeds=[pb_.seed for pb_ in pb],
             device=device,
             dtype=dtype,
+            tokenizer=tokenizer,
+            grammars=[pb_.grammar for pb_ in pb],
+            grammar_types=[pb_.grammar_type for pb_ in pb],
+            fsm_grammar_states=(
+                fsm_grammar_states if fsm_grammar_states else [0] * len(pb)
+            ),
         )
 
 
@@ -337,8 +566,11 @@ class HeterogeneousSampling:
 
 
 def batch_top_tokens(
-    top_n_tokens: list[int], top_n_tokens_tensor: torch.Tensor, logprobs: torch.Tensor
-) -> Tuple[List[List[int]], List[List[float]]]:
+    top_n_tokens: List[int],
+    top_n_tokens_tensor: torch.Tensor,
+    logprobs: torch.Tensor,
+    accepted_ids: torch.Tensor,
+) -> Tuple[List[List[List[int]]], List[List[List[float]]]]:
     """Find the top n most likely tokens for a batch of generations.
 
     When multiple tokens have equal probabilities and they don't all fit, the
@@ -347,14 +579,22 @@ def batch_top_tokens(
     max_top_n = max(top_n_tokens)
     # Early exit when top_n_tokens is not used
     if max_top_n == 0:
-        return [[]] * len(top_n_tokens), [[]] * len(top_n_tokens)
+        return [[[]]] * len(top_n_tokens), [[[]]] * len(top_n_tokens)
 
+    batch_size = accepted_ids.shape[0]
+    speculate_size = logprobs.shape[0] // batch_size
+    top_n_tokens_tensor = top_n_tokens_tensor.repeat_interleave(speculate_size)
     # Ensure top_n doesn't exceed vocab size
-    top_n_tokens = [min(tok, logprobs.size(-1)) for tok in top_n_tokens]
+    top_n_tokens = [
+        min(tok, logprobs.size(-1))
+        for tok in top_n_tokens
+        for _ in range(speculate_size)
+    ]
 
     # Parallel kthvalue adapted from https://discuss.pytorch.org/t/how-to-efficiently-get-the-k-th-largest-values-in-parallel/160529/2
     # Sorted topk is faster than torch.sort() since we only need a small subset
-    sorted_top_k = torch.topk(logprobs, k=max_top_n, dim=1, sorted=True).values
+    sorted_top_k = torch.topk(logprobs, k=max_top_n, dim=-1, sorted=True).values
+
     nth_highest = torch.gather(
         sorted_top_k, 1, (top_n_tokens_tensor - 1).clip(min=0).unsqueeze(1)
     )
@@ -364,20 +604,43 @@ def batch_top_tokens(
     top_n_indices = (logprobs >= nth_highest).nonzero()
     _, top_n_ishes = torch.unique_consecutive(top_n_indices[:, 0], return_counts=True)
 
+    k = 1 if top_n_ishes.numel() == 0 else top_n_ishes.max()
     # Take a new topk for these new max n values
-    top_k = torch.topk(logprobs, k=top_n_ishes.max(), dim=1, sorted=True)
+    top_k = torch.topk(logprobs, k=k, dim=1, sorted=True)
 
     top_n_ishes = top_n_ishes.tolist()
     top_indices = top_k.indices.tolist()
     top_values = top_k.values.tolist()
 
-    return (
-        [
-            idxs[:n] if req_n > 0 else []
-            for idxs, n, req_n in zip(top_indices, top_n_ishes, top_n_tokens)
-        ],
-        [
-            vals[:n] if req_n > 0 else []
-            for vals, n, req_n in zip(top_values, top_n_ishes, top_n_tokens)
-        ],
-    )
+    batch_top_token_ids = []
+    batch_top_token_logprobs = []
+    accepted_ids_list = accepted_ids.tolist()
+    for i, n_accepted_ids in enumerate(accepted_ids_list):
+        start = speculate_size * i
+        stop = speculate_size * (i + 1)
+        _top_indices = top_indices[start:stop]
+        _top_values = top_values[start:stop]
+        _top_n_ishes = top_n_ishes[start:stop]
+        _top_n_tokens = top_n_tokens[start:stop]
+
+        _top_indices = _top_indices[:n_accepted_ids]
+        _top_values = _top_values[:n_accepted_ids]
+        _top_n_ishes = _top_n_ishes[:n_accepted_ids]
+        _top_n_tokens = _top_n_tokens[:n_accepted_ids]
+
+        row_top_token_ids = []
+        row_top_token_logprobs = []
+
+        for idxs, vals, n, req_n in zip(
+            _top_indices, _top_values, _top_n_ishes, _top_n_tokens
+        ):
+            indices = idxs[:n] if req_n > 0 else []
+            values = vals[:n] if req_n > 0 else []
+
+            row_top_token_ids.append(indices)
+            row_top_token_logprobs.append(values)
+
+        batch_top_token_ids.append(row_top_token_ids)
+        batch_top_token_logprobs.append(row_top_token_logprobs)
+
+    return batch_top_token_ids, batch_top_token_logprobs
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
index ef662ce1..3731fd24 100644
--- a/server/text_generation_server/utils/weights.py
+++ b/server/text_generation_server/utils/weights.py
@@ -1,11 +1,13 @@
 import os
 from pathlib import Path
-from typing import List, Dict, Optional, Tuple
+from typing import Dict, List, Optional, Union
 from safetensors import safe_open, SafetensorError
 import torch
 from loguru import logger
 from huggingface_hub import hf_hub_download
 import json
+from text_generation_server.layers.gptq import GPTQParams
+from text_generation_server.utils.log import log_once
 
 
 class Weights:
@@ -16,6 +18,7 @@ class Weights:
         dtype,
         process_group,
         aliases: Optional[Dict[str, List[str]]] = None,
+        prefix: Optional[str] = None,
     ):
         routing = {}
         for filename in filenames:
@@ -33,6 +36,7 @@ class Weights:
         self.device = device
         self.dtype = dtype
         self.process_group = process_group
+        self.prefix = prefix
         self._handles = {}
 
     def _get_handle(self, filename):
@@ -43,15 +47,21 @@ class Weights:
         return self._handles[filename]
 
     def get_filename(self, tensor_name: str) -> (str, str):
-        filename = self.routing.get(tensor_name, None)
-        if filename is None:
-            aliases = self.aliases.get(tensor_name, [])
+        names = [tensor_name]
+        if self.prefix is not None:
+            prefixed = f"{self.prefix}.{tensor_name}"
+            names.append(prefixed)
+        for name in names:
+            filename = self.routing.get(name, None)
+            if filename is not None:
+                return str(filename), name
+
+            aliases = self.aliases.get(name, [])
             for alias in aliases:
                 filename = self.routing.get(alias, None)
                 if filename is not None:
                     return str(filename), alias
-            raise RuntimeError(f"weight {tensor_name} does not exist")
-        return str(filename), tensor_name
+        raise RuntimeError(f"weight {tensor_name} does not exist")
 
     def _get_slice(self, tensor_name: str):
         filename, tensor_name = self.get_filename(tensor_name)
@@ -62,26 +72,28 @@ class Weights:
     def get_shape(self, tensor_name: str):
         return self._get_slice(tensor_name).get_shape()
 
-    def get_tensor(self, tensor_name: str):
+    def get_tensor(self, tensor_name: str, to_device=True):
         filename, tensor_name = self.get_filename(tensor_name)
         f = self._get_handle(filename)
         tensor = f.get_tensor(tensor_name)
         # Special case for gptq which shouldn't convert
-        # u4 which are disguised as int32
-        if tensor.dtype not in [torch.int32, torch.int64]:
+        # u4 which are disguised as int32. Exl2 uses int16
+        # as well.
+        if tensor.dtype not in [torch.int16, torch.int32, torch.int64]:
             tensor = tensor.to(dtype=self.dtype)
-        tensor = tensor.to(device=self.device)
+        if to_device:
+            tensor = tensor.to(device=self.device)
         return tensor
 
     def get_partial_sharded(self, tensor_name: str, dim: int):
         filename, tensor_name = self.get_filename(tensor_name)
+        f = self._get_handle(filename)
+        slice_ = f.get_slice(tensor_name)
         world_size = self.process_group.size()
         rank = self.process_group.rank()
 
-        f = self._get_handle(filename)
-        slice_ = f.get_slice(tensor_name)
         size = slice_.get_shape()[dim]
-        block_size = size // world_size
+        block_size = (size + world_size - 1) // world_size
         start = rank * block_size
         stop = (rank + 1) * block_size
 
@@ -92,8 +104,8 @@ class Weights:
         else:
             raise NotImplementedError("Let's make that generic when needed")
         # Special case for gptq which shouldn't convert
-        # u4 which are disguised as int32
-        if tensor.dtype != torch.int32:
+        # u4 which are disguised as int32. exl2 uses int16.
+        if tensor.dtype not in (torch.int16, torch.int32):
             tensor = tensor.to(dtype=self.dtype)
         tensor = tensor.to(device=self.device)
         return tensor
@@ -109,51 +121,461 @@ class Weights:
         ), f"The choosen size {size} is not compatible with sharding on {world_size} shards"
         return self.get_partial_sharded(tensor_name, dim)
 
+    def get_packed_sharded(
+        self, tensor_name: str, dim: int, block_sizes: Union[int, List[int]]
+    ) -> torch.Tensor:
+        """
+        Get a shard from a tensor that packs multiple tensors.
+
+        When a tensor packs multiple tensors (such as QKV or an up
+        projection + gate projection), sharding with `get_sharded` is not
+        safe since it would not split the packed tensors across shards.
+
+        This method shards a tensor, such that the packed tensors are
+        split across shards.
+
+        The columns are split into equally sized blocks when `block_sizes` is an
+        `int`, or into blocks proportional to the given sizes. For instance,
+        `[2, 1, 1]` will divide an input with dimensionality `1024` into
+        `[512, 256, 256]`. This is convenient for e.g. splitting QKV without
+        knowing the storage details of quantized weights.
+        """
+        slice_ = self._get_slice(tensor_name)
+        total_size = slice_.get_shape()[dim]
+        block_sizes = _blocks_to_block_sizes(total_size=total_size, blocks=block_sizes)
+
+        world_size = self.process_group.size()
+        rank = self.process_group.rank()
+
+        tensors = []
+        block_offset = 0
+        for block_size in block_sizes:
+            assert (
+                block_size % world_size == 0
+            ), f"Prepacked tensor cannot be sharded across {world_size} shards"
+            shard_block_size = block_size // world_size
+            start = rank * shard_block_size
+            stop = (rank + 1) * shard_block_size
+            if dim == 0:
+                tensor = slice_[block_offset + start : block_offset + stop]
+            elif dim == 1:
+                tensor = slice_[:, block_offset + start : block_offset + stop]
+            else:
+                raise NotImplementedError("Currently only dim=0 or dim=1 is supported")
+            tensors.append(tensor)
+            block_offset += block_size
+        tensor = torch.cat(tensors, dim=dim)
+        tensor = tensor.to(device=self.device)
+
+        # Avoid casting quantizer dtypes.
+        if tensor.dtype not in [torch.int16, torch.int32, torch.int64]:
+            tensor = tensor.to(dtype=self.dtype)
+
+        return tensor
+
+    def get_weights_col_packed_qkv(
+        self,
+        prefix: str,
+        quantize: str,
+        num_heads: int,
+        num_key_value_heads: int,
+    ):
+        return self.get_weights_col_packed(
+            prefix, quantize, [num_heads, num_key_value_heads, num_key_value_heads]
+        )
+
+    def get_weights_col_packed_gate_up(self, prefix: str, quantize: str):
+        return self.get_weights_col_packed(prefix, quantize, 2)
+
+    def get_weights_col_packed(
+        self, prefix: str, quantize: str, block_sizes: Union[int, List[int]]
+    ):
+        """
+        Specific to the case where the underlying tensor is a simple concatenation
+        of Q, K, V rather than Q, K, V already alternating within the main tensor.
+
+        The columns are split into equally sized blocks when `block_sizes` is an
+        `int`, or into blocks proportional to the given sizes. For instance,
+        `[2, 1, 1]` will divide an input with dimensionality `1024` into
+        `[512, 256, 256]`. This is convenient for e.g. splitting QKV without
+        knowing the storage details of quantized weights.
+        """
+        if quantize in ["gptq", "awq"]:
+            from text_generation_server.layers.gptq import GPTQWeight
+            from text_generation_server.layers.marlin import (
+                can_use_gptq_marlin,
+                repack_gptq_for_marlin,
+            )
+
+            try:
+                qweight = self.get_packed_sharded(
+                    f"{prefix}.qweight", dim=1, block_sizes=block_sizes
+                )
+            except RuntimeError:
+                raise RuntimeError(
+                    f"Cannot load `{quantize}` weight, make sure the model is already quantized."
+                )
+            scales = self.get_packed_sharded(
+                f"{prefix}.scales", dim=1, block_sizes=block_sizes
+            )
+            scales = scales.to(dtype=self.dtype)
+
+            gptq_params = self._get_gptq_params()
+            if can_use_gptq_marlin(gptq_params, quantize):
+                g_idx = self.get_tensor(f"{prefix}.g_idx")
+                return repack_gptq_for_marlin(
+                    qweight=qweight,
+                    scales=scales,
+                    g_idx=g_idx,
+                    bits=gptq_params.bits,
+                    desc_act=gptq_params.desc_act,
+                    groupsize=gptq_params.groupsize,
+                    sym=gptq_params.sym,
+                    sharded_infeatures=False,
+                )
+
+            qzeros = self.get_packed_sharded(
+                f"{prefix}.qzeros", dim=1, block_sizes=block_sizes
+            )
+            if quantize == "gptq" and gptq_params.quant_method == "gptq":
+                g_idx = self.get_tensor(f"{prefix}.g_idx")
+            elif quantize == "gptq" and gptq_params.quant_method == "awq":
+                log_once(
+                    logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
+                )
+                from text_generation_server.layers.awq.conversion_utils import (
+                    fast_awq_to_gptq,
+                )
+
+                qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
+                g_idx = (
+                    torch.arange(
+                        qweight.shape[0] * (32 // gptq_params.bits),
+                        device=qweight.device,
+                    )
+                    // gptq_params.groupsize
+                ).to(dtype=torch.int32)
+            else:
+                g_idx = None
+
+            weight = GPTQWeight(
+                qweight=qweight,
+                qzeros=qzeros,
+                scales=scales,
+                g_idx=g_idx,
+                bits=gptq_params.bits,
+                groupsize=gptq_params.groupsize,
+                use_exllama=False,
+            )
+        elif quantize == "marlin":
+            from text_generation_server.layers.marlin import (
+                GPTQMarlin24Weight,
+                MarlinWeight,
+                repack_gptq_for_marlin,
+            )
+
+            is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24"
+            if is_marlin_24:
+                B = self.get_packed_sharded(
+                    f"{prefix}.B_24", dim=1, block_sizes=block_sizes
+                )
+                B_meta = self.get_packed_sharded(
+                    f"{prefix}.B_meta", dim=1, block_sizes=block_sizes
+                )
+                s = self.get_packed_sharded(
+                    f"{prefix}.s", dim=1, block_sizes=block_sizes
+                )
+
+                gptq_params = self._get_gptq_params()
+                weight = GPTQMarlin24Weight(
+                    B=B, B_meta=B_meta, s=s, bits=gptq_params.bits
+                )
+            else:
+                B = self.get_packed_sharded(
+                    f"{prefix}.B", dim=1, block_sizes=block_sizes
+                )
+                s = self.get_packed_sharded(
+                    f"{prefix}.s", dim=1, block_sizes=block_sizes
+                )
+                weight = MarlinWeight(B=B, s=s)
+        else:
+            weight = self.get_packed_sharded(
+                f"{prefix}.weight", dim=0, block_sizes=block_sizes
+            )
+        return weight
+
+    def get_weights_col(self, prefix: str, quantize: str):
+        if quantize == "exl2":
+            from text_generation_server.layers.exl2 import Exl2Weight
+
+            try:
+                q_weight = self.get_tensor(f"{prefix}.q_weight")
+            except RuntimeError:
+                raise RuntimeError(
+                    f"Cannot load `exl2`-quantized weight, make sure the model is already quantized."
+                )
+
+            q_scale = self.get_tensor(f"{prefix}.q_scale")
+            q_invperm = self.get_tensor(f"{prefix}.q_invperm")
+            q_scale_max = self.get_tensor(f"{prefix}.q_scale_max")
+            q_groups = self.get_tensor(f"{prefix}.q_groups")
+
+            return Exl2Weight(
+                q_weight=q_weight,
+                q_scale=q_scale,
+                q_invperm=q_invperm,
+                q_scale_max=q_scale_max,
+                q_groups=q_groups,
+            )
+
+        return self.get_multi_weights_col([prefix], quantize, 0)
+
     def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
-        if quantize == "gptq":
+        if quantize == "exl2":
+            raise ValueError("get_multi_weights_col is not supported for exl2")
+        elif quantize in ["gptq", "awq"]:
+            from text_generation_server.layers.gptq import GPTQWeight
+            from text_generation_server.layers.marlin import (
+                can_use_gptq_marlin,
+                repack_gptq_for_marlin,
+            )
+
             try:
                 qweight = torch.cat(
                     [self.get_sharded(f"{p}.qweight", dim=1) for p in prefixes], dim=1
                 )
             except RuntimeError:
                 raise RuntimeError(
-                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
+                    f"Cannot load `{quantize}` weight, make sure the model is already quantized"
+                )
+
+            scales = torch.cat(
+                [self.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
+            )
+
+            gptq_params = self._get_gptq_params()
+            if can_use_gptq_marlin(gptq_params, quantize):
+                w = [self.get_tensor(f"{p}.g_idx") for p in prefixes]
+                for w2 in w[1:]:
+                    torch.testing.assert_close(w2, w[0])
+                g_idx = w[0]
+
+                return repack_gptq_for_marlin(
+                    qweight=qweight,
+                    scales=scales,
+                    g_idx=g_idx,
+                    bits=gptq_params.bits,
+                    desc_act=gptq_params.desc_act,
+                    groupsize=gptq_params.groupsize,
+                    sym=gptq_params.sym,
+                    sharded_infeatures=False,
                 )
 
             qzeros = torch.cat(
                 [self.get_sharded(f"{p}.qzeros", dim=1) for p in prefixes], dim=1
             )
-            scales = torch.cat(
-                [self.get_sharded(f"{p}.scales", dim=1) for p in prefixes], dim=1
-            )
-            w = [self.get_tensor(f"{p}.g_idx") for p in prefixes]
-            for w2 in w[1:]:
-                torch.testing.assert_close(w2, w[0])
-            g_idx = w[0]
 
-            bits, groupsize = self._get_gptq_params()
-            weight = (qweight, qzeros, scales, g_idx, bits, groupsize, False)
+            from text_generation_server.layers.gptq import HAS_EXLLAMA
+
+            use_exllama = (
+                gptq_params.bits == 4
+                and HAS_EXLLAMA
+                and quantize == "gptq"
+                and not gptq_params.desc_act
+            )
+
+            if quantize == "gptq" and gptq_params.quant_method == "gptq":
+                w = [self.get_tensor(f"{p}.g_idx") for p in prefixes]
+                for w2 in w[1:]:
+                    torch.testing.assert_close(w2, w[0])
+                g_idx = w[0]
+            elif quantize == "gptq" and gptq_params.quant_method == "awq":
+                log_once(
+                    logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
+                )
+                from text_generation_server.layers.awq.conversion_utils import (
+                    fast_awq_to_gptq,
+                )
+
+                qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
+                if use_exllama:
+                    g_idx = None
+                else:
+                    g_idx = (
+                        torch.arange(
+                            qweight.shape[0] * (32 // gptq_params.bits),
+                            device=qweight.device,
+                        )
+                        // gptq_params.groupsize
+                    ).to(dtype=torch.int32)
+            else:
+                g_idx = None
+
+            weight = GPTQWeight(
+                qweight=qweight,
+                qzeros=qzeros,
+                scales=scales,
+                g_idx=g_idx,
+                bits=gptq_params.bits,
+                groupsize=gptq_params.groupsize,
+                use_exllama=use_exllama,
+            )
+        elif quantize == "marlin":
+            from text_generation_server.layers.gptq import GPTQWeight
+            from text_generation_server.layers.marlin import (
+                GPTQMarlin24Weight,
+                MarlinWeight,
+            )
+
+            is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24"
+            if is_marlin_24:
+                try:
+                    B = torch.cat(
+                        [self.get_sharded(f"{p}.B_24", dim=1) for p in prefixes], dim=1
+                    )
+                except RuntimeError:
+                    raise RuntimeError(
+                        f"Cannot load `{quantize}` weight, make sure the model is already quantized"
+                    )
+
+                B_meta = torch.cat(
+                    [self.get_sharded(f"{p}.B_meta", dim=1) for p in prefixes], dim=1
+                )
+
+                s = torch.cat(
+                    [self.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1
+                )
+
+                gptq_params = self._get_gptq_params()
+                weight = GPTQMarlin24Weight(
+                    B=B, B_meta=B_meta, s=s, bits=gptq_params.bits
+                )
+            else:
+                try:
+                    B = torch.cat(
+                        [self.get_sharded(f"{p}.B", dim=1) for p in prefixes], dim=1
+                    )
+                except RuntimeError:
+                    raise RuntimeError(
+                        f"Cannot load `{quantize}` weight, make sure the model is already quantized"
+                    )
+                s = torch.cat(
+                    [self.get_sharded(f"{p}.s", dim=1) for p in prefixes], dim=1
+                )
+
+                weight = MarlinWeight(B=B, s=s)
+
         else:
             w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
             weight = torch.cat(w, dim=dim)
+
         return weight
 
-    def get_multi_weights_row(self, prefix: str, quantize: str):
-        if quantize == "gptq":
-            use_exllama = True
-            bits, groupsize = self._get_gptq_params()
+    def get_tensor_shard(self, var, dim):
+        """Slice out this rank's block of `var` along `dim` (0 or 1), then cast and move it."""
+        group = self.process_group
+        shard_len = var.size()[dim] // group.size()
+        lo = group.rank() * shard_len
+        hi = lo + shard_len
+        if dim == 0:
+            shard = var[lo:hi]
+        elif dim == 1:
+            shard = var[:, lo:hi]
+        else:
+            raise NotImplementedError("Let's make that generic when needed")
+        # Same order as before: convert to self.dtype first, then move to self.device.
+        shard = shard.to(dtype=self.dtype)
+        return shard.to(device=self.device)
 
-            if bits != 4:
+    def get_multi_weights_row(self, prefix: str, quantize: str):  # weight stored under `prefix`, wrapped according to `quantize`
+        if quantize == "exl2":
+            from text_generation_server.layers.exl2 import Exl2Weight
+            # All exl2 tensors are read with get_tensor (no sharding calls here).
+            try:
+                q_weight = self.get_tensor(f"{prefix}.q_weight")
+            except RuntimeError:
+                raise RuntimeError(
+                    f"Cannot load `exl2`-quantized weight, make sure the model is already quantized."
+                )
+
+            q_scale = self.get_tensor(f"{prefix}.q_scale")
+            q_invperm = self.get_tensor(f"{prefix}.q_invperm")
+            q_scale_max = self.get_tensor(f"{prefix}.q_scale_max")
+            q_groups = self.get_tensor(f"{prefix}.q_groups")
+
+            return Exl2Weight(
+                q_weight=q_weight,
+                q_scale=q_scale,
+                q_invperm=q_invperm,
+                q_scale_max=q_scale_max,
+                q_groups=q_groups,
+            )
+
+        elif quantize == "gptq":
+            from text_generation_server.layers.marlin import (
+                can_use_gptq_marlin,
+                repack_gptq_for_marlin,
+            )
+            # First choice: repack for the GPTQ-Marlin kernels when can_use_gptq_marlin agrees.
+            gptq_params = self._get_gptq_params()
+            if can_use_gptq_marlin(gptq_params, quantize):
+                log_once(logger.info, "Using GPTQ-Marlin kernels")
+                try:
+                    qweight = self.get_sharded(f"{prefix}.qweight", dim=0)
+                except RuntimeError:
+                    raise RuntimeError(
+                        f"Cannot load `{quantize}` weight for GPTQ -> Marlin repacking, make sure the model is already quantized"
+                    )
+
+                g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
+                if gptq_params.desc_act or gptq_params.groupsize == -1:
+                    scales = self.get_tensor(f"{prefix}.scales")
+                else:
+                    scales = self.get_sharded(f"{prefix}.scales", dim=0)
+                # sharded_infeatures is True as soon as more than one rank is in the group.
+                sharded_in_features = self.process_group.size() > 1
+
+                return repack_gptq_for_marlin(
+                    qweight=qweight,
+                    scales=scales,
+                    g_idx=g_idx,
+                    bits=gptq_params.bits,
+                    desc_act=gptq_params.desc_act,
+                    groupsize=gptq_params.groupsize,
+                    sym=gptq_params.sym,
+                    sharded_infeatures=sharded_in_features,
+                )
+            # No Marlin repacking: decide whether exllama can be used (4-bit and no desc_act).
+            use_exllama = True
+            if gptq_params.bits != 4:
                 use_exllama = False
 
+            if gptq_params.desc_act:
+                log_once(logger.warning, "Disabling exllama because desc_act=True")
+                use_exllama = False
+
+            try:
+                qweight = self.get_sharded(f"{prefix}.qweight", dim=0)
+            except RuntimeError:
+                raise RuntimeError(
+                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
+                )
+
+            if gptq_params.quant_method == "gptq":
+                g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
+            elif gptq_params.quant_method == "awq":
+                g_idx = None
+            # With several ranks, exllama is dropped below when g_idx is not the plain i // groupsize ordering.
             if self.process_group.size() > 1:
-                g_idx = self.get_tensor(f"{prefix}.g_idx")
                 if g_idx is not None:
                     if (
                         not torch.equal(
                             g_idx.cpu(),
                             torch.tensor(
-                                [i // groupsize for i in range(g_idx.shape[0])],
+                                [
+                                    i // gptq_params.groupsize
+                                    for i in range(g_idx.shape[0])
+                                ],
                                 dtype=torch.int32,
                             ),
                         )
@@ -163,74 +585,245 @@ class Weights:
                         # it would require to reorder input activations that are split unto several GPUs
                         use_exllama = False
 
+            from text_generation_server.layers.gptq import (
+                HAS_EXLLAMA,
+                CAN_EXLLAMA,
+                GPTQWeight,
+            )
+
+            if use_exllama:
+                if not HAS_EXLLAMA:
+                    if CAN_EXLLAMA:
+                        log_once(
+                            logger.warning,
+                            "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True",
+                        )
+                    use_exllama = False
+                else:
+                    log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}")
+            # exllama with real groups (groupsize != -1) shards qzeros/scales; otherwise get_tensor is used.
+            if use_exllama and gptq_params.groupsize != -1:
+                qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
+                scales = self.get_sharded(f"{prefix}.scales", dim=0)
+            else:
+                qzeros = self.get_tensor(f"{prefix}.qzeros")
+                scales = self.get_tensor(f"{prefix}.scales")
+            # Shift g_idx so that its first entry becomes 0.
+            if use_exllama and g_idx is not None:
+                g_idx = g_idx - g_idx[0]
+
+            if gptq_params.quant_method == "awq":
+                log_once(
+                    logger.info, "Converting AWQ model to Exllama/GPTQ packing format."
+                )
+                from text_generation_server.layers.awq.conversion_utils import (
+                    fast_awq_to_gptq,
+                )
+
+                qweight, qzeros = fast_awq_to_gptq(qweight, qzeros)
+                if use_exllama:
+                    g_idx = None
+                else:
+                    g_idx = (
+                        torch.arange(
+                            qweight.shape[0] * (32 // gptq_params.bits),
+                            device=qweight.device,
+                        )
+                        // gptq_params.groupsize
+                    ).to(dtype=torch.int32)
+
+            weight = GPTQWeight(
+                qweight=qweight,
+                qzeros=qzeros,
+                scales=scales,
+                g_idx=g_idx,
+                bits=gptq_params.bits,
+                groupsize=gptq_params.groupsize,
+                use_exllama=use_exllama,
+            )
+        elif quantize == "awq":
+            from text_generation_server.layers.gptq import GPTQWeight
+            # AWQ path: qweight/qzeros/scales are sharded on dim 0; no g_idx, no exllama.
+            gptq_params = self._get_gptq_params()
+
             try:
                 qweight = self.get_sharded(f"{prefix}.qweight", dim=0)
             except RuntimeError:
                 raise RuntimeError(
-                    "Cannot load `gptq` weight, make sure the model is already quantized, or quantize it with `text-generation-server quantize ORIGINAL_MODEL_ID NEW_MODEL_ID`"
+                    "Cannot load `awq` weight, make sure the model is already quantized"
                 )
 
-            from text_generation_server.utils.layers import HAS_EXLLAMA
+            qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
+            scales = self.get_sharded(f"{prefix}.scales", dim=0)
+            g_idx = None
+            use_exllama = False
 
-            if use_exllama:
-                if not HAS_EXLLAMA:
-                    logger.warning(
-                        "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
+            weight = GPTQWeight(
+                qweight=qweight,
+                qzeros=qzeros,
+                scales=scales,
+                g_idx=g_idx,
+                bits=gptq_params.bits,
+                groupsize=gptq_params.groupsize,
+                use_exllama=use_exllama,
+            )
+        elif quantize == "marlin":
+            from text_generation_server.layers.gptq import GPTQWeight
+            from text_generation_server.layers.marlin import (
+                GPTQMarlin24Weight,
+                MarlinWeight,
+            )
+            # checkpoint_format "marlin_24" selects the B_24/B_meta tensors and GPTQMarlin24Weight.
+            is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24"
+            if is_marlin_24:
+                try:
+                    B = self.get_sharded(f"{prefix}.B_24", dim=0)
+                except RuntimeError:
+                    raise RuntimeError(
+                        "Cannot load `marlin` 2:4 sparsity weight, make sure the model is already quantized."
                     )
-                    use_exllama = False
-                else:
-                    logger.info("Using exllama kernels")
 
-            if use_exllama:
-                if groupsize >= 0:
-                    # Exllama reorders the weights in advance and the activations on the fly, thus
-                    # the scales and zero-points do not need to be reordered.
-                    qzeros = self.get_sharded(f"{prefix}.qzeros", dim=0)
-                    scales = self.get_sharded(f"{prefix}.scales", dim=0)
+                B_meta = self.get_sharded(f"{prefix}.B_meta", dim=0)
+                num_groups = self._get_slice(f"{prefix}.s").get_shape()[0]
+                if num_groups == 1:
+                    # The number of groups is 1 when groupsize == -1. share
+                    # scales between all shards in this case.
+                    s = self.get_tensor(f"{prefix}.s")
                 else:
-                    qzeros = self.get_tensor(f"{prefix}.qzeros")
-                    scales = self.get_tensor(f"{prefix}.scales")
+                    s = self.get_sharded(f"{prefix}.s", dim=0)
 
-                # For tp > 1, at this point we know we do not use act-order
-                if self.process_group.size() == 1:
-                    g_idx = self.get_tensor(f"{prefix}.g_idx")
-                else:
-                    g_idx = None
+                gptq_params = self._get_gptq_params()
+                weight = GPTQMarlin24Weight(
+                    B=B, B_meta=B_meta, s=s, bits=gptq_params.bits
+                )
             else:
-                # The triton kernel reorders the scales/zero points instead of the weight/activation.
-                # Thus, each rank needs the full qzeros/scales.
-                qzeros = self.get_tensor(f"{prefix}.qzeros")
-                scales = self.get_tensor(f"{prefix}.scales")
-                g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
+                try:
+                    B = self.get_sharded(f"{prefix}.B", dim=0)
+                except RuntimeError:
+                    raise RuntimeError(
+                        "Cannot load `marlin` weight, make sure the model is already quantized."
+                    )
 
-            weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
+                num_groups = self._get_slice(f"{prefix}.s").get_shape()[0]
+                if num_groups == 1:
+                    # The number of groups is 1 when groupsize == -1. share
+                    # scales between all shards in this case.
+                    s = self.get_tensor(f"{prefix}.s")
+                else:
+                    s = self.get_sharded(f"{prefix}.s", dim=0)
+                weight = MarlinWeight(B=B, s=s)
         else:
             weight = self.get_sharded(f"{prefix}.weight", dim=1)
         return weight
 
-    def _get_gptq_params(self) -> Tuple[int, int]:
+    def _get_gptq_params(self) -> GPTQParams:  # read params from checkpoint tensors, else from attributes set by _set_gptq_params
         try:
             bits = self.get_tensor("gptq_bits").item()
             groupsize = self.get_tensor("gptq_groupsize").item()
+            checkpoint_format = getattr(self, "gptq_checkpoint_format", None)
+            desc_act = False
+            sym = False
+            quant_method = "gptq"
         except (SafetensorError, RuntimeError) as e:
             try:
                 bits = self.gptq_bits
                 groupsize = self.gptq_groupsize
+                checkpoint_format = getattr(self, "gptq_checkpoint_format", None)
+                desc_act = getattr(self, "gptq_desc_act", False)
+                quant_method = getattr(self, "quant_method", "gptq")
+                sym = getattr(self, "gptq_sym", True)  # stored as `gptq_sym` by _set_gptq_params; `sym` was never set
             except Exception:
                 raise e
 
-        return bits, groupsize
+        return GPTQParams(
+            bits=bits,
+            checkpoint_format=checkpoint_format,
+            desc_act=desc_act,
+            groupsize=groupsize,
+            quant_method=quant_method,
+            sym=sym,
+        )
 
-    def _set_gptq_params(self, model_id):
-        filename = "quantize_config.json"
+    def _set_gptq_params(self, model_id, revision):  # best effort: config.json, then quantize_config.json, then quant_config.json
+        filename = "config.json"
         try:
             if os.path.exists(os.path.join(model_id, filename)):
                 filename = os.path.join(model_id, filename)
             else:
-                filename = hf_hub_download(model_id, filename=filename)
+                filename = hf_hub_download(
+                    model_id, filename=filename, revision=revision
+                )
             with open(filename, "r") as f:
                 data = json.load(f)
-            self.gptq_bits = data["bits"]
-            self.gptq_groupsize = data["group_size"]
+            self.gptq_bits = data["quantization_config"]["bits"]
+            self.gptq_groupsize = data["quantization_config"]["group_size"]
+            # Order is important here, desc_act is missing on some real models
+            self.quant_method = data["quantization_config"]["quant_method"]
+            self.gptq_checkpoint_format = data["quantization_config"].get(
+                "checkpoint_format"
+            )
+            self.gptq_sym = data["quantization_config"]["sym"]
+            self.gptq_desc_act = data["quantization_config"]["desc_act"]
         except Exception:
-            pass
+            filename = "quantize_config.json"
+            try:
+                if os.path.exists(os.path.join(model_id, filename)):
+                    filename = os.path.join(model_id, filename)
+                else:
+                    filename = hf_hub_download(
+                        model_id, filename=filename, revision=revision
+                    )
+                with open(filename, "r") as f:
+                    data = json.load(f)
+                self.gptq_bits = data["bits"]
+                self.gptq_groupsize = data["group_size"]
+                self.gptq_sym = data["sym"]
+                self.gptq_desc_act = data["desc_act"]
+                if "version" in data and data["version"] == "GEMM":
+                    self.quant_method = "awq"
+            except Exception:
+                filename = "quant_config.json"
+                try:
+                    if os.path.exists(os.path.join(model_id, filename)):
+                        filename = os.path.join(model_id, filename)
+                    else:
+                        filename = hf_hub_download(
+                            model_id, filename=filename, revision=revision
+                        )
+                    with open(filename, "r") as f:
+                        data = json.load(f)
+                    self.gptq_bits = data["w_bit"]
+                    self.gptq_groupsize = data["q_group_size"]
+                    self.gptq_desc_act = data.get("desc_act", False)  # optional here, so the version check below still runs
+                    if "version" in data and data["version"] == "GEMM":
+                        self.quant_method = "awq"
+                except Exception:
+                    pass
+
+
+def _blocks_to_block_sizes(total_size: int, blocks: Union[int, List[int]]) -> List[int]:
+    """
+    Turn a block specification into concrete block sizes.
+
+    `blocks` is either
+
+    - an int: the number of equally sized blocks, each of size
+      total_size // blocks; or
+    - a list of ints: the relative proportions of the blocks.
+
+    For a list, total_size is cut into sum(blocks) equal parts and block i
+    receives blocks[i] of them, so the ratios are kept. For example,
+    blocks=[2, 1, 1] with total_size=1024 yields [512, 256, 256].
+    An AssertionError is raised when total_size does not divide evenly.
+    """
+    if not isinstance(blocks, list):
+        assert total_size % blocks == 0, f"Prepacked is not divisible by {blocks}"
+        return [total_size // blocks] * blocks
+
+    # Proportional split: one "unit" per requested part.
+    n_parts = sum(blocks)
+    assert (
+        total_size % n_parts == 0
+    ), f"Cannot split {total_size} in proportional blocks: {blocks}"
+    unit = total_size // n_parts
+    return [unit * share for share in blocks]
diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh
new file mode 100755
index 00000000..ea94dcd9
--- /dev/null
+++ b/tgi-entrypoint.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Refresh the dynamic linker cache (best effort), then start the launcher.
+ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'
+# Quote "$@" so arguments containing spaces or glob characters are passed through unchanged.
+text-generation-launcher "$@"
diff --git a/update_doc.py b/update_doc.py
new file mode 100644
index 00000000..1ff94a2c
--- /dev/null
+++ b/update_doc.py
@@ -0,0 +1,186 @@
+import subprocess
+import argparse
+import ast
+import json
+import os
+
+TEMPLATE = """
+# Supported Models and Hardware
+
+Text Generation Inference enables serving optimized models on specific hardware for the highest performance. The following sections list which models are hardware are supported.
+
+## Supported Models
+
+SUPPORTED_MODELS
+
+If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:
+
+```python
+# for causal LMs/text-generation models
+AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")`
+# or, for text-to-text generation models
+AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")
+```
+
+If you wish to serve a supported model that already exists on a local folder, just point to the local folder.
+
+```bash
+text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
+```
+"""
+
+
+def check_cli(check: bool):
+    """Regenerate (or, with `check`, verify) docs/source/basic_tutorials/launcher.md from `--help` output."""
+    output = subprocess.check_output(["text-generation-launcher", "--help"]).decode(
+        "utf-8"
+    )
+
+    wrap_code_blocks_flag = "<!-- WRAP CODE BLOCKS -->"
+    final_doc = f"# Text-generation-launcher arguments\n\n{wrap_code_blocks_flag}\n\n"
+
+    header = ""
+    block = []
+    for line in output.split("\n"):
+        # An option line opens a new section: flush the lines gathered so far.
+        if line.startswith("  -") or line.startswith("      -"):
+            rendered_block = "\n".join(block)
+            if header:
+                final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
+            else:
+                final_doc += f"```shell\n{rendered_block}\n```\n"
+            block = []
+            tokens = line.split("<")
+            if len(tokens) > 1:
+                header = tokens[-1][:-1]
+            else:
+                header = line.split("--")[-1]
+            header = header.upper().replace("-", "_")
+
+        block.append(line)
+
+    rendered_block = "\n".join(block)
+    final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
+
+    filename = "docs/source/basic_tutorials/launcher.md"
+    if check:
+        with open(filename, "r") as f:
+            doc = f.read()
+        if doc != final_doc:
+            tmp = "launcher.md"
+            with open(tmp, "w") as g:
+                g.write(final_doc)
+            diff = subprocess.run(
+                ["diff", tmp, filename], capture_output=True
+            ).stdout.decode("utf-8")
+            os.remove(tmp)  # the scratch copy is only needed to produce the diff
+            print(diff)
+            raise Exception(
+                "Cli arguments Doc is not up-to-date, run `python update_doc.py` in order to update it"
+            )
+    else:
+        with open(filename, "w") as f:
+            f.write(final_doc)
+
+
+def check_supported_models(check: bool):
+    """Regenerate (or, with `check`, verify) docs/source/supported_models.md from the ModelType enum."""
+    filename = "server/text_generation_server/models/__init__.py"
+    with open(filename, "r") as f:
+        tree = ast.parse(f.read())
+    enum_def = [
+        x for x in tree.body if isinstance(x, ast.ClassDef) and x.name == "ModelType"
+    ][0]
+    # Execute just the enum definition (plus `import enum`) to obtain ModelType.
+    _locals = {}
+    exec(f"import enum\n{ast.unparse(enum_def)}", {}, _locals)
+    ModelType = _locals["ModelType"]
+    list_string = ""
+    for data in ModelType:
+        list_string += f"- [{data.value['name']}]({data.value['url']})"
+        if data.value.get("multimodal", None):
+            list_string += " (Multimodal)"
+        list_string += "\n"
+
+    final_doc = TEMPLATE.replace("SUPPORTED_MODELS", list_string)
+    filename = "docs/source/supported_models.md"
+    if check:
+        with open(filename, "r") as f:
+            doc = f.read()
+        if doc != final_doc:
+            tmp = "supported.md"
+            with open(tmp, "w") as g:
+                g.write(final_doc)
+            diff = subprocess.run(
+                ["diff", tmp, filename], capture_output=True
+            ).stdout.decode("utf-8")
+            os.remove(tmp)  # the scratch copy is only needed to produce the diff
+            print(diff)
+            raise Exception(
+                "Supported models is not up-to-date, run `python update_doc.py` in order to update it"
+            )
+    else:
+        with open(filename, "w") as f:
+            f.write(final_doc)
+
+
+def get_openapi_schema():
+    """Return the parsed output of `text-generation-router print-schema`; exit with status 1 on failure."""
+    cmd = ["text-generation-router", "print-schema"]
+    try:
+        return json.loads(subprocess.check_output(cmd))
+    except subprocess.CalledProcessError as err:
+        print(f"Error running text-generation-router print-schema: {err}")
+    except json.JSONDecodeError:
+        print("Error: Invalid JSON received from text-generation-router print-schema")
+    raise SystemExit(1)
+
+
+def check_openapi(check: bool):
+    """
+    Dump the router's current OpenAPI schema and either compare it with or
+    write it to docs/openapi.json. Returns True; raises Exception if stale.
+    """
+    target = "docs/openapi.json"
+    scratch = "openapi_tmp.json"
+
+    schema = get_openapi_schema()
+    with open(scratch, "w") as out:
+        json.dump(schema, out, indent=2)
+
+    if not check:
+        # Update mode: the freshly dumped file becomes the documentation.
+        os.rename(scratch, target)
+        print("OpenAPI documentation updated.")
+        return True
+
+    # Check mode: diff the fresh dump against the committed file.
+    # Trailing whitespace is ignored: it is not significant and the
+    # pre-commit hook strips it from the committed file anyway.
+    diff_cmd = ["diff", "--ignore-trailing-space", scratch, target]
+    result = subprocess.run(diff_cmd, capture_output=True)
+    diff = result.stdout.decode()
+    os.remove(scratch)
+
+    if diff:
+        print(diff)
+        raise Exception(
+            "OpenAPI documentation is not up-to-date, run `python update_doc.py` in order to update it"
+        )
+
+    return True
+
+
+def main():
+    """Parse the `--check` flag and run every documentation generator/checker in order."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--check", action="store_true")
+    check = parser.parse_args().check
+
+    # Same order as before: launcher CLI docs, supported models, OpenAPI schema.
+    for step in (check_cli, check_supported_models, check_openapi):
+        step(check)
+
+
+if __name__ == "__main__":
+    main()