Merge pull request #134 from kdamaszk/rebase_tgi_2.0

Rebase with TGI v2.0
regisss 2024-05-06 09:28:16 +02:00 committed by GitHub
commit 81182bed76
272 changed files with 86587 additions and 4411 deletions


@ -5,14 +5,14 @@ body:
id: system-info
attributes:
label: System Info
description: |
Please share your system info with us (`text-generation-launcher --env` if installed locally).
The full command line used that causes issues:
OS version:
Rust version (if self-compiling, `cargo version`):
Model being used (`curl 127.0.0.1:8080/info | jq`):
If local model please explicit the kind of model and/or equivalents.
Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
The current version being used:
@ -52,11 +52,11 @@ body:
placeholder: |
Steps to reproduce the behavior:
1.
2.
3.
- type: textarea
id: expected-behavior


@ -19,7 +19,7 @@ body:
label: Motivation
description: |
Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
- type: textarea
id: contribution


@ -6,15 +6,15 @@ on:
jobs:
update_docs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Install Launcher
id: install-launcher
run: cargo install --git https://github.com/${{ github.repository }} --branch ${{ github.head_ref }} text-generation-launcher
- name: Check launcher Docs are up-to-date
run: |
echo text-generation-launcher --help


@ -146,11 +146,50 @@ jobs:
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
integration-tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs:
- start-runner
- build-and-push-image # Wait for the docker image to be built
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
env:
DOCKER_VOLUME: /cache
steps:
- uses: actions/checkout@v2
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4.4.1
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Tailscale
uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
with:
authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
- name: Prepare disks
run: |
sudo mkfs -t ext4 /dev/nvme1n1
sudo mkdir ${{ env.DOCKER_VOLUME }}
sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
- name: Install
run: |
make install-integration-tests
- name: Run tests
run: |
export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
pytest -s -vv integration-tests
build-and-push-image-rocm:
concurrency:
group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs: start-runner # required to start the main job when the runner is ready
needs:
- start-runner
- build-and-push-image # Wait for the main docker image to be built
- integration-tests # Wait for the main integration-tests
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
permissions:
contents: write
@ -235,43 +274,6 @@ jobs:
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
integration-tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
needs:
- start-runner
- build-and-push-image # Wait for the docker image to be built
- build-and-push-image-rocm
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
env:
DOCKER_VOLUME: /cache
steps:
- uses: actions/checkout@v2
- name: Inject slug/short variables
uses: rlespinasse/github-slug-action@v4.4.1
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Tailscale
uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
with:
authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
- name: Prepare disks
run: |
sudo mkfs -t ext4 /dev/nvme1n1
sudo mkdir ${{ env.DOCKER_VOLUME }}
sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
- name: Install
run: |
make install-integration-tests
- name: Run tests
run: |
export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
pytest -s -vv integration-tests
stop-runner:
name: Stop self-hosted EC2 runner
needs:


@ -16,4 +16,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: text-generation-inference
additional_args: --not_python_module


@ -1,12 +0,0 @@
name: Delete doc comment
on:
pull_request:
types: [ closed ]
jobs:
delete:
uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
with:
pr_number: ${{ github.event.number }}

.github/workflows/stale.yml (new file)

@ -0,0 +1,14 @@
name: 'Close stale issues and PRs'
on:
schedule:
- cron: '30 1 * * *'
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v8
with:
stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
days-before-stale: 30
days-before-close: 5


@ -33,11 +33,18 @@ jobs:
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
toolchain: 1.71.0
# Released on: 28 December, 2023
# Branched from master on: 10 November, 2023
# https://releases.rs/docs/1.75.0/
toolchain: 1.75.0
override: true
components: rustfmt, clippy
- name: Install Protoc
uses: arduino/setup-protoc@v1
- name: Clean unused files
run: |
sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
- name: Install sccache
run: |
curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache
@ -68,12 +75,11 @@ jobs:
pip install pytest
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
pytest -s -vv server/tests
- name: Run Rust fmt
- name: Pre-commit checks
run: |
cargo fmt --check
- name: Run Rust clippy
run: |
cargo clippy
pip install pre-commit
pre-commit install
pre-commit run --all-files
- name: Run Rust tests
run: |
cargo test


@ -13,4 +13,4 @@ jobs:
package_name: text-generation-inference
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}

.gitignore

@ -2,3 +2,12 @@
target
router/tokenizer.json
*__pycache__*
# ROCm auto-generated files
*.hip
server/exllamav2_kernels/exllamav2_kernels/hip/
server/exllama_kernels/exllama_kernels/hip/
server/exllama_kernels/exllama_kernels/hip_func/
*_hip.cuh
server/exllama_kernels/exllama_kernels/hip_buffers.cuh
server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp

.pre-commit-config.yaml (new file)

@ -0,0 +1,18 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
exclude: docs/source/basic_tutorials/launcher.md
- repo: https://github.com/psf/black
rev: 24.2.0
hooks:
- id: black
- repo: https://github.com/doublify/pre-commit-rust
rev: v1.0
hooks:
- id: fmt
- id: cargo-check
- id: clippy

Cargo.lock (generated file; diff suppressed because it is too large)


@ -9,7 +9,7 @@ members = [
resolver = "2"
[workspace.package]
version = "1.2.0"
version = "2.0.0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"
@ -17,5 +17,7 @@ homepage = "https://github.com/huggingface/text-generation-inference"
[profile.release]
debug = 1
incremental = true
lto = "off"
lto = "fat"
opt-level = 3
codegen-units = 1
panic = "abort"


@ -31,7 +31,7 @@ COPY launcher launcher
RUN cargo build --release
# Text Generation Inference base image
FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest as base
FROM vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest as base
# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
@ -58,8 +58,8 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements.txt && \
bash ./dill-0.3.7-patch.sh && \
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0 && \
bash ./dill-0.3.8-patch.sh && \
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0 && \
pip install . --no-cache-dir
# Install benchmarker
@ -72,5 +72,7 @@ COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/
# Final image
FROM base
ENTRYPOINT ["text-generation-launcher"]
COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
ENTRYPOINT ["/tgi-entrypoint.sh"]
CMD ["--json-output"]


@ -1,5 +1,5 @@
# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
WORKDIR /usr/src
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@ -36,7 +36,7 @@ COPY launcher launcher
RUN cargo build --release
# Text Generation Inference base image for RoCm
FROM rocm/dev-ubuntu-20.04:5.7 as base
FROM rocm/dev-ubuntu-22.04:5.7 as base
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
@ -75,8 +75,8 @@ RUN chmod +x ~/mambaforge.sh && \
mamba init && \
rm ~/mambaforge.sh
# Install PyTorch nightly (2.2.0.dev2023) compiled against RoCm 5.7, as VLLM can not be compiled with RoCm 5.6.
RUN pip install --pre torch==2.2.0.dev20231106 --index-url https://download.pytorch.org/whl/nightly/rocm5.7
# Install PyTorch 2.2 RC compiled against RoCm 5.7, as VLLM can not be compiled with RoCm 5.6.
RUN pip install torch --index-url https://download.pytorch.org/whl/test/rocm5.7/
FROM base AS kernel-builder
@ -104,6 +104,20 @@ WORKDIR /usr/src
COPY server/custom_kernels/ .
RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build
# Build exllama kernels
FROM kernel-builder as exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .
RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build
# Build exllama v2 kernels
FROM kernel-builder as exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ .
RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build
FROM base as base-copy
# Text Generation Inference base env
@ -120,6 +134,12 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Install flash-attention dependencies
RUN pip install einops --no-cache-dir
@ -130,7 +150,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_rocm.txt && \
pip install ".[accelerate, peft]" --no-cache-dir
pip install ".[accelerate, peft, outlines]" --no-cache-dir
# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark

LICENSE

@ -1,181 +1,201 @@
Hugging Face Optimized Inference License 1.0 (HFOILv1.0)
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
This License Agreement governs the use of the Software and its Modifications. It is a
binding agreement between the Licensor and You.
1. Definitions.
This License Agreement shall be referred to as Hugging Face Optimized Inference License
1.0 or HFOILv1.0. We may publish revised versions of this License Agreement from time to
time. Each version will be given a distinguished number.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
By downloading, accessing, modifying, distributing or otherwise using the Software, You
consent to all of the terms and conditions below. So, if You do not agree with those,
please do not download, access, modify, distribute, or use the Software.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
1. PERMISSIONS
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
You may use, modify and distribute the Software pursuant to the following terms and
conditions:
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
Copyright License. Subject to the terms and conditions of this License Agreement and where
and as applicable, each Contributor hereby grants You a perpetual, worldwide,
non-exclusive, royalty-free, copyright license to reproduce, prepare, publicly display,
publicly perform, sublicense under the terms herein, and distribute the Software and
Modifications of the Software.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
Patent License. Subject to the terms and conditions of this License Agreement and where
and as applicable, each Contributor hereby grants You a perpetual, worldwide,
non-exclusive, royalty-free patent license to make, have made, Use, offer to sell, sell,
import, and otherwise transfer the Software, where such license applies only to those
patent claims licensable by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s) with the Software to
which such Contribution(s) was submitted. If You institute patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Software
or a Contribution incorporated within the Software constitutes direct or contributory
patent infringement, then any rights granted to You under this License Agreement for the
Software shall terminate as of the date such litigation is filed.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
No other rights. All rights not expressly granted herein are retained.
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
2. RESTRICTIONS
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
You may not distribute the Software as a hosted or managed, and paid service, where the
service grants users access to any substantial set of the features or functionality of the
Software. If you wish to do so, You will need to be granted additional rights from the
Licensor which will be subject to a separate mutually agreed agreement.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
You may not sublicense the Software under any other terms than those listed in this
License.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
3. OBLIGATIONS
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
When You modify the Software, You agree to: - attach a notice stating the Modifications of
the Software You made; and - attach a notice stating that the Modifications of the
Software are released under this License Agreement.
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
When You distribute the Software or Modifications of the Software, You agree to: - give
any recipients of the Software a copy of this License Agreement; - retain all Explanatory
Documentation; and if sharing the Modifications of the Software, add Explanatory
Documentation documenting the changes made to create the Modifications of the Software; -
retain all copyright, patent, trademark and attribution notices.
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
4. MISCELLANEOUS
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
Termination. Licensor reserves the right to restrict Use of the Software in violation of
this License Agreement, upon which Your licenses will automatically terminate.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
Contributions. Unless You explicitly state otherwise, any Contribution intentionally
submitted for inclusion in the Software by You to the Licensor shall be under the terms
and conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of any
separate license agreement you may have executed with Licensor regarding such
Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
Trademarks and related. Nothing in this License Agreement permits You (i) to make Use of
Licensors trademarks, trade names, or logos, (ii) otherwise suggest endorsement by
Licensor, or (iii) misrepresent the relationship between the parties; and any rights not
expressly granted herein are reserved by the Licensors.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
Output You generate. Licensor claims no rights in the Output. You agree not to contravene
any provision as stated in the License Agreement with your Use of the Output.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
Disclaimer of Warranty. Except as expressly provided otherwise herein, and to the fullest
extent permitted by law, Licensor provides the Software (and each Contributor provides its
Contributions) AS IS, and Licensor disclaims all warranties or guarantees of any kind,
express or implied, whether arising under any law or from any usage in trade, or otherwise
including but not limited to the implied warranties of merchantability, non-infringement,
quiet enjoyment, fitness for a particular purpose, or otherwise. You are solely
responsible for determining the appropriateness of the Software and Modifications of the
Software for your purposes (including your use or distribution of the Software and
Modifications of the Software), and assume any risks associated with Your exercise of
permissions under this License Agreement.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
Limitation of Liability. In no event and under no legal theory, whether in tort (including
negligence), contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to
You for damages, including any direct, indirect, special, incidental, or consequential
damages of any character arising as a result of this License Agreement or out of the Use
or inability to Use the Software (including but not limited to damages for loss of
goodwill, work stoppage, computer failure or malfunction, model failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has been advised
of the possibility of such damages.
END OF TERMS AND CONDITIONS
Accepting Warranty or Additional Liability. While sharing the Software or Modifications of
the Software thereof, You may choose to offer and charge a fee for, acceptance of support,
warranty, indemnity, or other liability obligations and/or rights consistent with this
License Agreement. However, in accepting such obligations, You may act only on Your own
behalf and on Your sole responsibility, not on behalf of Licensor or any other
Contributor, and you hereby agree to indemnify, defend, and hold Licensor and each other
Contributor (and their successors or assigns) harmless for any liability incurred by, or
claims asserted against, such Licensor or Contributor (and their successors or assigns) by
reason of your accepting any such warranty or additional liability.
APPENDIX: How to apply the Apache License to your work.
Severability. This License Agreement is a license of copyright and patent rights and an
agreement in contract between You and the Licensor. If any provision of this License
Agreement is held to be invalid, illegal or unenforceable, the remaining provisions shall
be unaffected thereby and remain valid as if such provision had not been set forth herein.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2022 Hugging Face
5. DEFINITIONS
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
“Contribution” refers to any work of authorship, including the original version of the
Software and any Modifications of the Software that is intentionally submitted to Licensor
for inclusion in the Software by the copyright owner or by an individual or entity
authorized to submit on behalf of the copyright owner. For the purposes of this
definition, “submitted” means any form of electronic, verbal, or written communication
sent to the Licensor or its representatives, including but not limited to communication on
electronic mailing lists, source code control systems, and issue tracking systems that are
managed by, or on behalf of, the Licensor for the purpose of discussing and improving the
Software, but excluding communication that is conspicuously marked or otherwise designated
in writing by the copyright owner as “Not a Contribution.”
http://www.apache.org/licenses/LICENSE-2.0
“Contributor” refers to Licensor and any individual or entity on behalf of whom a
Contribution has been received by Licensor and subsequently incorporated within the
Software.
“Data” refers to a collection of information extracted from the dataset used with the
Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not
licensed under this License Agreement.
“Explanatory Documentation” refers to any documentation or related information including
but not limited to model cards or data cards dedicated to inform the public about the
characteristics of the Software. Explanatory documentation is not licensed under this
License.
"License Agreement" refers to these terms and conditions.
“Licensor” refers to the rights owners or entity authorized by the rights owners that are
granting the terms and conditions of this License Agreement.
“Model” refers to machine-learning based assemblies (including checkpoints), consisting of
learnt weights and parameters (including optimizer states), corresponding to a model
architecture as embodied in Software source code. Source code is not licensed under this
License Agreement.
“Modifications of the Software” refers to all changes to the Software, including without
limitation derivative works of the Software.
“Output” refers to the results of operating the Software.
“Share” refers to any transmission, reproduction, publication or other sharing of the
Software or Modifications of the Software to a third party, including providing the
Softwaire as a hosted service made available by electronic or other remote means,
including - but not limited to - API-based or web access.
“Software” refers to the software and Model (or parts of either) that Licensor makes
available under this License Agreement.
“Third Parties” refers to individuals or legal entities that are not under common control
with Licensor or You.
“Use” refers to anything You or your representatives do with the Software, including but
not limited to generating any Output, fine tuning, updating, running, training, evaluating
and/or reparametrizing the Model.
"You" (or "Your") refers to an individual or Legal Entity exercising permissions granted
by this License Agreement and/or making Use of the Software for whichever purpose and in
any field of Use.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -44,7 +44,7 @@ To use [🤗 text-generation-inference](https://github.com/huggingface/text-gene
model=meta-llama/Llama-2-7b-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run -p 8080:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:1.2.1 --model-id $model
docker run -p 8080:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:1.2.1 --model-id $model --max-input-length 1024 --max-total-tokens 2048
```
> For gated models such as [LLama](https://huggingface.co/meta-llama) or [StarCoder](https://huggingface.co/bigcode/starcoder), you will have to pass `-e HUGGING_FACE_HUB_TOKEN=<token>` to the `docker run` command above with a valid Hugging Face Hub read token.
@ -53,7 +53,7 @@ To use [🤗 text-generation-inference](https://github.com/huggingface/text-gene
model=meta-llama/Llama-2-70b-hf
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run -p 8080:80 -v $volume:/data --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:1.2.1 --model-id $model --sharded true --num-shard 8
docker run -p 8080:80 -v $volume:/data --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:1.2.1 --model-id $model --sharded true --num-shard 8 --max-input-length 1024 --max-total-tokens 2048
```
3. You can then send a simple request:
```bash

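The diff hunk above is cut off before the request example itself. As a rough illustration only (not taken from this commit), a request of the kind referred to in step 3 can be sent to the server's `/generate` endpoint; this sketch assumes the server started by the `docker run` commands above is listening on `127.0.0.1:8080`:

```python
# Illustrative sketch, not part of this commit: send a simple generation request
# to a running TGI server (assumed to be listening on 127.0.0.1:8080).
import requests

response = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "What is Deep Learning?",
        "parameters": {"max_new_tokens": 32},
    },
    headers={"Content-Type": "application/json"},
    timeout=60,
)
response.raise_for_status()
print(response.json()["generated_text"])
```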
assets/architecture.png (binary image, new file, 930 KiB; the previous architecture image, 334 KiB, is removed)


@ -29,4 +29,3 @@ tui = {package = "ratatui", version = "0.23", default-features = false, features
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
hf-hub = "0.3.1"


@ -6,12 +6,12 @@
</div>
A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha)
and powered by [tui](https://github.com/tui-rs-revival/ratatui).
## Install
```shell
make install-benchmark
```
@ -27,4 +27,4 @@ Then run the benchmarking tool:
```shell
text-generation-benchmark --tokenizer-name bigscience/bloom-560m
```


@ -444,7 +444,7 @@ fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Ga
}
/// Throughput paragraph
fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragraph<'a> {
fn throughput_paragraph<'a>(throughput: &[f64], name: &'static str) -> Paragraph<'a> {
// Throughput average/high/low texts
let throughput_texts = statis_spans(throughput, "tokens/secs");
@ -457,7 +457,7 @@ fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragr
}
/// Latency paragraph
fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
fn latency_paragraph<'a>(latency: &mut [f64], name: &'static str) -> Paragraph<'a> {
// Latency average/high/low texts
let mut latency_texts = statis_spans(latency, "ms");
@ -466,7 +466,7 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
let latency_percentiles = crate::utils::percentiles(latency, &[50, 90, 99]);
// Latency p50/p90/p99 texts
let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
let colors = [Color::LightGreen, Color::LightYellow, Color::LightRed];
for (i, (name, value)) in latency_percentiles.iter().enumerate() {
let span = Line::from(vec![Span::styled(
format!("{name}: {value:.2} ms"),
@ -483,7 +483,7 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap
}
/// Average/High/Low spans
fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Line<'a>> {
fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
vec![
Line::from(vec![Span::styled(
format!(
@ -543,7 +543,7 @@ fn latency_histogram<'a>(
/// Latency/Throughput chart
fn latency_throughput_chart<'a>(
latency_throughput: &'a Vec<(f64, f64)>,
latency_throughput: &'a [(f64, f64)],
batch_sizes: &'a [u32],
zoom: bool,
name: &'static str,


@ -163,7 +163,7 @@ async fn prefill(
// Run prefill
let start_time = Instant::now();
let (_, decode_batch) = client.prefill(batch.clone()).await?;
let (_, decode_batch, _) = client.prefill(batch.clone()).await?;
// Get latency
let latency = start_time.elapsed();


@ -8,7 +8,7 @@ use crate::app::App;
use crate::event::Event;
use crossterm::ExecutableCommand;
use std::io;
use text_generation_client::{NextTokenChooserParameters, ShardedClient};
use text_generation_client::{GrammarType, NextTokenChooserParameters, ShardedClient};
use tokenizers::Tokenizer;
use tokio::sync::{broadcast, mpsc};
use tui::backend::CrosstermBackend;
@ -30,6 +30,7 @@ pub async fn run(
top_p: Option<f32>,
typical_p: Option<f32>,
repetition_penalty: Option<f32>,
frequency_penalty: Option<f32>,
watermark: bool,
do_sample: bool,
client: ShardedClient,
@ -42,7 +43,10 @@ pub async fn run(
do_sample,
seed: 0,
repetition_penalty: repetition_penalty.unwrap_or(1.0),
frequency_penalty: frequency_penalty.unwrap_or(0.0),
watermark,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
};
// Initialize terminal properties
@ -140,6 +144,7 @@ pub async fn run(
top_p,
typical_p,
repetition_penalty,
frequency_penalty,
watermark,
do_sample,
);


@ -84,6 +84,11 @@ struct Args {
#[clap(long, env)]
repetition_penalty: Option<f32>,
/// Generation parameter in case you want to specifically test/debug particular
/// decoding strategies, for full doc refer to the `text-generation-server`
#[clap(long, env)]
frequency_penalty: Option<f32>,
/// Generation parameter in case you want to specifically test/debug particular
/// decoding strategies, for full doc refer to the `text-generation-server`
#[clap(long, env)]
@ -119,6 +124,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
top_p,
typical_p,
repetition_penalty,
frequency_penalty,
watermark,
do_sample,
master_shard_uds_path,
@ -187,6 +193,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
top_p,
typical_p,
repetition_penalty,
frequency_penalty,
watermark,
do_sample,
sharded_client,


@ -15,6 +15,7 @@ pub(crate) fn parameters_table(
top_p: Option<f32>,
typical_p: Option<f32>,
repetition_penalty: Option<f32>,
frequency_penalty: Option<f32>,
watermark: bool,
do_sample: bool,
) -> Table {
@ -33,6 +34,7 @@ pub(crate) fn parameters_table(
builder.push_record(["Top P", &format!("{top_p:?}")]);
builder.push_record(["Typical P", &format!("{typical_p:?}")]);
builder.push_record(["Repetition Penalty", &format!("{repetition_penalty:?}")]);
builder.push_record(["Frequency Penalty", &format!("{frequency_penalty:?}")]);
builder.push_record(["Watermark", &watermark.to_string()]);
builder.push_record(["Do Sample", &do_sample.to_string()]);
@ -149,7 +151,7 @@ fn add_throuhgputs(
}
}
fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) {
fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
let average = data.iter().sum::<f64>() / data.len() as f64;
let min = data
.iter()
@ -162,7 +164,7 @@ fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) {
(average, *min, *max)
}
fn px(data: &Vec<f64>, p: u32) -> f64 {
fn px(data: &[f64], p: u32) -> f64 {
let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
*data.get(i).unwrap_or(&std::f64::NAN)
}


@ -155,4 +155,4 @@ dmypy.json
cython_debug/
transformers
safetensors


@ -3,4 +3,4 @@ unit-tests:
install:
pip install pip --upgrade
pip install -e .


@ -107,7 +107,19 @@ print(text)
### Types
```python
# Request Parameters
# enum for grammar type
class GrammarType(Enum):
Json = "json"
Regex = "regex"
# Grammar type and value
class Grammar:
# Grammar type
type: GrammarType
# Grammar value
value: Union[str, dict]
class Parameters:
# Activate logits sampling
do_sample: bool
@ -116,6 +128,10 @@ class Parameters:
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float]
# The parameter for frequency penalty. 1.0 means no penalty
# Penalize new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float]
# Whether to prepend the prompt to the generated text
return_full_text: bool
# Stop generating tokens if a member of `stop_sequences` is generated
@ -138,10 +154,22 @@ class Parameters:
best_of: Optional[int]
# Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
watermark: bool
# Get generation details
details: bool
# Get decoder input token logprobs and ids
decoder_input_details: bool
# Return the N most likely tokens at each step
top_n_tokens: Optional[int]
# grammar to use for generation
grammar: Optional[Grammar]
class Request:
# Prompt
inputs: str
# Generation parameters
parameters: Optional[Parameters]
# Whether to stream output tokens
stream: bool
# Decoder input tokens
class InputToken:
@ -161,7 +189,7 @@ class Token:
# Token text
text: str
# Logprob
logprob: float
logprob: Optional[float]
# Is the token a special token
# Can be used to ignore tokens when concatenating
special: bool
@ -192,7 +220,7 @@ class BestOfSequence:
# Generated tokens
tokens: List[Token]
# Most likely tokens
top_tokens: Optional[List[List[Token]]]
# `generate` details
@ -236,7 +264,7 @@ class StreamResponse:
# Generated token
token: Token
# Most likely tokens
top_tokens: Optional[List[Token]]
# Complete generated text
# Only available when the generation is finished
generated_text: Optional[str]
@ -248,4 +276,4 @@ class StreamResponse:
class DeployedModel:
model_id: str
sha: str
```
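As a usage note (not part of this commit), the newly documented `frequency_penalty` and `grammar` parameters can be exercised from the Python client roughly as sketched below; the import paths and keyword names are assumptions based on the type definitions above and may differ between client versions:

```python
# Illustrative sketch, not from this commit: using the grammar and
# frequency_penalty parameters documented above. Import paths and keyword
# names are assumed from the type definitions and may vary by client version.
from text_generation import Client
from text_generation.types import Grammar, GrammarType

client = Client("http://127.0.0.1:8080")

response = client.generate(
    "Return a JSON object describing a person named Ada.",
    max_new_tokens=64,
    # Penalize tokens according to how often they already appear in the output.
    frequency_penalty=0.5,
    # Constrain generation to JSON matching a (hypothetical) schema.
    grammar=Grammar(
        type=GrammarType.Json,
        value={
            "type": "object",
            "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
            "required": ["name"],
        },
    ),
)
print(response.generated_text)
```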


@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]]
name = "aiohttp"
@ -707,18 +707,19 @@ files = [
[[package]]
name = "pydantic"
version = "2.4.2"
version = "2.5.3"
description = "Data validation using Python type hints"
optional = false
python-versions = ">=3.7"
files = [
{file = "pydantic-2.4.2-py3-none-any.whl", hash = "sha256:bc3ddf669d234f4220e6e1c4d96b061abe0998185a8d7855c0126782b7abc8c1"},
{file = "pydantic-2.4.2.tar.gz", hash = "sha256:94f336138093a5d7f426aac732dcfe7ab4eb4da243c88f891d65deb4a2556ee7"},
{file = "pydantic-2.5.3-py3-none-any.whl", hash = "sha256:d0caf5954bee831b6bfe7e338c32b9e30c85dfe080c843680783ac2b631673b4"},
{file = "pydantic-2.5.3.tar.gz", hash = "sha256:b3ef57c62535b0941697cce638c08900d87fcb67e29cfa99e8a68f747f393f7a"},
]
[package.dependencies]
annotated-types = ">=0.4.0"
pydantic-core = "2.10.1"
importlib-metadata = {version = "*", markers = "python_version == \"3.7\""}
pydantic-core = "2.14.6"
typing-extensions = ">=4.6.1"
[package.extras]
@ -726,117 +727,116 @@ email = ["email-validator (>=2.0.0)"]
[[package]]
name = "pydantic-core"
version = "2.10.1"
version = "2.14.6"
description = ""
optional = false
python-versions = ">=3.7"
files = [
{file = "pydantic_core-2.10.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:d64728ee14e667ba27c66314b7d880b8eeb050e58ffc5fec3b7a109f8cddbd63"},
{file = "pydantic_core-2.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:48525933fea744a3e7464c19bfede85df4aba79ce90c60b94d8b6e1eddd67096"},
{file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef337945bbd76cce390d1b2496ccf9f90b1c1242a3a7bc242ca4a9fc5993427a"},
{file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1392e0638af203cee360495fd2cfdd6054711f2db5175b6e9c3c461b76f5175"},
{file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0675ba5d22de54d07bccde38997e780044dcfa9a71aac9fd7d4d7a1d2e3e65f7"},
{file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:128552af70a64660f21cb0eb4876cbdadf1a1f9d5de820fed6421fa8de07c893"},
{file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f6e6aed5818c264412ac0598b581a002a9f050cb2637a84979859e70197aa9e"},
{file = "pydantic_core-2.10.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ecaac27da855b8d73f92123e5f03612b04c5632fd0a476e469dfc47cd37d6b2e"},
{file = "pydantic_core-2.10.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b3c01c2fb081fced3bbb3da78510693dc7121bb893a1f0f5f4b48013201f362e"},
{file = "pydantic_core-2.10.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:92f675fefa977625105708492850bcbc1182bfc3e997f8eecb866d1927c98ae6"},
{file = "pydantic_core-2.10.1-cp310-none-win32.whl", hash = "sha256:420a692b547736a8d8703c39ea935ab5d8f0d2573f8f123b0a294e49a73f214b"},
{file = "pydantic_core-2.10.1-cp310-none-win_amd64.whl", hash = "sha256:0880e239827b4b5b3e2ce05e6b766a7414e5f5aedc4523be6b68cfbc7f61c5d0"},
{file = "pydantic_core-2.10.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:073d4a470b195d2b2245d0343569aac7e979d3a0dcce6c7d2af6d8a920ad0bea"},
{file = "pydantic_core-2.10.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:600d04a7b342363058b9190d4e929a8e2e715c5682a70cc37d5ded1e0dd370b4"},
{file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39215d809470f4c8d1881758575b2abfb80174a9e8daf8f33b1d4379357e417c"},
{file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eeb3d3d6b399ffe55f9a04e09e635554012f1980696d6b0aca3e6cf42a17a03b"},
{file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a7902bf75779bc12ccfc508bfb7a4c47063f748ea3de87135d433a4cca7a2f"},
{file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3625578b6010c65964d177626fde80cf60d7f2e297d56b925cb5cdeda6e9925a"},
{file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:caa48fc31fc7243e50188197b5f0c4228956f97b954f76da157aae7f67269ae8"},
{file = "pydantic_core-2.10.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:07ec6d7d929ae9c68f716195ce15e745b3e8fa122fc67698ac6498d802ed0fa4"},
{file = "pydantic_core-2.10.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e6f31a17acede6a8cd1ae2d123ce04d8cca74056c9d456075f4f6f85de055607"},
{file = "pydantic_core-2.10.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8f1ebca515a03e5654f88411420fea6380fc841d1bea08effb28184e3d4899f"},
{file = "pydantic_core-2.10.1-cp311-none-win32.whl", hash = "sha256:6db2eb9654a85ada248afa5a6db5ff1cf0f7b16043a6b070adc4a5be68c716d6"},
{file = "pydantic_core-2.10.1-cp311-none-win_amd64.whl", hash = "sha256:4a5be350f922430997f240d25f8219f93b0c81e15f7b30b868b2fddfc2d05f27"},
{file = "pydantic_core-2.10.1-cp311-none-win_arm64.whl", hash = "sha256:5fdb39f67c779b183b0c853cd6b45f7db84b84e0571b3ef1c89cdb1dfc367325"},
{file = "pydantic_core-2.10.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:b1f22a9ab44de5f082216270552aa54259db20189e68fc12484873d926426921"},
{file = "pydantic_core-2.10.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8572cadbf4cfa95fb4187775b5ade2eaa93511f07947b38f4cd67cf10783b118"},
{file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db9a28c063c7c00844ae42a80203eb6d2d6bbb97070cfa00194dff40e6f545ab"},
{file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e2a35baa428181cb2270a15864ec6286822d3576f2ed0f4cd7f0c1708472aff"},
{file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05560ab976012bf40f25d5225a58bfa649bb897b87192a36c6fef1ab132540d7"},
{file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d6495008733c7521a89422d7a68efa0a0122c99a5861f06020ef5b1f51f9ba7c"},
{file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14ac492c686defc8e6133e3a2d9eaf5261b3df26b8ae97450c1647286750b901"},
{file = "pydantic_core-2.10.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8282bab177a9a3081fd3d0a0175a07a1e2bfb7fcbbd949519ea0980f8a07144d"},
{file = "pydantic_core-2.10.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:aafdb89fdeb5fe165043896817eccd6434aee124d5ee9b354f92cd574ba5e78f"},
{file = "pydantic_core-2.10.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f6defd966ca3b187ec6c366604e9296f585021d922e666b99c47e78738b5666c"},
{file = "pydantic_core-2.10.1-cp312-none-win32.whl", hash = "sha256:7c4d1894fe112b0864c1fa75dffa045720a194b227bed12f4be7f6045b25209f"},
{file = "pydantic_core-2.10.1-cp312-none-win_amd64.whl", hash = "sha256:5994985da903d0b8a08e4935c46ed8daf5be1cf217489e673910951dc533d430"},
{file = "pydantic_core-2.10.1-cp312-none-win_arm64.whl", hash = "sha256:0d8a8adef23d86d8eceed3e32e9cca8879c7481c183f84ed1a8edc7df073af94"},
{file = "pydantic_core-2.10.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:9badf8d45171d92387410b04639d73811b785b5161ecadabf056ea14d62d4ede"},
{file = "pydantic_core-2.10.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:ebedb45b9feb7258fac0a268a3f6bec0a2ea4d9558f3d6f813f02ff3a6dc6698"},
{file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfe1090245c078720d250d19cb05d67e21a9cd7c257698ef139bc41cf6c27b4f"},
{file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e357571bb0efd65fd55f18db0a2fb0ed89d0bb1d41d906b138f088933ae618bb"},
{file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b3dcd587b69bbf54fc04ca157c2323b8911033e827fffaecf0cafa5a892a0904"},
{file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c120c9ce3b163b985a3b966bb701114beb1da4b0468b9b236fc754783d85aa3"},
{file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15d6bca84ffc966cc9976b09a18cf9543ed4d4ecbd97e7086f9ce9327ea48891"},
{file = "pydantic_core-2.10.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5cabb9710f09d5d2e9e2748c3e3e20d991a4c5f96ed8f1132518f54ab2967221"},
{file = "pydantic_core-2.10.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:82f55187a5bebae7d81d35b1e9aaea5e169d44819789837cdd4720d768c55d15"},
{file = "pydantic_core-2.10.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1d40f55222b233e98e3921df7811c27567f0e1a4411b93d4c5c0f4ce131bc42f"},
{file = "pydantic_core-2.10.1-cp37-none-win32.whl", hash = "sha256:14e09ff0b8fe6e46b93d36a878f6e4a3a98ba5303c76bb8e716f4878a3bee92c"},
{file = "pydantic_core-2.10.1-cp37-none-win_amd64.whl", hash = "sha256:1396e81b83516b9d5c9e26a924fa69164156c148c717131f54f586485ac3c15e"},
{file = "pydantic_core-2.10.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:6835451b57c1b467b95ffb03a38bb75b52fb4dc2762bb1d9dbed8de31ea7d0fc"},
{file = "pydantic_core-2.10.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b00bc4619f60c853556b35f83731bd817f989cba3e97dc792bb8c97941b8053a"},
{file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fa467fd300a6f046bdb248d40cd015b21b7576c168a6bb20aa22e595c8ffcdd"},
{file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d99277877daf2efe074eae6338453a4ed54a2d93fb4678ddfe1209a0c93a2468"},
{file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fa7db7558607afeccb33c0e4bf1c9a9a835e26599e76af6fe2fcea45904083a6"},
{file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aad7bd686363d1ce4ee930ad39f14e1673248373f4a9d74d2b9554f06199fb58"},
{file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:443fed67d33aa85357464f297e3d26e570267d1af6fef1c21ca50921d2976302"},
{file = "pydantic_core-2.10.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:042462d8d6ba707fd3ce9649e7bf268633a41018d6a998fb5fbacb7e928a183e"},
{file = "pydantic_core-2.10.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ecdbde46235f3d560b18be0cb706c8e8ad1b965e5c13bbba7450c86064e96561"},
{file = "pydantic_core-2.10.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ed550ed05540c03f0e69e6d74ad58d026de61b9eaebebbaaf8873e585cbb18de"},
{file = "pydantic_core-2.10.1-cp38-none-win32.whl", hash = "sha256:8cdbbd92154db2fec4ec973d45c565e767ddc20aa6dbaf50142676484cbff8ee"},
{file = "pydantic_core-2.10.1-cp38-none-win_amd64.whl", hash = "sha256:9f6f3e2598604956480f6c8aa24a3384dbf6509fe995d97f6ca6103bb8c2534e"},
{file = "pydantic_core-2.10.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:655f8f4c8d6a5963c9a0687793da37b9b681d9ad06f29438a3b2326d4e6b7970"},
{file = "pydantic_core-2.10.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e570ffeb2170e116a5b17e83f19911020ac79d19c96f320cbfa1fa96b470185b"},
{file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64322bfa13e44c6c30c518729ef08fda6026b96d5c0be724b3c4ae4da939f875"},
{file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:485a91abe3a07c3a8d1e082ba29254eea3e2bb13cbbd4351ea4e5a21912cc9b0"},
{file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7c2b8eb9fc872e68b46eeaf835e86bccc3a58ba57d0eedc109cbb14177be531"},
{file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a5cb87bdc2e5f620693148b5f8f842d293cae46c5f15a1b1bf7ceeed324a740c"},
{file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25bd966103890ccfa028841a8f30cebcf5875eeac8c4bde4fe221364c92f0c9a"},
{file = "pydantic_core-2.10.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f323306d0556351735b54acbf82904fe30a27b6a7147153cbe6e19aaaa2aa429"},
{file = "pydantic_core-2.10.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0c27f38dc4fbf07b358b2bc90edf35e82d1703e22ff2efa4af4ad5de1b3833e7"},
{file = "pydantic_core-2.10.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f1365e032a477c1430cfe0cf2856679529a2331426f8081172c4a74186f1d595"},
{file = "pydantic_core-2.10.1-cp39-none-win32.whl", hash = "sha256:a1c311fd06ab3b10805abb72109f01a134019739bd3286b8ae1bc2fc4e50c07a"},
{file = "pydantic_core-2.10.1-cp39-none-win_amd64.whl", hash = "sha256:ae8a8843b11dc0b03b57b52793e391f0122e740de3df1474814c700d2622950a"},
{file = "pydantic_core-2.10.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d43002441932f9a9ea5d6f9efaa2e21458221a3a4b417a14027a1d530201ef1b"},
{file = "pydantic_core-2.10.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fcb83175cc4936a5425dde3356f079ae03c0802bbdf8ff82c035f8a54b333521"},
{file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:962ed72424bf1f72334e2f1e61b68f16c0e596f024ca7ac5daf229f7c26e4208"},
{file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2cf5bb4dd67f20f3bbc1209ef572a259027c49e5ff694fa56bed62959b41e1f9"},
{file = "pydantic_core-2.10.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e544246b859f17373bed915182ab841b80849ed9cf23f1f07b73b7c58baee5fb"},
{file = "pydantic_core-2.10.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c0877239307b7e69d025b73774e88e86ce82f6ba6adf98f41069d5b0b78bd1bf"},
{file = "pydantic_core-2.10.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:53df009d1e1ba40f696f8995683e067e3967101d4bb4ea6f667931b7d4a01357"},
{file = "pydantic_core-2.10.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a1254357f7e4c82e77c348dabf2d55f1d14d19d91ff025004775e70a6ef40ada"},
{file = "pydantic_core-2.10.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:524ff0ca3baea164d6d93a32c58ac79eca9f6cf713586fdc0adb66a8cdeab96a"},
{file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f0ac9fb8608dbc6eaf17956bf623c9119b4db7dbb511650910a82e261e6600f"},
{file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:320f14bd4542a04ab23747ff2c8a778bde727158b606e2661349557f0770711e"},
{file = "pydantic_core-2.10.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:63974d168b6233b4ed6a0046296803cb13c56637a7b8106564ab575926572a55"},
{file = "pydantic_core-2.10.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:417243bf599ba1f1fef2bb8c543ceb918676954734e2dcb82bf162ae9d7bd514"},
{file = "pydantic_core-2.10.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:dda81e5ec82485155a19d9624cfcca9be88a405e2857354e5b089c2a982144b2"},
{file = "pydantic_core-2.10.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:14cfbb00959259e15d684505263d5a21732b31248a5dd4941f73a3be233865b9"},
{file = "pydantic_core-2.10.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:631cb7415225954fdcc2a024119101946793e5923f6c4d73a5914d27eb3d3a05"},
{file = "pydantic_core-2.10.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:bec7dd208a4182e99c5b6c501ce0b1f49de2802448d4056091f8e630b28e9a52"},
{file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:149b8a07712f45b332faee1a2258d8ef1fb4a36f88c0c17cb687f205c5dc6e7d"},
{file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d966c47f9dd73c2d32a809d2be529112d509321c5310ebf54076812e6ecd884"},
{file = "pydantic_core-2.10.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7eb037106f5c6b3b0b864ad226b0b7ab58157124161d48e4b30c4a43fef8bc4b"},
{file = "pydantic_core-2.10.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:154ea7c52e32dce13065dbb20a4a6f0cc012b4f667ac90d648d36b12007fa9f7"},
{file = "pydantic_core-2.10.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e562617a45b5a9da5be4abe72b971d4f00bf8555eb29bb91ec2ef2be348cd132"},
{file = "pydantic_core-2.10.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:f23b55eb5464468f9e0e9a9935ce3ed2a870608d5f534025cd5536bca25b1402"},
{file = "pydantic_core-2.10.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:e9121b4009339b0f751955baf4543a0bfd6bc3f8188f8056b1a25a2d45099934"},
{file = "pydantic_core-2.10.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:0523aeb76e03f753b58be33b26540880bac5aa54422e4462404c432230543f33"},
{file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e0e2959ef5d5b8dc9ef21e1a305a21a36e254e6a34432d00c72a92fdc5ecda5"},
{file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da01bec0a26befab4898ed83b362993c844b9a607a86add78604186297eb047e"},
{file = "pydantic_core-2.10.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f2e9072d71c1f6cfc79a36d4484c82823c560e6f5599c43c1ca6b5cdbd54f881"},
{file = "pydantic_core-2.10.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f36a3489d9e28fe4b67be9992a23029c3cec0babc3bd9afb39f49844a8c721c5"},
{file = "pydantic_core-2.10.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f64f82cc3443149292b32387086d02a6c7fb39b8781563e0ca7b8d7d9cf72bd7"},
{file = "pydantic_core-2.10.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:b4a6db486ac8e99ae696e09efc8b2b9fea67b63c8f88ba7a1a16c24a057a0776"},
{file = "pydantic_core-2.10.1.tar.gz", hash = "sha256:0f8682dbdd2f67f8e1edddcbffcc29f60a6182b4901c367fc8c1c40d30bb0a82"},
{file = "pydantic_core-2.14.6-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:72f9a942d739f09cd42fffe5dc759928217649f070056f03c70df14f5770acf9"},
{file = "pydantic_core-2.14.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6a31d98c0d69776c2576dda4b77b8e0c69ad08e8b539c25c7d0ca0dc19a50d6c"},
{file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5aa90562bc079c6c290f0512b21768967f9968e4cfea84ea4ff5af5d917016e4"},
{file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:370ffecb5316ed23b667d99ce4debe53ea664b99cc37bfa2af47bc769056d534"},
{file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f85f3843bdb1fe80e8c206fe6eed7a1caeae897e496542cee499c374a85c6e08"},
{file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9862bf828112e19685b76ca499b379338fd4c5c269d897e218b2ae8fcb80139d"},
{file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:036137b5ad0cb0004c75b579445a1efccd072387a36c7f217bb8efd1afbe5245"},
{file = "pydantic_core-2.14.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:92879bce89f91f4b2416eba4429c7b5ca22c45ef4a499c39f0c5c69257522c7c"},
{file = "pydantic_core-2.14.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0c08de15d50fa190d577e8591f0329a643eeaed696d7771760295998aca6bc66"},
{file = "pydantic_core-2.14.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:36099c69f6b14fc2c49d7996cbf4f87ec4f0e66d1c74aa05228583225a07b590"},
{file = "pydantic_core-2.14.6-cp310-none-win32.whl", hash = "sha256:7be719e4d2ae6c314f72844ba9d69e38dff342bc360379f7c8537c48e23034b7"},
{file = "pydantic_core-2.14.6-cp310-none-win_amd64.whl", hash = "sha256:36fa402dcdc8ea7f1b0ddcf0df4254cc6b2e08f8cd80e7010d4c4ae6e86b2a87"},
{file = "pydantic_core-2.14.6-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:dea7fcd62915fb150cdc373212141a30037e11b761fbced340e9db3379b892d4"},
{file = "pydantic_core-2.14.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffff855100bc066ff2cd3aa4a60bc9534661816b110f0243e59503ec2df38421"},
{file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b027c86c66b8627eb90e57aee1f526df77dc6d8b354ec498be9a757d513b92b"},
{file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:00b1087dabcee0b0ffd104f9f53d7d3eaddfaa314cdd6726143af6bc713aa27e"},
{file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:75ec284328b60a4e91010c1acade0c30584f28a1f345bc8f72fe8b9e46ec6a96"},
{file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7e1f4744eea1501404b20b0ac059ff7e3f96a97d3e3f48ce27a139e053bb370b"},
{file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2602177668f89b38b9f84b7b3435d0a72511ddef45dc14446811759b82235a1"},
{file = "pydantic_core-2.14.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6c8edaea3089bf908dd27da8f5d9e395c5b4dc092dbcce9b65e7156099b4b937"},
{file = "pydantic_core-2.14.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:478e9e7b360dfec451daafe286998d4a1eeaecf6d69c427b834ae771cad4b622"},
{file = "pydantic_core-2.14.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b6ca36c12a5120bad343eef193cc0122928c5c7466121da7c20f41160ba00ba2"},
{file = "pydantic_core-2.14.6-cp311-none-win32.whl", hash = "sha256:2b8719037e570639e6b665a4050add43134d80b687288ba3ade18b22bbb29dd2"},
{file = "pydantic_core-2.14.6-cp311-none-win_amd64.whl", hash = "sha256:78ee52ecc088c61cce32b2d30a826f929e1708f7b9247dc3b921aec367dc1b23"},
{file = "pydantic_core-2.14.6-cp311-none-win_arm64.whl", hash = "sha256:a19b794f8fe6569472ff77602437ec4430f9b2b9ec7a1105cfd2232f9ba355e6"},
{file = "pydantic_core-2.14.6-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:667aa2eac9cd0700af1ddb38b7b1ef246d8cf94c85637cbb03d7757ca4c3fdec"},
{file = "pydantic_core-2.14.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cdee837710ef6b56ebd20245b83799fce40b265b3b406e51e8ccc5b85b9099b7"},
{file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c5bcf3414367e29f83fd66f7de64509a8fd2368b1edf4351e862910727d3e51"},
{file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:26a92ae76f75d1915806b77cf459811e772d8f71fd1e4339c99750f0e7f6324f"},
{file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a983cca5ed1dd9a35e9e42ebf9f278d344603bfcb174ff99a5815f953925140a"},
{file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cb92f9061657287eded380d7dc455bbf115430b3aa4741bdc662d02977e7d0af"},
{file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4ace1e220b078c8e48e82c081e35002038657e4b37d403ce940fa679e57113b"},
{file = "pydantic_core-2.14.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ef633add81832f4b56d3b4c9408b43d530dfca29e68fb1b797dcb861a2c734cd"},
{file = "pydantic_core-2.14.6-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e90d6cc4aad2cc1f5e16ed56e46cebf4877c62403a311af20459c15da76fd91"},
{file = "pydantic_core-2.14.6-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e8a5ac97ea521d7bde7621d86c30e86b798cdecd985723c4ed737a2aa9e77d0c"},
{file = "pydantic_core-2.14.6-cp312-none-win32.whl", hash = "sha256:f27207e8ca3e5e021e2402ba942e5b4c629718e665c81b8b306f3c8b1ddbb786"},
{file = "pydantic_core-2.14.6-cp312-none-win_amd64.whl", hash = "sha256:b3e5fe4538001bb82e2295b8d2a39356a84694c97cb73a566dc36328b9f83b40"},
{file = "pydantic_core-2.14.6-cp312-none-win_arm64.whl", hash = "sha256:64634ccf9d671c6be242a664a33c4acf12882670b09b3f163cd00a24cffbd74e"},
{file = "pydantic_core-2.14.6-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:24368e31be2c88bd69340fbfe741b405302993242ccb476c5c3ff48aeee1afe0"},
{file = "pydantic_core-2.14.6-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e33b0834f1cf779aa839975f9d8755a7c2420510c0fa1e9fa0497de77cd35d2c"},
{file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6af4b3f52cc65f8a0bc8b1cd9676f8c21ef3e9132f21fed250f6958bd7223bed"},
{file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d15687d7d7f40333bd8266f3814c591c2e2cd263fa2116e314f60d82086e353a"},
{file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:095b707bb287bfd534044166ab767bec70a9bba3175dcdc3371782175c14e43c"},
{file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94fc0e6621e07d1e91c44e016cc0b189b48db053061cc22d6298a611de8071bb"},
{file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ce830e480f6774608dedfd4a90c42aac4a7af0a711f1b52f807130c2e434c06"},
{file = "pydantic_core-2.14.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a306cdd2ad3a7d795d8e617a58c3a2ed0f76c8496fb7621b6cd514eb1532cae8"},
{file = "pydantic_core-2.14.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:2f5fa187bde8524b1e37ba894db13aadd64faa884657473b03a019f625cee9a8"},
{file = "pydantic_core-2.14.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:438027a975cc213a47c5d70672e0d29776082155cfae540c4e225716586be75e"},
{file = "pydantic_core-2.14.6-cp37-none-win32.whl", hash = "sha256:f96ae96a060a8072ceff4cfde89d261837b4294a4f28b84a28765470d502ccc6"},
{file = "pydantic_core-2.14.6-cp37-none-win_amd64.whl", hash = "sha256:e646c0e282e960345314f42f2cea5e0b5f56938c093541ea6dbf11aec2862391"},
{file = "pydantic_core-2.14.6-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:db453f2da3f59a348f514cfbfeb042393b68720787bbef2b4c6068ea362c8149"},
{file = "pydantic_core-2.14.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3860c62057acd95cc84044e758e47b18dcd8871a328ebc8ccdefd18b0d26a21b"},
{file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36026d8f99c58d7044413e1b819a67ca0e0b8ebe0f25e775e6c3d1fabb3c38fb"},
{file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8ed1af8692bd8d2a29d702f1a2e6065416d76897d726e45a1775b1444f5928a7"},
{file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:314ccc4264ce7d854941231cf71b592e30d8d368a71e50197c905874feacc8a8"},
{file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:982487f8931067a32e72d40ab6b47b1628a9c5d344be7f1a4e668fb462d2da42"},
{file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dbe357bc4ddda078f79d2a36fc1dd0494a7f2fad83a0a684465b6f24b46fe80"},
{file = "pydantic_core-2.14.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2f6ffc6701a0eb28648c845f4945a194dc7ab3c651f535b81793251e1185ac3d"},
{file = "pydantic_core-2.14.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:7f5025db12fc6de7bc1104d826d5aee1d172f9ba6ca936bf6474c2148ac336c1"},
{file = "pydantic_core-2.14.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dab03ed811ed1c71d700ed08bde8431cf429bbe59e423394f0f4055f1ca0ea60"},
{file = "pydantic_core-2.14.6-cp38-none-win32.whl", hash = "sha256:dfcbebdb3c4b6f739a91769aea5ed615023f3c88cb70df812849aef634c25fbe"},
{file = "pydantic_core-2.14.6-cp38-none-win_amd64.whl", hash = "sha256:99b14dbea2fdb563d8b5a57c9badfcd72083f6006caf8e126b491519c7d64ca8"},
{file = "pydantic_core-2.14.6-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:4ce8299b481bcb68e5c82002b96e411796b844d72b3e92a3fbedfe8e19813eab"},
{file = "pydantic_core-2.14.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b9a9d92f10772d2a181b5ca339dee066ab7d1c9a34ae2421b2a52556e719756f"},
{file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd9e98b408384989ea4ab60206b8e100d8687da18b5c813c11e92fd8212a98e0"},
{file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4f86f1f318e56f5cbb282fe61eb84767aee743ebe32c7c0834690ebea50c0a6b"},
{file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86ce5fcfc3accf3a07a729779d0b86c5d0309a4764c897d86c11089be61da160"},
{file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dcf1978be02153c6a31692d4fbcc2a3f1db9da36039ead23173bc256ee3b91b"},
{file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eedf97be7bc3dbc8addcef4142f4b4164066df0c6f36397ae4aaed3eb187d8ab"},
{file = "pydantic_core-2.14.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d5f916acf8afbcab6bacbb376ba7dc61f845367901ecd5e328fc4d4aef2fcab0"},
{file = "pydantic_core-2.14.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8a14c192c1d724c3acbfb3f10a958c55a2638391319ce8078cb36c02283959b9"},
{file = "pydantic_core-2.14.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0348b1dc6b76041516e8a854ff95b21c55f5a411c3297d2ca52f5528e49d8411"},
{file = "pydantic_core-2.14.6-cp39-none-win32.whl", hash = "sha256:de2a0645a923ba57c5527497daf8ec5df69c6eadf869e9cd46e86349146e5975"},
{file = "pydantic_core-2.14.6-cp39-none-win_amd64.whl", hash = "sha256:aca48506a9c20f68ee61c87f2008f81f8ee99f8d7f0104bff3c47e2d148f89d9"},
{file = "pydantic_core-2.14.6-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:d5c28525c19f5bb1e09511669bb57353d22b94cf8b65f3a8d141c389a55dec95"},
{file = "pydantic_core-2.14.6-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:78d0768ee59baa3de0f4adac9e3748b4b1fffc52143caebddfd5ea2961595277"},
{file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b93785eadaef932e4fe9c6e12ba67beb1b3f1e5495631419c784ab87e975670"},
{file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a874f21f87c485310944b2b2734cd6d318765bcbb7515eead33af9641816506e"},
{file = "pydantic_core-2.14.6-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89f4477d915ea43b4ceea6756f63f0288941b6443a2b28c69004fe07fde0d0d"},
{file = "pydantic_core-2.14.6-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:172de779e2a153d36ee690dbc49c6db568d7b33b18dc56b69a7514aecbcf380d"},
{file = "pydantic_core-2.14.6-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:dfcebb950aa7e667ec226a442722134539e77c575f6cfaa423f24371bb8d2e94"},
{file = "pydantic_core-2.14.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:55a23dcd98c858c0db44fc5c04fc7ed81c4b4d33c653a7c45ddaebf6563a2f66"},
{file = "pydantic_core-2.14.6-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:4241204e4b36ab5ae466ecec5c4c16527a054c69f99bba20f6f75232a6a534e2"},
{file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e574de99d735b3fc8364cba9912c2bec2da78775eba95cbb225ef7dda6acea24"},
{file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1302a54f87b5cd8528e4d6d1bf2133b6aa7c6122ff8e9dc5220fbc1e07bffebd"},
{file = "pydantic_core-2.14.6-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8e81e4b55930e5ffab4a68db1af431629cf2e4066dbdbfef65348b8ab804ea8"},
{file = "pydantic_core-2.14.6-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:c99462ffc538717b3e60151dfaf91125f637e801f5ab008f81c402f1dff0cd0f"},
{file = "pydantic_core-2.14.6-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e4cf2d5829f6963a5483ec01578ee76d329eb5caf330ecd05b3edd697e7d768a"},
{file = "pydantic_core-2.14.6-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:cf10b7d58ae4a1f07fccbf4a0a956d705356fea05fb4c70608bb6fa81d103cda"},
{file = "pydantic_core-2.14.6-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:399ac0891c284fa8eb998bcfa323f2234858f5d2efca3950ae58c8f88830f145"},
{file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c6a5c79b28003543db3ba67d1df336f253a87d3112dac3a51b94f7d48e4c0e1"},
{file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:599c87d79cab2a6a2a9df4aefe0455e61e7d2aeede2f8577c1b7c0aec643ee8e"},
{file = "pydantic_core-2.14.6-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43e166ad47ba900f2542a80d83f9fc65fe99eb63ceec4debec160ae729824052"},
{file = "pydantic_core-2.14.6-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:3a0b5db001b98e1c649dd55afa928e75aa4087e587b9524a4992316fa23c9fba"},
{file = "pydantic_core-2.14.6-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:747265448cb57a9f37572a488a57d873fd96bf51e5bb7edb52cfb37124516da4"},
{file = "pydantic_core-2.14.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:7ebe3416785f65c28f4f9441e916bfc8a54179c8dea73c23023f7086fa601c5d"},
{file = "pydantic_core-2.14.6-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:86c963186ca5e50d5c8287b1d1c9d3f8f024cbe343d048c5bd282aec2d8641f2"},
{file = "pydantic_core-2.14.6-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:e0641b506486f0b4cd1500a2a65740243e8670a2549bb02bc4556a83af84ae03"},
{file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71d72ca5eaaa8d38c8df16b7deb1a2da4f650c41b58bb142f3fb75d5ad4a611f"},
{file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27e524624eace5c59af499cd97dc18bb201dc6a7a2da24bfc66ef151c69a5f2a"},
{file = "pydantic_core-2.14.6-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3dde6cac75e0b0902778978d3b1646ca9f438654395a362cb21d9ad34b24acf"},
{file = "pydantic_core-2.14.6-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:00646784f6cd993b1e1c0e7b0fdcbccc375d539db95555477771c27555e3c556"},
{file = "pydantic_core-2.14.6-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:23598acb8ccaa3d1d875ef3b35cb6376535095e9405d91a3d57a8c7db5d29341"},
{file = "pydantic_core-2.14.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7f41533d7e3cf9520065f610b41ac1c76bc2161415955fbcead4981b22c7611e"},
{file = "pydantic_core-2.14.6.tar.gz", hash = "sha256:1fd0c1d395372843fba13a51c28e3bb9d59bd7aebfeb17358ffaaa1e4dbbe948"},
]
[package.dependencies]
@ -928,6 +928,7 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "text-generation"
version = "0.6.1"
version = "0.7.0"
description = "Hugging Face Text Generation Python Client"
license = "Apache-2.0"
authors = ["Olivier Dehaene <olivier@huggingface.co>"]
@ -12,7 +12,7 @@ repository = "https://github.com/huggingface/text-generation-inference"
[tool.poetry.dependencies]
python = "^3.7"
pydantic = "> 1.10, < 3"
pydantic = "> 2, < 3"
aiohttp = "^3.8"
huggingface-hub = ">= 0.12, < 1.0"
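Since the client now requires pydantic v2, a quick, hypothetical pre-upgrade check in a consuming application might look like this (the assertion message is illustrative):

```python
# Hypothetical sanity check that the installed pydantic satisfies the new "> 2, < 3" bound.
import pydantic

major = int(pydantic.VERSION.split(".")[0])
assert major == 2, f"text-generation 0.7.0 expects pydantic v2, found {pydantic.VERSION}"
```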

View File

@ -3,13 +3,19 @@ import requests
from aiohttp import ClientSession, ClientTimeout
from pydantic import ValidationError
from typing import Dict, Optional, List, AsyncIterator, Iterator
from typing import Dict, Optional, List, AsyncIterator, Iterator, Union
from text_generation.types import (
StreamResponse,
Response,
Request,
Parameters,
Grammar,
ChatRequest,
ChatCompletionChunk,
ChatComplete,
Message,
Tool,
)
from text_generation.errors import parse_error
@ -58,6 +64,120 @@ class Client:
self.cookies = cookies
self.timeout = timeout
def chat(
self,
messages: List[Message],
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[List[float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
stream: bool = False,
seed: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
tools: Optional[List[Tool]] = None,
tool_choice: Optional[str] = None,
):
"""
Given a list of messages, generate a response
Args:
messages (`List[Message]`):
List of messages
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalizes new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
logit_bias (`List[float]`):
Adjust the likelihood of specified tokens
logprobs (`bool`):
Include log probabilities in the response
top_logprobs (`int`):
Include the `n` most likely tokens at each step
max_tokens (`int`):
Maximum number of generated tokens
n (`int`):
Generate `n` completions
presence_penalty (`float`):
The parameter for presence penalty. 0.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
stream (`bool`):
Stream the response
seed (`int`):
Random sampling seed
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
tools (`List[Tool]`):
List of tools to use
tool_choice (`str`):
The tool to use
"""
request = ChatRequest(
model="tgi",
messages=messages,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
top_logprobs=top_logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
stream=stream,
seed=seed,
temperature=temperature,
top_p=top_p,
tools=tools,
tool_choice=tool_choice,
)
if not stream:
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
)
payload = resp.json()
if resp.status_code != 200:
raise parse_error(resp.status_code, payload)
return ChatComplete(**payload)
else:
return self._chat_stream_response(request)
def _chat_stream_response(self, request):
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json=request.dict(),
headers=self.headers,
cookies=self.cookies,
timeout=self.timeout,
stream=True,
)
# iterate over the server-sent event stream
for byte_payload in resp.iter_lines():
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = ChatCompletionChunk(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status_code, json_payload)
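A short usage sketch of the new synchronous `chat` method; the endpoint URL and prompts below are illustrative assumptions, not part of the change:

```python
from text_generation import Client
from text_generation.types import Message

# Assumes a TGI server is reachable locally on port 8080.
client = Client("http://127.0.0.1:8080")

# Non-streaming call: returns a ChatComplete object.
complete = client.chat(
    messages=[Message(role="user", content="What is deep learning?")],
    max_tokens=64,
)
print(complete.choices[0].message.content)

# Streaming call: returns an iterator of ChatCompletionChunk objects.
for chunk in client.chat(
    messages=[Message(role="user", content="Tell me a joke.")],
    max_tokens=64,
    stream=True,
):
    print(chunk.choices[0].delta.content or "", end="")
```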
def generate(
self,
prompt: str,
@ -65,6 +185,7 @@ class Client:
max_new_tokens: int = 20,
best_of: Optional[int] = None,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
@ -76,6 +197,7 @@ class Client:
watermark: bool = False,
decoder_input_details: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Response:
"""
Given a prompt, generate the following text
@ -92,6 +214,10 @@ class Client:
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalizes new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
@ -116,6 +242,9 @@ class Client:
Return the decoder input token logprobs and ids
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Response: generated response
@ -127,6 +256,7 @@ class Client:
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
@ -138,6 +268,7 @@ class Client:
watermark=watermark,
decoder_input_details=decoder_input_details,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)
@ -159,6 +290,7 @@ class Client:
do_sample: bool = False,
max_new_tokens: int = 20,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
@ -169,6 +301,7 @@ class Client:
typical_p: Optional[float] = None,
watermark: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Iterator[StreamResponse]:
"""
Given a prompt, generate the following stream of tokens
@ -183,6 +316,10 @@ class Client:
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalizes new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
@ -205,6 +342,9 @@ class Client:
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Iterator[StreamResponse]: stream of generated tokens
@ -217,6 +357,7 @@ class Client:
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
@ -227,6 +368,7 @@ class Client:
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=True, parameters=parameters)
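A sketch of the new `grammar` parameter on `generate`, assuming the server was started with grammar support enabled; the prompt, regex, and URL are illustrative:

```python
from text_generation import Client
from text_generation.types import Grammar

client = Client("http://127.0.0.1:8080")

# Constrain the output to match a simple regular expression.
response = client.generate(
    "What is the capital of France? Answer with a single word:",
    max_new_tokens=10,
    grammar=Grammar(type="regex", value="[A-Z][a-z]+"),
)
print(response.generated_text)
```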
@ -306,7 +448,120 @@ class AsyncClient:
self.base_url = base_url
self.headers = headers
self.cookies = cookies
self.timeout = ClientTimeout(timeout * 60)
self.timeout = ClientTimeout(timeout)
async def chat(
self,
messages: List[Message],
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
logit_bias: Optional[List[float]] = None,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
stream: bool = False,
seed: Optional[int] = None,
temperature: Optional[float] = None,
top_p: Optional[float] = None,
tools: Optional[List[Tool]] = None,
tool_choice: Optional[str] = None,
) -> Union[ChatComplete, AsyncIterator[ChatCompletionChunk]]:
"""
Given a list of messages, generate a response asynchronously
Args:
messages (`List[Message]`):
List of messages
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalizes new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
logit_bias (`List[float]`):
Adjust the likelihood of specified tokens
logprobs (`bool`):
Include log probabilities in the response
top_logprobs (`int`):
Include the `n` most likely tokens at each step
max_tokens (`int`):
Maximum number of generated tokens
n (`int`):
Generate `n` completions
presence_penalty (`float`):
The parameter for presence penalty. 0.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
stream (`bool`):
Stream the response
seed (`int`):
Random sampling seed
temperature (`float`):
The value used to modulate the logits distribution.
top_p (`float`):
If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or
higher are kept for generation
tools (`List[Tool]`):
List of tools to use
tool_choice (`str`):
The tool to use
"""
request = ChatRequest(
model="tgi",
messages=messages,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
top_logprobs=top_logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
stream=stream,
seed=seed,
temperature=temperature,
top_p=top_p,
tools=tools,
tool_choice=tool_choice,
)
if not stream:
return await self._chat_single_response(request)
else:
return self._chat_stream_response(request)
async def _chat_single_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/chat/completions", json=request.dict()
) as resp:
payload = await resp.json()
if resp.status != 200:
raise parse_error(resp.status, payload)
return ChatComplete(**payload)
async def _chat_stream_response(self, request):
async with ClientSession(
headers=self.headers, cookies=self.cookies, timeout=self.timeout
) as session:
async with session.post(
f"{self.base_url}/v1/chat/completions", json=request.dict()
) as resp:
async for byte_payload in resp.content:
if byte_payload == b"\n":
continue
payload = byte_payload.decode("utf-8")
if payload.startswith("data:"):
json_payload = json.loads(payload.lstrip("data:").rstrip("\n"))
try:
response = ChatCompletionChunk(**json_payload)
yield response
except ValidationError:
raise parse_error(resp.status, json_payload)
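The asynchronous counterpart in both modes (URL and prompts are illustrative assumptions):

```python
import asyncio

from text_generation import AsyncClient
from text_generation.types import Message

async def main():
    client = AsyncClient("http://127.0.0.1:8080")

    # Non-streaming: awaiting chat() yields a ChatComplete.
    complete = await client.chat(
        messages=[Message(role="user", content="What is deep learning?")],
        max_tokens=64,
    )
    print(complete.choices[0].message.content)

    # Streaming: chat(stream=True) resolves to an async iterator of ChatCompletionChunk.
    chunks = await client.chat(
        messages=[Message(role="user", content="Tell me a joke.")],
        max_tokens=64,
        stream=True,
    )
    async for chunk in chunks:
        print(chunk.choices[0].delta.content or "", end="")

asyncio.run(main())
```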
async def generate(
self,
@ -315,6 +570,7 @@ class AsyncClient:
max_new_tokens: int = 20,
best_of: Optional[int] = None,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
@ -326,6 +582,7 @@ class AsyncClient:
watermark: bool = False,
decoder_input_details: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> Response:
"""
Given a prompt, generate the following text asynchronously
@ -342,6 +599,10 @@ class AsyncClient:
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalizes new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
@ -366,10 +627,14 @@ class AsyncClient:
Return the decoder input token logprobs and ids
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
Response: generated response
"""
# Validate parameters
parameters = Parameters(
best_of=best_of,
@ -378,6 +643,7 @@ class AsyncClient:
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
@ -388,6 +654,7 @@ class AsyncClient:
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=False, parameters=parameters)
@ -407,6 +674,7 @@ class AsyncClient:
do_sample: bool = False,
max_new_tokens: int = 20,
repetition_penalty: Optional[float] = None,
frequency_penalty: Optional[float] = None,
return_full_text: bool = False,
seed: Optional[int] = None,
stop_sequences: Optional[List[str]] = None,
@ -417,6 +685,7 @@ class AsyncClient:
typical_p: Optional[float] = None,
watermark: bool = False,
top_n_tokens: Optional[int] = None,
grammar: Optional[Grammar] = None,
) -> AsyncIterator[StreamResponse]:
"""
Given a prompt, generate the following stream of tokens asynchronously
@ -431,6 +700,10 @@ class AsyncClient:
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
frequency_penalty (`float`):
The parameter for frequency penalty. 0.0 means no penalty.
Penalizes new tokens based on their existing frequency in the text so far,
decreasing the model's likelihood to repeat the same line verbatim.
return_full_text (`bool`):
Whether to prepend the prompt to the generated text
seed (`int`):
@ -453,6 +726,9 @@ class AsyncClient:
Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
top_n_tokens (`int`):
Return the `n` most likely tokens at each step
grammar (`Grammar`):
Whether to use a grammar for the generation and the grammar to use. Grammars will constrain the generation
of the text to match a regular expression or JSON schema.
Returns:
AsyncIterator[StreamResponse]: stream of generated tokens
@ -465,6 +741,7 @@ class AsyncClient:
do_sample=do_sample,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
frequency_penalty=frequency_penalty,
return_full_text=return_full_text,
seed=seed,
stop=stop_sequences if stop_sequences is not None else [],
@ -475,6 +752,7 @@ class AsyncClient:
typical_p=typical_p,
watermark=watermark,
top_n_tokens=top_n_tokens,
grammar=grammar,
)
request = Request(inputs=prompt, stream=True, parameters=parameters)
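And a streaming generation sketch with the async client and a grammar constraint (prompt, regex, and URL are illustrative; the server must have grammar support enabled):

```python
import asyncio

from text_generation import AsyncClient
from text_generation.types import Grammar

async def main():
    client = AsyncClient("http://127.0.0.1:8080")
    text = ""
    # generate_stream is an async generator of StreamResponse objects.
    async for response in client.generate_stream(
        "Generate a fake email address:",
        max_new_tokens=20,
        grammar=Grammar(type="regex", value="[a-z]+@[a-z]+\\.com"),
    ):
        if not response.token.special:
            text += response.token.text
    print(text)

asyncio.run(main())
```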

View File

@ -1,10 +1,147 @@
from enum import Enum
from pydantic import BaseModel, validator
from typing import Optional, List
from pydantic import BaseModel, field_validator
from typing import Optional, List, Union, Any
from text_generation.errors import ValidationError
# enum for grammar type
class GrammarType(str, Enum):
Json = "json"
Regex = "regex"
# Grammar type and value
class Grammar(BaseModel):
# Grammar type
type: GrammarType
# Grammar value
value: Union[str, dict]
class ToolCall(BaseModel):
# Id of the tool call
id: int
# Type of the tool call
type: str
# Function details of the tool call
function: dict
class Message(BaseModel):
# Role of the message sender
role: str
# Content of the message
content: Optional[str] = None
# Optional name of the message sender
name: Optional[str] = None
# Tool calls associated with the chat completion
tool_calls: Optional[Any] = None
class Tool(BaseModel):
# Type of the tool
type: str
# Function details of the tool
function: dict
class ChatCompletionComplete(BaseModel):
# Index of the chat completion
index: int
# Message associated with the chat completion
message: Message
# Log probabilities for the chat completion
logprobs: Optional[Any]
# Reason for completion
finish_reason: str
# Usage details of the chat completion
usage: Optional[Any] = None
class Function(BaseModel):
name: Optional[str]
arguments: str
class ChoiceDeltaToolCall(BaseModel):
index: int
id: str
type: str
function: Function
class ChoiceDelta(BaseModel):
role: str
content: Optional[str] = None
tool_calls: Optional[ChoiceDeltaToolCall]
class Choice(BaseModel):
index: int
delta: ChoiceDelta
logprobs: Optional[dict] = None
finish_reason: Optional[str] = None
class ChatCompletionChunk(BaseModel):
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[Choice]
class ChatComplete(BaseModel):
# Chat completion details
id: str
object: str
created: int
model: str
system_fingerprint: str
choices: List[ChatCompletionComplete]
usage: Any
class ChatRequest(BaseModel):
# Model identifier
model: str
# List of messages in the conversation
messages: List[Message]
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty.
# Penalizes new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Bias values for token selection
logit_bias: Optional[List[float]] = None
# Whether to return log probabilities
logprobs: Optional[bool] = None
# Number of most likely tokens to return at each position
top_logprobs: Optional[int] = None
# Maximum number of tokens to generate
max_tokens: Optional[int] = None
# Number of chat completion choices to generate
n: Optional[int] = None
# Penalty for presence of new tokens
presence_penalty: Optional[float] = None
# Flag to indicate streaming response
stream: bool = False
# Random sampling seed
seed: Optional[int] = None
# Sampling temperature
temperature: Optional[float] = None
# Top-p value for nucleus sampling
top_p: Optional[float] = None
# List of tools to be used
tools: Optional[List[Tool]] = None
# Choice of tool to be used
tool_choice: Optional[str] = None
class Parameters(BaseModel):
# Activate logits sampling
do_sample: bool = False
@ -13,6 +150,10 @@ class Parameters(BaseModel):
# The parameter for repetition penalty. 1.0 means no penalty.
# See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
repetition_penalty: Optional[float] = None
# The parameter for frequency penalty. 0.0 means no penalty.
# Penalizes new tokens based on their existing frequency in the text so far,
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Whether to prepend the prompt to the generated text
return_full_text: bool = False
# Stop generating tokens if a member of `stop_sequences` is generated
@ -41,74 +182,91 @@ class Parameters(BaseModel):
decoder_input_details: bool = False
# Return the N most likely tokens at each step
top_n_tokens: Optional[int] = None
# grammar to use for generation
grammar: Optional[Grammar] = None
@validator("best_of")
@field_validator("best_of")
def valid_best_of(cls, field_value, values):
if field_value is not None:
if field_value <= 0:
raise ValidationError("`best_of` must be strictly positive")
if field_value > 1 and values["seed"] is not None:
if field_value > 1 and values.data["seed"] is not None:
raise ValidationError("`seed` must not be set when `best_of` is > 1")
sampling = (
values["do_sample"]
| (values["temperature"] is not None)
| (values["top_k"] is not None)
| (values["top_p"] is not None)
| (values["typical_p"] is not None)
values.data["do_sample"]
| (values.data["temperature"] is not None)
| (values.data["top_k"] is not None)
| (values.data["top_p"] is not None)
| (values.data["typical_p"] is not None)
)
if field_value > 1 and not sampling:
raise ValidationError("you must use sampling when `best_of` is > 1")
return field_value
@validator("repetition_penalty")
@field_validator("repetition_penalty")
def valid_repetition_penalty(cls, v):
if v is not None and v <= 0:
raise ValidationError("`repetition_penalty` must be strictly positive")
return v
@validator("seed")
@field_validator("frequency_penalty")
def valid_frequency_penalty(cls, v):
if v is not None and v <= 0:
raise ValidationError("`frequency_penalty` must be strictly positive")
return v
@field_validator("seed")
def valid_seed(cls, v):
if v is not None and v < 0:
raise ValidationError("`seed` must be positive")
return v
@validator("temperature")
@field_validator("temperature")
def valid_temp(cls, v):
if v is not None and v <= 0:
raise ValidationError("`temperature` must be strictly positive")
return v
@validator("top_k")
@field_validator("top_k")
def valid_top_k(cls, v):
if v is not None and v <= 0:
raise ValidationError("`top_k` must be strictly positive")
return v
@validator("top_p")
@field_validator("top_p")
def valid_top_p(cls, v):
if v is not None and (v <= 0 or v >= 1.0):
raise ValidationError("`top_p` must be > 0.0 and < 1.0")
return v
@validator("truncate")
@field_validator("truncate")
def valid_truncate(cls, v):
if v is not None and v <= 0:
raise ValidationError("`truncate` must be strictly positive")
return v
@validator("typical_p")
@field_validator("typical_p")
def valid_typical_p(cls, v):
if v is not None and (v <= 0 or v >= 1.0):
raise ValidationError("`typical_p` must be > 0.0 and < 1.0")
return v
@validator("top_n_tokens")
@field_validator("top_n_tokens")
def valid_top_n_tokens(cls, v):
if v is not None and v <= 0:
raise ValidationError("`top_n_tokens` must be strictly positive")
return v
@field_validator("grammar")
def valid_grammar(cls, v):
if v is not None:
if v.type == GrammarType.Regex and not v.value:
raise ValidationError("`value` cannot be empty for `regex` grammar")
if v.type == GrammarType.Json and not v.value:
raise ValidationError("`value` cannot be empty for `json` grammar")
return v
class Request(BaseModel):
# Prompt
@ -118,15 +276,15 @@ class Request(BaseModel):
# Whether to stream output tokens
stream: bool = False
@validator("inputs")
@field_validator("inputs")
def valid_input(cls, v):
if not v:
raise ValidationError("`inputs` cannot be empty")
return v
@validator("stream")
@field_validator("stream")
def valid_best_of_stream(cls, field_value, values):
parameters = values["parameters"]
parameters = values.data["parameters"]
if (
parameters is not None
and parameters.best_of is not None
@ -157,7 +315,7 @@ class Token(BaseModel):
# Token text
text: str
# Logprob
logprob: float
logprob: Optional[float] = None
# Is the token a special token
# Can be used to ignore tokens when concatenating
special: bool
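A small sketch of how these pydantic v2 validators behave at construction time; the schema and values are illustrative, and the error message comes from the `valid_grammar` validator above:

```python
from text_generation.errors import ValidationError
from text_generation.types import Grammar, Parameters

# A JSON grammar with a non-empty schema passes validation.
params = Parameters(
    max_new_tokens=32,
    grammar=Grammar(
        type="json",
        value={"type": "object", "properties": {"name": {"type": "string"}}},
    ),
)

# An empty regex grammar is rejected by the `valid_grammar` field validator.
try:
    Parameters(grammar=Grammar(type="regex", value=""))
except ValidationError as err:
    print(err)  # `value` cannot be empty for `regex` grammar
```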

View File

@ -27,4 +27,4 @@
}
</script>
</body>
</html>
</html>

File diff suppressed because it is too large

View File

@ -7,6 +7,8 @@
title: Installation
- local: supported_models
title: Supported Models and Hardware
- local: messages_api
title: Messages API
title: Getting started
- sections:
- local: basic_tutorials/consuming_tgi
@ -21,6 +23,8 @@
title: All TGI CLI options
- local: basic_tutorials/non_core_models
title: Non-core Model Serving
- local: basic_tutorials/safety
title: Safety
title: Tutorials
- sections:
- local: conceptual/streaming
@ -35,4 +39,8 @@
title: Safetensors
- local: conceptual/flash_attention
title: Flash Attention
- local: conceptual/speculation
title: Speculation (Medusa, ngram)
- local: conceptual/guidance
title: Guidance, JSON, tools (using outlines)
title: Conceptual Guides

View File

@ -23,7 +23,7 @@ You can simply install `huggingface-hub` package with pip.
pip install huggingface-hub
```
Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
```python
from huggingface_hub import InferenceClient
@ -83,8 +83,8 @@ Gradio is a Python library that helps you build web applications for your machin
pip install huggingface-hub gradio
```
Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
```python
import gradio as gr
from huggingface_hub import InferenceClient
@ -110,30 +110,30 @@ gr.ChatInterface(
).queue().launch()
```
The UI looks like this 👇
The UI looks like this 👇
<div class="flex justify-center">
<img
class="block dark:hidden"
<img
class="block dark:hidden"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi.png"
/>
<img
class="hidden dark:block"
<img
class="hidden dark:block"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi-dark.png"
/>
</div>
You can try the demo directly here 👇
You can try the demo directly here 👇
<div class="block dark:hidden">
<iframe
<iframe
src="https://merve-gradio-tgi-2.hf.space?__theme=light"
width="850"
height="750"
></iframe>
</div>
<div class="hidden dark:block">
<iframe
<iframe
src="https://merve-gradio-tgi-2.hf.space?__theme=dark"
width="850"
height="750"
@ -152,4 +152,4 @@ You can read more about how to customize a `ChatInterface` [here](https://www.gr
## API documentation
You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).
You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).

View File

@ -19,6 +19,6 @@ docker run --gpus all \
--shm-size 1g \
-e HUGGING_FACE_HUB_TOKEN=$token \
-p 8080:80 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:1.2 \
-v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 \
--model-id $model
```

View File

@ -60,12 +60,21 @@ Options:
[env: QUANTIZE=]
Possible values:
- awq: 4 bit quantization. Requires a specific GTPQ quantized model: https://hf.co/models?search=awq. Should replace GPTQ models whereever possible because of the better latency
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from https://github.com/NetEase-FuXi/EETQ.git
- gptq: 4 bit quantization. Requires a specific GTPQ quantized model: https://hf.co/models?search=gptq. text-generation-inference will use exllama (faster) kernels whereever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
- bitsandbytes: Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-nf4: Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x, but it is known that the model will be much slower to run than the native f16
- bitsandbytes-fp4: Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better perplexity performance for your model
- fp8: [FP8](https://developer.nvidia.com/blog/nvidia-arm-and-intel-publish-fp8-specification-for-standardization-as-an-interchange-format-for-ai/) (e4m3) works on H100 and above. This dtype has native ops and should be the fastest if available. This is currently not the fastest because of local unpacking + padding to satisfy matrix multiplication limitations
```
## SPECULATE
```shell
--speculate <SPECULATE>
The number of input_ids to speculate on. If using a medusa model, the heads will be picked up automatically. Otherwise, it will use n-gram speculation, which is relatively free in terms of compute, but the speedup heavily depends on the task
[env: SPECULATE=]
```
## DTYPE
@ -120,23 +129,29 @@ Options:
[env: MAX_TOP_N_TOKENS=]
[default: 5]
```
## MAX_INPUT_TOKENS
```shell
--max-input-tokens <MAX_INPUT_TOKENS>
This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle. Default to min(max_position_embeddings - 1, 4095)
[env: MAX_INPUT_TOKENS=]
```
## MAX_INPUT_LENGTH
```shell
--max-input-length <MAX_INPUT_LENGTH>
This is the maximum allowed input length (expressed in number of tokens) for users. The larger this value, the longer prompt users can send which can impact the overall memory required to handle the load. Please note that some models have a finite range of sequence they can handle
Legacy version of [`Args::max_input_tokens`]
[env: MAX_INPUT_LENGTH=]
[default: 1024]
```
## MAX_TOTAL_TOKENS
```shell
--max-total-tokens <MAX_TOTAL_TOKENS>
This is the most important value to set as it defines the "memory budget" of running clients requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. with a value of `1512` users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be
This is the most important value to set as it defines the "memory budget" of running client requests. Clients will send input sequences and ask to generate `max_new_tokens` on top. With a value of `1512`, users can send either a prompt of `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for `1511` max_new_tokens. The larger this value, the larger amount each request will be in your RAM and the less effective batching can be. Default to min(max_position_embeddings, 4096)
[env: MAX_TOTAL_TOKENS=]
[default: 2048]
```
## WAITING_SERVED_RATIO
@ -153,10 +168,9 @@ Options:
## MAX_BATCH_PREFILL_TOKENS
```shell
--max-batch-prefill-tokens <MAX_BATCH_PREFILL_TOKENS>
Limits the number of tokens for the prefill operation. Since this operation take the most memory and is compute bound, it is interesting to limit the number of requests that can be sent
Limits the number of tokens for the prefill operation. Since this operation takes the most memory and is compute bound, it is interesting to limit the number of requests that can be sent. Default to `max_input_tokens + 50` to give a bit of room
[env: MAX_BATCH_PREFILL_TOKENS=]
[default: 4096]
```
## MAX_BATCH_TOTAL_TOKENS
@ -189,6 +203,22 @@ Options:
[env: MAX_WAITING_TOKENS=]
[default: 20]
```
## MAX_BATCH_SIZE
```shell
--max-batch-size <MAX_BATCH_SIZE>
Enforce a maximum number of requests per batch. Specific flag for hardware targets that do not support unpadded inference
[env: MAX_BATCH_SIZE=]
```
## CUDA_GRAPHS
```shell
--cuda-graphs <CUDA_GRAPHS>
Specify the batch sizes to compute cuda graphs for. Use "0" to disable. Default = "1,2,4,8,16,32"
[env: CUDA_GRAPHS=]
```
## HOSTNAME
```shell
@ -346,6 +376,22 @@ Options:
[env: NGROK_EDGE=]
```
## TOKENIZER_CONFIG_PATH
```shell
--tokenizer-config-path <TOKENIZER_CONFIG_PATH>
The path to the tokenizer config file. This path is used to load the tokenizer configuration which may include a `chat_template`. If not provided, the default config will be used from the model hub
[env: TOKENIZER_CONFIG_PATH=]
```
## DISABLE_GRAMMAR_SUPPORT
```shell
--disable-grammar-support
Disable outlines grammar constrained generation. This is a feature that allows you to generate text that follows a specific grammar
[env: DISABLE_GRAMMAR_SUPPORT=]
```
## ENV
```shell

View File

@ -2,19 +2,19 @@
TGI supports various LLM architectures (see full list [here](../supported_models)). If you wish to serve a model that is not one of the supported models, TGI will fallback to the `transformers` implementation of that model. This means you will be unable to use some of the features introduced by TGI, such as tensor-parallel sharding or flash attention. However, you can still get many benefits of TGI, such as continuous batching or streaming outputs.
You can serve these models using the same Docker command-line invocation as with fully supported models 👇
You can serve these models using the same Docker command-line invocation as with fully supported models 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
```
If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇
If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
```
Finally, if the model is not on Hugging Face Hub but stored locally, you can pass the path to the folder that contains your model like below 👇
```bash
# Make sure your model is in the $volume directory

View File

@ -1,6 +1,6 @@
# Preparing the Model
Text Generation Inference improves the model in several aspects.
## Quantization
@ -9,7 +9,7 @@ TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsan
## RoPE Scaling
RoPE scaling can be used to increase the sequence length of the model during inference without necessarily fine-tuning it. To enable RoPE scaling, simply pass `--rope-scaling`, `--max-input-length` and `--rope-factor` flags when running through CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide the maximum input length for extension.
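As a rough sketch (the scaling factor and length below are illustrative values, not recommendations), the flags can be combined in the usual Docker invocation:

```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id $model --rope-scaling dynamic --rope-factor 2.0 --max-input-length 4096
```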
<Tip>
@ -19,4 +19,4 @@ We recommend using `dynamic` RoPE scaling.
## Safetensors
[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there's no `pytorch` weights, TGI will convert the weights to `safetensors` format.

View File

@ -0,0 +1,31 @@
# Model safety.
[PyTorch uses pickle](https://pytorch.org/docs/master/generated/torch.load.html) by default, which means that for quite a long while
*every* model using that format could potentially execute unintended code simply by being loaded.
There is a big red warning on Python's page for pickle ([link](https://docs.python.org/3/library/pickle.html)), but for quite a while
it was ignored by the community. Now that AI/ML is used much more ubiquitously, we need to move away from this format.
Hugging Face is leading the effort here by creating a new format that contains pure data ([safetensors](https://github.com/huggingface/safetensors))
and by slowly but surely moving all the libraries to use it by default.
The move is intentionally slow in order to keep the impact of breaking changes on users as small as possible throughout.
# TGI 2.0
With the release of TGI 2.0, we take the opportunity of this major version increase to break backward compatibility for these PyTorch
models (since they are a huge security risk for anyone deploying them).
From now on, TGI will not automatically convert pickle files unless the `--trust-remote-code` flag is passed or `TRUST_REMOTE_CODE=true` is set in the environment variables.
This flag is already used for community-defined inference code, and is therefore quite representative of the level of confidence you are giving the model providers.
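For illustration, a minimal sketch of the two equivalent ways to opt in (the model id is a placeholder):

```
docker run .... --model-id $model --trust-remote-code
# or set TRUST_REMOTE_CODE=true in the environment
```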
If you want to use a model that uses pickle, but you still do not want to trust the authors entirely, we recommend making a conversion on our Space made for that purpose:
https://huggingface.co/spaces/safetensors/convert
This Space will create a PR on the original model, which you can use directly regardless of the merge status from the original authors. Just use
```
docker run .... --revision refs/pr/#ID # Or use REVISION=refs/pr/#ID in the environment
```

View File

@ -1,30 +1,30 @@
# Using TGI CLI
You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](./installation#install-cli).
You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](../installation#install-cli).
`text-generation-server` lets you download the model with `download-weights` command like below 👇
```bash
text-generation-server download-weights MODEL_HUB_ID
```
You can also use it to quantize models like below 👇
```bash
text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR
```
You can use `text-generation-launcher` to serve models.
```bash
text-generation-launcher --model-id MODEL_HUB_ID --port 8080
```
There are many options and parameters you can pass to `text-generation-launcher`. The documentation for CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running
```bash
text-generation-launcher --help
```
You can also find it hosted in this [Swagger UI](https://huggingface.github.io/text-generation-inference/).

View File

@ -1,12 +1,11 @@
# Flash Attention
Scaling the transformer architecture is heavily bottlenecked by the self-attention mechanism, which has quadratic time and memory complexity. Recent developments in accelerator hardware mainly focus on enhancing compute capacities and not memory and transferring data between hardware. This results in attention operation having a memory bottleneck. **Flash Attention** is an attention algorithm used to reduce this problem and scale transformer-based models more efficiently, enabling faster training and inference.
The standard attention mechanism uses High Bandwidth Memory (HBM) to store, read and write keys, queries and values. HBM is large in capacity but slow to access, whereas SRAM is smaller but much faster. In the standard attention implementation, the cost of loading and writing keys, queries, and values from HBM is high: they are loaded from HBM into the GPU's on-chip SRAM, a single step of the attention mechanism is performed, the result is written back to HBM, and this is repeated for every attention step. Instead, Flash Attention loads keys, queries, and values once, fuses the operations of the attention mechanism, and writes them back.
![Flash Attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png)
It is implemented for supported models. You can check out the complete list of models that support Flash Attention [here](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models); look for the models with the `flash` prefix.
You can learn more about Flash Attention by reading the paper in this [link](https://arxiv.org/abs/2205.14135).

View File

@ -0,0 +1,419 @@
# Guidance
Text Generation Inference (TGI) now supports [JSON and regex grammars](#grammar-and-constraints) and [tools and functions](#tools-and-functions) to help developer guide LLM responses to fit their needs.
These features are available starting from version `1.4.3`. They are accessible via the [text_generation](https://pypi.org/project/text-generation/) library and are compatible with OpenAI's client libraries. The following guide will walk you through the new features and how to use them!
## Quick Start
Before we jump into the deep end, ensure your system is using TGI version `1.4.3` or later to access all the features we're about to explore in this guide.
If you're not up to date, grab the latest version and let's get started!
## Table of Contents 📚
### Grammar and Constraints
- [The Grammar Parameter](#the-grammar-parameter): Shape your AI's responses with precision.
- [Constrain with Pydantic](#constrain-with-pydantic): Define a grammar using Pydantic models.
- [JSON Schema Integration](#json-schema-integration): Fine grain control over your requests via JSON schema.
- [Using the client](#using-the-client): Use TGI's client libraries to shape the AI's responses.
### Tools and Functions
- [The Tools Parameter](#the-tools-parameter): Enhance the AI's capabilities with predefined functions.
- [Via the client](#text-generation-inference-client): Use TGI's client libraries to interact with the Messages API and Tool functions.
- [OpenAI integration](#openai-integration): Use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
## Grammar and Constraints 🛣️
### The Grammar Parameter
In TGI `1.4.3`, we've introduced the grammar parameter, which allows you to specify the format of the response you want from the AI. This is a game-changer for those who need precise control over the AI's output.
Using curl, you can make a request to TGI's Messages API with the grammar parameter. This is the most primitive way to interact with the API and using [Pydantic](#constrain-with-pydantic) is recommended for ease of use and readability.
```json
curl localhost:3000/generate \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"inputs": "I saw a puppy a cat and a raccoon during my bike ride in the park",
"parameters": {
"repetition_penalty": 1.3,
"grammar": {
"type": "json",
"value": {
"properties": {
"location": {
"type": "string"
},
"activity": {
"type": "string"
},
"animals_seen": {
"type": "integer",
"minimum": 1,
"maximum": 5
},
"animals": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["location", "activity", "animals_seen", "animals"]
}
}
}
}'
// {"generated_text":"{ \n\n\"activity\": \"biking\",\n\"animals\": [\"puppy\",\"cat\",\"raccoon\"],\n\"animals_seen\": 3,\n\"location\": \"park\"\n}"}
```
A grammar can be defined using Pydantic models, JSON schema, or regular expressions. The AI will then generate a response that conforms to the specified grammar.
> Note: A grammar must compile to an intermediate representation to constrain the output. Grammar compilation is computationally expensive and may take a few seconds to complete on the first request. Subsequent requests will use the cached grammar and will be much faster.
### Constrain with Pydantic
Pydantic is a powerful library for data validation and settings management. It's the perfect tool for crafting a specific response format.
Using Pydantic models we can define a similar grammar as the previous example in a shorter and more readable way.
```python
import requests
from pydantic import BaseModel, conint
from typing import List
class Animals(BaseModel):
location: str
activity: str
animals_seen: conint(ge=1, le=5) # Constrained integer type
animals: List[str]
prompt = "convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park"
data = {
"inputs": prompt,
"parameters": {
"repetition_penalty": 1.3,
"grammar": {
"type": "json",
"value": Animals.schema()
}
}
}
headers = {
"Content-Type": "application/json",
}
response = requests.post(
'http://127.0.0.1:3000/generate',
headers=headers,
json=data
)
print(response.json())
# {'generated_text': '{ "activity": "bike riding", "animals": ["puppy","cat","raccoon"],"animals_seen": 3, "location":"park" }'}
```
### JSON Schema Integration
If Pydantic's not your style, go raw with direct JSON Schema integration. It's like having a conversation with the AI in its own language. This is similar to the first example but with programmatic control.
```python
import requests
json_schema = {
"properties": {
"location": {
"type": "string"
},
"activity": {
"type": "string"
},
"animals_seen": {
"type": "integer",
"minimum": 1,
"maximum": 5
},
"animals": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": ["location", "activity", "animals_seen", "animals"]
}
data = {
"inputs": "[INST]convert to JSON: I saw a puppy a cat and a raccoon during my bike ride in the park [/INST]",
"parameters": {
"max_new_tokens": 200,
"repetition_penalty": 1.3,
"grammar": {
"type": "json",
"value": json_schema
}
}
}
headers = {
"Content-Type": "application/json",
}
response = requests.post(
'http://127.0.0.1:3000/generate',
headers=headers,
json=data
)
print(response.json())
# {'generated_text': '{\n"activity": "biking",\n"animals": ["puppy","cat","raccoon"]\n , "animals_seen": 3,\n "location":"park"}'}
```
### Using the client
TGI provides a client library that makes it easy to send requests with all of the parameters we've discussed above. Here's an example of how to use the client to send a request with a grammar parameter.
```python
from text_generation import AsyncClient
from text_generation.types import GrammarType
# NOTE: tools defined above and removed for brevity
# Define an async function to encapsulate the async operation
async def main():
client = AsyncClient(base_url="http://localhost:3000")
# Use 'await' to wait for the async method 'chat' to complete
response = await client.generate(
"Whats Googles DNS",
max_new_tokens=10,
decoder_input_details=True,
seed=1,
grammar={
"type": GrammarType.Regex,
"value": "((25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)",
},
)
# Once the response is received, you can process it
print(response.generated_text)
# Ensure the main async function is run in the event loop
if __name__ == "__main__":
import asyncio
asyncio.run(main())
# 118.8.0.84
```
## Tools and Functions 🛠️
### The Tools Parameter
In addition to the grammar parameter, we've also introduced a set of tools and functions to help you get the most out of the Messages API.
Tools are a set of user defined functions that can be used in tandem with the chat functionality to enhance the AI's capabilities. You can use these tools to perform a variety of tasks, such as data manipulation, formatting, and more.
Functions, similar to grammars, are defined as JSON schemas and can be passed as part of the parameters to the Messages API.
```json
curl localhost:3000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
-d '{
"model": "tgi",
"messages": [
{
"role": "user",
"content": "What is the weather like in New York?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA"
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location."
}
},
"required": ["location", "format"]
}
}
}
],
"tool_choice": "get_current_weather"
}'
// {"id":"","object":"text_completion","created":1709051640,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":19,"total_tokens":176}}
```
<details>
<summary>Tools used in example below</summary>
```python
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
},
"required": ["location", "format"],
},
},
},
{
"type": "function",
"function": {
"name": "get_n_day_weather_forecast",
"description": "Get an N-day weather forecast",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
"num_days": {
"type": "integer",
"description": "The number of days to forecast",
},
},
"required": ["location", "format", "num_days"],
},
},
}
]
```
</details>
### Text Generation Inference Client
TGI provides a client library to interact with the Messages API and Tool functions. The client library is available in both synchronous and asynchronous versions.
```python
from text_generation import AsyncClient
# NOTE: tools defined above and removed for brevity
# Define an async function to encapsulate the async operation
async def main():
client = AsyncClient(base_url="http://localhost:3000")
# Use 'await' to wait for the async method 'chat' to complete
response = await client.chat(
max_tokens=100,
seed=1,
tools=tools,
presence_penalty=-1.1,
messages=[
{
"role": "system",
"content": "You're a helpful assistant! Answer the users question best you can.",
},
{
"role": "user",
"content": "What is the weather like in Brooklyn, New York?",
},
],
)
# Once the response is received, you can process it
print(response.choices[0].message.tool_calls)
# Ensure the main async function is run in the event loop
if __name__ == "__main__":
import asyncio
asyncio.run(main())
# {"id":"","object":"text_completion","created":1709051942,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-native","choices":[{"index":0,"message":{"role":"assistant","tool_calls":{"id":0,"type":"function","function":{"description":null,"name":"tools","parameters":{"format":"celsius","location":"New York"}}}},"logprobs":null,"finish_reason":"eos_token"}],"usage":{"prompt_tokens":157,"completion_tokens":20,"total_tokens":177}}
```
### OpenAI integration
TGI exposes an OpenAI-compatible API, which means you can use OpenAI's client libraries to interact with TGI's Messages API and Tool functions.
However, there are some minor differences in the API; for example, `tool_choice="auto"` will ALWAYS choose the tool for you. This is different from OpenAI's API, where `tool_choice="auto"` will choose a tool if the model thinks it's necessary.
```python
from openai import OpenAI
# Initialize the client, pointing it to one of the available models
client = OpenAI(
base_url="http://localhost:3000/v1",
api_key="_",
)
# NOTE: tools defined above and removed for brevity
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{
"role": "system",
"content": "Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.",
},
{
"role": "user",
"content": "What's the weather like the next 3 days in San Francisco, CA?",
},
],
tools=tools,
tool_choice="auto", # tool selected by model
max_tokens=500,
)
called = chat_completion.choices[0].message.tool_calls
print(called)
# {
# "id": 0,
# "type": "function",
# "function": {
# "description": None,
# "name": "tools",
# "parameters": {
# "format": "celsius",
# "location": "San Francisco, CA",
# "num_days": 3,
# },
# },
# }
```

View File

@ -4,20 +4,20 @@ TGI offers GPTQ and bits-and-bytes quantization to quantize large language model
## Quantization with GPTQ
GPTQ is a post-training quantization method to make the model smaller. It quantizes the layers by finding a compressed version of that weight, that will yield a minimum mean squared error like below 👇
Given a layer \\(l\\) with weight matrix \\(W_{l}\\) and layer input \\(X_{l}\\), find quantized weight \\(\\hat{W}_{l}\\):
$$\hat{W}_{l}^{*} = \mathrm{argmin}_{\hat{W}_{l}} \lVert W_{l}X_{l}-\hat{W}_{l}X_{l} \rVert^{2}_{2}$$
TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize gptq
```
Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
To quantize a given model using GPTQ with a calibration dataset, simply run
@ -41,7 +41,7 @@ You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.
bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing: weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
```bash
@ -50,7 +50,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes-nf4

View File

@ -1,7 +1,7 @@
# Safetensors
Safetensors is a model serialization format for deep learning models. It is [faster](https://huggingface.co/docs/safetensors/speed) and safer compared to other serialization formats like pickle (which is used under the hood in many deep learning libraries).
TGI depends on safetensors format mainly to enable [tensor parallelism sharding](./tensor_parallelism). For a given model repository during serving, TGI looks for safetensors weights. If there are no safetensors weights, TGI converts the PyTorch weights to safetensors format.
You can learn more about safetensors by reading the [safetensors documentation](https://huggingface.co/docs/safetensors/index).

View File

@ -0,0 +1,48 @@
## Speculation
Speculative decoding, assisted generation, Medusa, and others are a few different names for the same idea.
The idea is to generate tokens *before* the large model actually runs, and only *check* if those tokens were valid.
So you are doing *more* computation on your LLM, but if your guesses are correct you produce 1, 2, 3, etc. tokens in a single LLM pass. Since LLMs are usually memory bound (and not compute bound), provided your guesses are correct enough, this yields 2-3x faster inference (it can be much more for code-oriented tasks, for instance).
You can check a more [detailed explanation](https://huggingface.co/blog/assisted-generation).
Text-generation inference supports 2 main speculative methods:
- Medusa
- N-gram
### Medusa
Medusa is a [simple method](https://arxiv.org/abs/2401.10774) to create many tokens in a single pass using fine-tuned LM heads in addition to your existing models.
You can check a few existing fine-tunes for popular models:
- [text-generation-inference/gemma-7b-it-medusa](https://huggingface.co/text-generation-inference/gemma-7b-it-medusa)
- [text-generation-inference/Mixtral-8x7B-Instruct-v0.1-medusa](https://huggingface.co/text-generation-inference/Mixtral-8x7B-Instruct-v0.1-medusa)
- [text-generation-inference/Mistral-7B-Instruct-v0.2-medusa](https://huggingface.co/text-generation-inference/Mistral-7B-Instruct-v0.2-medusa)
In order to create your own medusa heads for your own fine-tune, you should check out the original medusa repo: [https://github.com/FasterDecoding/Medusa](https://github.com/FasterDecoding/Medusa)
In order to use medusa models in TGI, simply point to a medusa-enabled model, and everything will load automatically.
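For example, a minimal sketch using one of the fine-tunes listed above (the Docker invocation mirrors the one used elsewhere in these docs):

```bash
model=text-generation-inference/Mistral-7B-Instruct-v0.2-medusa
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data \
    ghcr.io/huggingface/text-generation-inference:latest --model-id $model
```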
### N-gram
If you don't have a medusa model, or don't have the resources to fine-tune, you can try to use `n-gram`.
N-gram speculation works by trying to find matching tokens in the previous sequence, and using those as speculation.
This is an extremely simple method that works best for code or highly repetitive text. It might not be beneficial if the speculation misses too often.
In order to enable n-gram speculation, simply use
`--speculate 2` in your flags.
[Details about the flag](https://huggingface.co/docs/text-generation-inference/basic_tutorials/launcher#speculate)
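For instance, a rough sketch of enabling it at launch (the model id is a placeholder):

```bash
text-generation-launcher --model-id $model --speculate 2
```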

View File

@ -5,12 +5,12 @@
Token streaming is the mode in which the server returns the tokens one by one as the model generates them. This enables showing progressive generations to the user rather than waiting for the whole generation. Streaming is an essential aspect of the end-user experience as it reduces latency, one of the most critical aspects of a smooth experience.
<div class="flex justify-center">
    <img
        class="block dark:hidden"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual_360.gif"
/>
    <img
        class="hidden dark:block"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual-dark_360.gif"
/>
</div>
@ -25,14 +25,14 @@ With token streaming, the server can start returning the tokens one by one befor
For example, a system can generate 100 tokens per second. If the system generates 1000 tokens, with the non-streaming setup, users need to wait 10 seconds to get results. On the other hand, with the streaming setup, users get initial results immediately, and although end-to-end latency will be the same, they can see half of the generation after five seconds. Below you can see an interactive demo that shows non-streaming vs streaming side-by-side. Click **generate** below.
<div class="block dark:hidden">
    <iframe
src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
width="850"
height="350"
></iframe>
</div>
<div class="hidden dark:block">
    <iframe
src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
width="850"
height="350"
@ -43,7 +43,7 @@ For example, a system can generate 100 tokens per second. If the system generate
### Streaming with Python
To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate over the response.
```python
from huggingface_hub import InferenceClient
@ -116,7 +116,7 @@ curl -N 127.0.0.1:8080/generate_stream \
First, we need to install the `@huggingface/inference` library.
`npm install @huggingface/inference`
If you're using the free Inference API, you can use `HfInference`. If you're using inference endpoints, you can use `HfInferenceEndpoint`.
We can create a `HfInferenceEndpoint` providing our endpoint URL and credential.
@ -129,7 +129,7 @@ const hf = new HfInferenceEndpoint('https://YOUR_ENDPOINT.endpoints.huggingface.
const prompt = 'What can you do in Nuremberg, Germany? Give me 3 Tips'
const stream = hf.textGenerationStream({ inputs: prompt })
for await (const r of stream) {
// yield the generated token
process.stdout.write(r.token.text)
}

View File

@ -1,6 +1,6 @@
# Tensor Parallelism
Tensor parallelism is a technique used to fit a large model in multiple GPUs. For example, when multiplying the input tensors with the first weight tensor, the matrix multiplication is equivalent to splitting the weight tensor column-wise, multiplying each column with the input separately, and then concatenating the separate outputs. These outputs are then transferred from the GPUs and concatenated together to get the final result, like below 👇
![Image courtesy of Anton Lozkhov](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/TP.png)
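To make the equivalence concrete, here is a minimal PyTorch sketch (purely illustrative; it is not how TGI implements sharding):

```python
import torch

x = torch.randn(2, 8)    # input activations
w = torch.randn(8, 6)    # full weight matrix

# Split the weight column-wise across two "GPUs"
w0, w1 = w.chunk(2, dim=1)

# Each shard multiplies the same input by its columns; the outputs are then concatenated
out = torch.cat([x @ w0, x @ w1], dim=1)

assert torch.allclose(out, x @ w)
```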

View File

@ -4,7 +4,7 @@ This section explains how to install the CLI tool as well as installing TGI from
## Install CLI
You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters.
To install the CLI, you need to first clone the TGI repository and then run `make`.
@ -23,7 +23,7 @@ BUILD_EXTENSIONS=True make install
Before you start, you will need to setup your environment, and install Text Generation Inference. Text Generation Inference is tested on **Python 3.9+**.
Text Generation Inference is available on PyPI, conda and GitHub.
To install and launch locally, first [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using conda:

175
docs/source/messages_api.md Normal file
View File

@ -0,0 +1,175 @@
# Messages API
Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.
#### Table of Contents
- [Making a Request](#making-a-request)
- [Streaming](#streaming)
- [Synchronous](#synchronous)
- [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
- [Cloud Providers](#cloud-providers)
- [Amazon SageMaker](#amazon-sagemaker)
## Making a Request
You can make a request to TGI's Messages API using `curl`. Here's an example:
```bash
curl localhost:3000/v1/chat/completions \
-X POST \
-d '{
"model": "tgi",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What is deep learning?"
}
],
"stream": true,
"max_tokens": 20
}' \
-H 'Content-Type: application/json'
```
## Streaming
You can also use OpenAI's Python client library to make a streaming request. Here's how:
```python
from openai import OpenAI
# init the client but point it to TGI
client = OpenAI(
base_url="http://localhost:3000/v1",
api_key="-"
)
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{"role": "system", "content": "You are a helpful assistant." },
{"role": "user", "content": "What is deep learning?"}
],
stream=True
)
# iterate and print stream
for message in chat_completion:
print(message)
```
## Synchronous
If you prefer to make a synchronous request, you can do so like this:
```python
from openai import OpenAI
# init the client but point it to TGI
client = OpenAI(
base_url="http://localhost:3000/v1",
api_key="-"
)
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{"role": "system", "content": "You are a helpful assistant." },
{"role": "user", "content": "What is deep learning?"}
],
stream=False
)
print(chat_completion)
```
## Hugging Face Inference Endpoints
The Messages API is integrated with [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated).
Every endpoint that uses "Text Generation Inference" with an LLM that has a chat template can now be used. Below is an example of how to use IE with TGI using OpenAI's Python client library:
> **Note:** Make sure to replace `base_url` with your endpoint URL and to include `v1/` at the end of the URL. The `api_key` should be replaced with your Hugging Face API key.
```python
from openai import OpenAI
# init the client but point it to TGI
client = OpenAI(
# replace with your endpoint url, make sure to include "v1/" at the end
base_url="https://vlzz10eq3fol3429.us-east-1.aws.endpoints.huggingface.cloud/v1/",
# replace with your API key
api_key="hf_XXX"
)
chat_completion = client.chat.completions.create(
model="tgi",
messages=[
{"role": "system", "content": "You are a helpful assistant." },
{"role": "user", "content": "What is deep learning?"}
],
stream=True
)
# iterate and print stream
for message in chat_completion:
print(message.choices[0].delta.content, end="")
```
## Cloud Providers
TGI can be deployed on various cloud providers for scalable and robust text generation. One such provider is Amazon SageMaker, which has recently added support for TGI. Here's how you can deploy TGI on Amazon SageMaker:
### Amazon SageMaker
To enable the Messages API in Amazon SageMaker, you need to set the environment variable `MESSAGES_API_ENABLED=true`.
This will modify the `/invocations` route to accept Messages dictionaries consisting of role and content. See the example below on how to deploy Llama with the new Messages API.
```python
import json
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
try:
role = sagemaker.get_execution_role()
except ValueError:
iam = boto3.client('iam')
role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']
# Hub Model configuration. https://huggingface.co/models
hub = {
'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta',
'SM_NUM_GPUS': json.dumps(1),
  'MESSAGES_API_ENABLED': json.dumps(True)
}
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"),
env=hub,
role=role,
)
# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
initial_instance_count=1,
instance_type="ml.g5.2xlarge",
container_startup_health_check_timeout=300,
)
# send request
predictor.predict({
"messages": [
{"role": "system", "content": "You are a helpful assistant." },
{"role": "user", "content": "What is deep learning?"}
]
})
```

View File

@ -2,23 +2,27 @@
The easiest way of getting started is using the official Docker container. Install Docker following [their installation instructions](https://docs.docker.com/get-docker/).
Let's say you want to deploy [Falcon-7B Instruct](https://huggingface.co/tiiuae/falcon-7b-instruct) model with TGI. Here is an example on how to do that:
Let's say you want to deploy [teknium/OpenHermes-2.5-Mistral-7B](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B) model with TGI. Here is an example on how to do that:
```bash
model=tiiuae/falcon-7b-instruct
model=teknium/OpenHermes-2.5-Mistral-7B
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.2 --model-id $model
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model
```
<Tip warning={true}>
To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) . We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
To use TGI on RoCm-enabled AMD GPUs (only MI210 and MI250 are tested), please use the image `ghcr.io/huggingface/text-generation-inference:1.2+rocm` instead. For details about the usage on RoCm, please refer to the [Supported Hardware section](./supported_models#supported-hardware) and [AMD documentation](https://rocm.docs.amd.com/en/latest/deploy/docker.html).
To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.
</Tip>
TGI also supports ROCm-enabled AMD GPUs (only MI210 and MI250 are tested), details are available in the [Supported Hardware section](./supported_models#supported-hardware) and [AMD documentation](https://rocm.docs.amd.com/en/latest/deploy/docker.html). To launch TGI on ROCm GPUs, please use instead:
```bash
docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4-rocm --model-id $model
```
Once TGI is running, you can use the `generate` endpoint by doing requests. To learn more about how to query the endpoints, check the [Consuming TGI](./basic_tutorials/consuming_tgi) section, where we show examples with utility libraries and UIs. Below you can see a simple snippet to query the endpoint.
@ -49,7 +53,7 @@ print(response.json())
```js
async function query() {
const response = await fetch(
'http://127.0.0.1:8080/generate',
{
method: 'POST',
headers: { 'Content-Type': 'application/json'},
@ -87,7 +91,7 @@ curl 127.0.0.1:8080/generate \
To see all possible deploy flags and options, you can use the `--help` flag. It's possible to configure the number of shards, quantization, generation parameters, and more.
```bash
docker run ghcr.io/huggingface/text-generation-inference:1.2 --help
docker run ghcr.io/huggingface/text-generation-inference:1.4 --help
```
</Tip>

View File

@ -19,7 +19,11 @@ The following models are optimized and can be served with TGI, which uses custom
- [MPT](https://huggingface.co/mosaicml/mpt-30b)
- [Llama V2](https://huggingface.co/meta-llama)
- [Code Llama](https://huggingface.co/codellama)
- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
- [Mistral](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
- [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
- [Phi](https://huggingface.co/microsoft/phi-2)
- [Idefics](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct) (Multimodal)
- [Llava-next](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) (Multimodal)
If the above list lacks the model you would like to serve, depending on the model's pipeline type, you can try to initialize and serve the model anyways to see how well it performs, but performance isn't guaranteed for non-optimized models:
@ -39,9 +43,13 @@ text-generation-launcher --model-id <PATH-TO-LOCAL-BLOOM>
## Supported Hardware
TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 11.8+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
TGI optimized models are supported on NVIDIA [A100](https://www.nvidia.com/en-us/data-center/a100/), [A10G](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) and [T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) GPUs with CUDA 12.2+. Note that you have to install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to use it. For other NVIDIA GPUs, continuous batching will still apply, but some operations like flash attention and paged attention will not be executed.
TGI also has support of RoCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention and flash attention v2 support. The following features are missing from the RoCm version of TGI: quantization and flash [layer norm kernel](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm).
TGI also supports ROCm-enabled AMD Instinct MI210 and MI250 GPUs, with paged attention, GPTQ quantization, and flash attention v2 support. The following features are currently not supported in the ROCm version of TGI, and support may be extended in the future:
* Loading [AWQ](https://huggingface.co/docs/transformers/quantization#awq) checkpoints.
* Flash [layer norm kernel](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm)
* Kernel for sliding window attention (Mistral)
TGI is also supported on the following AI hardware accelerators:
- *Habana first-gen Gaudi and Gaudi2:* check out this [example](https://github.com/huggingface/optimum-habana/tree/main/text-generation-inference) how to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
- *Habana first-gen Gaudi and Gaudi2:* check out this [repository](https://github.com/huggingface/tgi-gaudi) to serve models with TGI on Gaudi and Gaudi2 with [Optimum Habana](https://huggingface.co/docs/optimum/habana/index)
- *AWS Inferentia2:* check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.

View File

@ -16,7 +16,17 @@ from syrupy.extensions.json import JSONSnapshotExtension
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from text_generation import AsyncClient
from text_generation.types import Response, Details, InputToken, Token, BestOfSequence
from text_generation.types import (
Response,
Details,
InputToken,
Token,
BestOfSequence,
Grammar,
ChatComplete,
ChatCompletionChunk,
ChatCompletionComplete,
)
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
@ -25,6 +35,7 @@ DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
class ResponseComparator(JSONSnapshotExtension):
rtol = 0.2
def serialize(
self,
data,
@ -32,8 +43,16 @@ class ResponseComparator(JSONSnapshotExtension):
exclude=None,
matcher=None,
):
if (
isinstance(data, Response)
or isinstance(data, ChatComplete)
or isinstance(data, ChatCompletionChunk)
or isinstance(data, ChatCompletionComplete)
):
data = data.model_dump()
if isinstance(data, List):
data = [d.dict() for d in data]
data = [d.model_dump() for d in data]
data = self._filter(
data=data, depth=0, path=(), exclude=exclude, matcher=matcher
@ -48,6 +67,15 @@ class ResponseComparator(JSONSnapshotExtension):
) -> bool:
def convert_data(data):
data = json.loads(data)
if isinstance(data, Dict) and "choices" in data:
choices = data["choices"]
if (
isinstance(choices, List)
and len(choices) >= 1
and "delta" in choices[0]
):
return ChatCompletionChunk(**data)
return ChatComplete(**data)
if isinstance(data, Dict):
return Response(**data)
@ -69,7 +97,9 @@ class ResponseComparator(JSONSnapshotExtension):
prefill_token.id == other.id
and prefill_token.text == other.text
and (
math.isclose(prefill_token.logprob, other.logprob, rel_tol=self.rtol)
math.isclose(
prefill_token.logprob, other.logprob, rel_tol=self.rtol
)
if prefill_token.logprob is not None
else prefill_token.logprob == other.logprob
)
@ -131,6 +161,16 @@ class ResponseComparator(JSONSnapshotExtension):
)
)
def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
return (
response.choices[0].message.content == other.choices[0].message.content
)
def eq_chat_complete_chunk(
response: ChatCompletionChunk, other: ChatCompletionChunk
) -> bool:
return response.choices[0].delta.content == other.choices[0].delta.content
def eq_response(response: Response, other: Response) -> bool:
return response.generated_text == other.generated_text and eq_details(
response.details, other.details
@ -144,6 +184,19 @@ class ResponseComparator(JSONSnapshotExtension):
if not isinstance(snapshot_data, List):
snapshot_data = [snapshot_data]
if isinstance(serialized_data[0], ChatComplete):
return len(snapshot_data) == len(serialized_data) and all(
[eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
)
if isinstance(serialized_data[0], ChatCompletionChunk):
return len(snapshot_data) == len(serialized_data) and all(
[
eq_chat_complete_chunk(r, o)
for r, o in zip(serialized_data, snapshot_data)
]
)
return len(snapshot_data) == len(serialized_data) and all(
[eq_response(r, o) for r, o in zip(serialized_data, snapshot_data)]
)
@ -153,6 +206,7 @@ class GenerousResponseComparator(ResponseComparator):
# Needed for GPTQ with exllama which has serious numerical fluctuations.
rtol = 0.75
class LauncherHandle:
def __init__(self, port: int):
self.client = AsyncClient(f"http://localhost:{port}")
@ -198,6 +252,7 @@ class ProcessLauncherHandle(LauncherHandle):
def response_snapshot(snapshot):
return snapshot.use_extension(ResponseComparator)
@pytest.fixture
def generous_response_snapshot(snapshot):
return snapshot.use_extension(GenerousResponseComparator)
@ -219,7 +274,11 @@ def launcher(event_loop):
quantize: Optional[str] = None,
trust_remote_code: bool = False,
use_flash_attention: bool = True,
dtype: Optional[str] = None
disable_grammar_support: bool = False,
dtype: Optional[str] = None,
revision: Optional[str] = None,
max_input_length: Optional[int] = None,
max_total_tokens: Optional[int] = None,
):
port = random.randint(8000, 10_000)
master_port = random.randint(10_000, 20_000)
@ -242,6 +301,8 @@ def launcher(event_loop):
env = os.environ
if disable_grammar_support:
args.append("--disable-grammar-support")
if num_shard is not None:
args.extend(["--num-shard", str(num_shard)])
if quantize is not None:
@ -250,8 +311,17 @@ def launcher(event_loop):
if dtype is not None:
args.append("--dtype")
args.append(dtype)
if revision is not None:
args.append("--revision")
args.append(revision)
if trust_remote_code:
args.append("--trust-remote-code")
if max_input_length:
args.append("--max-input-length")
args.append(str(max_input_length))
if max_total_tokens:
args.append("--max-total-tokens")
args.append(str(max_total_tokens))
env["LOG_LEVEL"] = "info,text_generation_router=debug"
@ -282,12 +352,18 @@ def launcher(event_loop):
quantize: Optional[str] = None,
trust_remote_code: bool = False,
use_flash_attention: bool = True,
dtype: Optional[str] = None
disable_grammar_support: bool = False,
dtype: Optional[str] = None,
revision: Optional[str] = None,
max_input_length: Optional[int] = None,
max_total_tokens: Optional[int] = None,
):
port = random.randint(8000, 10_000)
args = ["--model-id", model_id, "--env"]
if disable_grammar_support:
args.append("--disable-grammar-support")
if num_shard is not None:
args.extend(["--num-shard", str(num_shard)])
if quantize is not None:
@ -296,8 +372,17 @@ def launcher(event_loop):
if dtype is not None:
args.append("--dtype")
args.append(dtype)
if revision is not None:
args.append("--revision")
args.append(revision)
if trust_remote_code:
args.append("--trust-remote-code")
if max_input_length:
args.append("--max-input-length")
args.append(str(max_input_length))
if max_total_tokens:
args.append("--max-total-tokens")
args.append(str(max_total_tokens))
client = docker.from_env()
@ -312,7 +397,9 @@ def launcher(event_loop):
gpu_count = num_shard if num_shard is not None else 1
env = {"LOG_LEVEL": "info,text_generation_router=debug"}
env = {
"LOG_LEVEL": "info,text_generation_router=debug",
}
if not use_flash_attention:
env["USE_FLASH_ATTENTION"] = "false"
@ -335,7 +422,7 @@ def launcher(event_loop):
],
volumes=volumes,
ports={"80/tcp": port},
shm_size="1G"
shm_size="1G",
)
yield ContainerLauncherHandle(client, container.name, port)
@ -362,11 +449,22 @@ def launcher(event_loop):
@pytest.fixture(scope="module")
def generate_load():
async def generate_load_inner(
client: AsyncClient, prompt: str, max_new_tokens: int, n: int
client: AsyncClient,
prompt: str,
max_new_tokens: int,
n: int,
seed: Optional[int] = None,
grammar: Optional[Grammar] = None,
stop_sequences: Optional[List[str]] = None,
) -> List[Response]:
futures = [
client.generate(
prompt, max_new_tokens=max_new_tokens, decoder_input_details=True
prompt,
max_new_tokens=max_new_tokens,
decoder_input_details=True,
seed=seed,
grammar=grammar,
stop_sequences=stop_sequences,
)
for _ in range(n)
]

Binary file not shown.


View File

@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 1736,
"logprob": -2.09375,
"special": false,
"text": " form"
},
{
"id": 109,
"logprob": -1.8671875,
"special": false,
"text": "\n\n"
},
{
"id": 651,
"logprob": -2.4375,
"special": false,
"text": "The"
},
{
"id": 2121,
"logprob": -1.8203125,
"special": false,
"text": " test"
},
{
"id": 3853,
"logprob": -0.23242188,
"special": false,
"text": " request"
},
{
"id": 1736,
"logprob": -0.08544922,
"special": false,
"text": " form"
},
{
"id": 603,
"logprob": -0.9375,
"special": false,
"text": " is"
},
{
"id": 1671,
"logprob": -1.671875,
"special": false,
"text": " used"
},
{
"id": 577,
"logprob": -0.40429688,
"special": false,
"text": " to"
},
{
"id": 3853,
"logprob": -1.1875,
"special": false,
"text": " request"
}
],
"top_tokens": null
},
"generated_text": " form\n\nThe test request form is used to request"
}

View File

@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"seed": 0,
"tokens": [
{
"id": 7539,
"logprob": -0.73046875,
"special": false,
"text": " forms"
},
{
"id": 708,
"logprob": 0.0,
"special": false,
"text": " are"
},
{
"id": 671,
"logprob": -1.703125,
"special": false,
"text": " an"
},
{
"id": 8727,
"logprob": 0.0,
"special": false,
"text": " essential"
},
{
"id": 1702,
"logprob": 0.0,
"special": false,
"text": " part"
},
{
"id": 576,
"logprob": 0.0,
"special": false,
"text": " of"
},
{
"id": 573,
"logprob": 0.0,
"special": false,
"text": " the"
},
{
"id": 11859,
"logprob": -1.6953125,
"special": false,
"text": " lab"
},
{
"id": 2185,
"logprob": -1.3125,
"special": false,
"text": " process"
},
{
"id": 578,
"logprob": -1.5,
"special": false,
"text": " and"
}
],
"top_tokens": null
},
"generated_text": "Test request forms are an essential part of the lab process and"
}

View File

@ -0,0 +1,358 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 1736,
"logprob": -2.09375,
"special": false,
"text": " form"
},
{
"id": 109,
"logprob": -1.9140625,
"special": false,
"text": "\n\n"
},
{
"id": 651,
"logprob": -2.453125,
"special": false,
"text": "The"
},
{
"id": 2121,
"logprob": -1.8984375,
"special": false,
"text": " test"
},
{
"id": 3853,
"logprob": -0.23535156,
"special": false,
"text": " request"
},
{
"id": 1736,
"logprob": -0.091308594,
"special": false,
"text": " form"
},
{
"id": 603,
"logprob": -0.96875,
"special": false,
"text": " is"
},
{
"id": 1671,
"logprob": -1.6484375,
"special": false,
"text": " used"
},
{
"id": 577,
"logprob": -0.43164062,
"special": false,
"text": " to"
},
{
"id": 3853,
"logprob": -1.2421875,
"special": false,
"text": " request"
}
],
"top_tokens": null
},
"generated_text": " form\n\nThe test request form is used to request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 1736,
"logprob": -2.09375,
"special": false,
"text": " form"
},
{
"id": 109,
"logprob": -1.9140625,
"special": false,
"text": "\n\n"
},
{
"id": 651,
"logprob": -2.453125,
"special": false,
"text": "The"
},
{
"id": 2121,
"logprob": -1.8984375,
"special": false,
"text": " test"
},
{
"id": 3853,
"logprob": -0.23535156,
"special": false,
"text": " request"
},
{
"id": 1736,
"logprob": -0.091308594,
"special": false,
"text": " form"
},
{
"id": 603,
"logprob": -0.96875,
"special": false,
"text": " is"
},
{
"id": 1671,
"logprob": -1.6484375,
"special": false,
"text": " used"
},
{
"id": 577,
"logprob": -0.43164062,
"special": false,
"text": " to"
},
{
"id": 3853,
"logprob": -1.2421875,
"special": false,
"text": " request"
}
],
"top_tokens": null
},
"generated_text": " form\n\nThe test request form is used to request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 1736,
"logprob": -2.09375,
"special": false,
"text": " form"
},
{
"id": 109,
"logprob": -1.9140625,
"special": false,
"text": "\n\n"
},
{
"id": 651,
"logprob": -2.453125,
"special": false,
"text": "The"
},
{
"id": 2121,
"logprob": -1.8984375,
"special": false,
"text": " test"
},
{
"id": 3853,
"logprob": -0.23535156,
"special": false,
"text": " request"
},
{
"id": 1736,
"logprob": -0.091308594,
"special": false,
"text": " form"
},
{
"id": 603,
"logprob": -0.96875,
"special": false,
"text": " is"
},
{
"id": 1671,
"logprob": -1.6484375,
"special": false,
"text": " used"
},
{
"id": 577,
"logprob": -0.43164062,
"special": false,
"text": " to"
},
{
"id": 3853,
"logprob": -1.2421875,
"special": false,
"text": " request"
}
],
"top_tokens": null
},
"generated_text": " form\n\nThe test request form is used to request"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 2015,
"logprob": -10.0,
"text": "Test"
},
{
"id": 3853,
"logprob": -10.875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 1736,
"logprob": -2.09375,
"special": false,
"text": " form"
},
{
"id": 109,
"logprob": -1.9140625,
"special": false,
"text": "\n\n"
},
{
"id": 651,
"logprob": -2.453125,
"special": false,
"text": "The"
},
{
"id": 2121,
"logprob": -1.8984375,
"special": false,
"text": " test"
},
{
"id": 3853,
"logprob": -0.23535156,
"special": false,
"text": " request"
},
{
"id": 1736,
"logprob": -0.091308594,
"special": false,
"text": " form"
},
{
"id": 603,
"logprob": -0.96875,
"special": false,
"text": " is"
},
{
"id": 1671,
"logprob": -1.6484375,
"special": false,
"text": " used"
},
{
"id": 577,
"logprob": -0.43164062,
"special": false,
"text": " to"
},
{
"id": 3853,
"logprob": -1.2421875,
"special": false,
"text": " request"
}
],
"top_tokens": null
},
"generated_text": " form\n\nThe test request form is used to request"
}
]


@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -13.90625,
"text": "Test"
},
{
"id": 2009,
"logprob": -12.328125,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 13,
"logprob": -2.0566406,
"special": false,
"text": "\n"
},
{
"id": 13,
"logprob": -1.5253906,
"special": false,
"text": "\n"
},
{
"id": 29902,
"logprob": -2.7578125,
"special": false,
"text": "I"
},
{
"id": 4966,
"logprob": -1.9033203,
"special": false,
"text": " hope"
},
{
"id": 445,
"logprob": -0.5019531,
"special": false,
"text": " this"
},
{
"id": 6911,
"logprob": -0.21264648,
"special": false,
"text": " helps"
},
{
"id": 29991,
"logprob": -0.5991211,
"special": false,
"text": "!"
},
{
"id": 2803,
"logprob": -0.37475586,
"special": false,
"text": " Let"
},
{
"id": 592,
"logprob": -0.018463135,
"special": false,
"text": " me"
},
{
"id": 1073,
"logprob": -0.0008597374,
"special": false,
"text": " know"
}
],
"top_tokens": null
},
"generated_text": "\n\nI hope this helps! Let me know"
}


@ -0,0 +1,274 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 30,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 5235,
"logprob": -10.0625,
"text": "info"
},
{
"id": 29901,
"logprob": -3.2324219,
"text": ":"
},
{
"id": 13260,
"logprob": -10.625,
"text": "dav"
},
{
"id": 333,
"logprob": -0.08276367,
"text": "id"
},
{
"id": 8753,
"logprob": -7.5273438,
"text": "hol"
},
{
"id": 17559,
"logprob": -3.8476562,
"text": "tz"
},
{
"id": 763,
"logprob": -10.140625,
"text": "like"
},
{
"id": 10697,
"logprob": -10.1953125,
"text": "trees"
},
{
"id": 322,
"logprob": -2.5742188,
"text": "and"
},
{
"id": 756,
"logprob": -7.4882812,
"text": "has"
},
{
"id": 1023,
"logprob": -5.0507812,
"text": "two"
},
{
"id": 274,
"logprob": -5.3164062,
"text": "c"
},
{
"id": 1446,
"logprob": -0.6694336,
"text": "ats"
},
{
"id": 29889,
"logprob": -0.9995117,
"text": "."
},
{
"id": 29871,
"logprob": -4.2421875,
"text": ""
}
],
"seed": null,
"tokens": [
{
"id": 6377,
"logprob": -0.14916992,
"special": false,
"text": "{\""
},
{
"id": 29888,
"logprob": -0.13598633,
"special": false,
"text": "f"
},
{
"id": 12935,
"logprob": -0.017669678,
"special": false,
"text": "irs"
},
{
"id": 29873,
"logprob": -0.00085639954,
"special": false,
"text": "t"
},
{
"id": 1170,
"logprob": -0.0054016113,
"special": false,
"text": "Name"
},
{
"id": 4710,
"logprob": -0.13549805,
"special": false,
"text": "\":\""
},
{
"id": 19504,
"logprob": -0.8852539,
"special": false,
"text": "David"
},
{
"id": 3284,
"logprob": -0.16394043,
"special": false,
"text": "\",\""
},
{
"id": 29882,
"logprob": -0.08862305,
"special": false,
"text": "h"
},
{
"id": 711,
"logprob": -0.66259766,
"special": false,
"text": "ob"
},
{
"id": 1609,
"logprob": -5.51939e-05,
"special": false,
"text": "by"
},
{
"id": 4710,
"logprob": -0.23120117,
"special": false,
"text": "\":\""
},
{
"id": 29911,
"logprob": -2.3730469,
"special": false,
"text": "T"
},
{
"id": 11003,
"logprob": -0.032104492,
"special": false,
"text": "rees"
},
{
"id": 3284,
"logprob": -0.22021484,
"special": false,
"text": "\",\""
},
{
"id": 4230,
"logprob": -0.06726074,
"special": false,
"text": "last"
},
{
"id": 1170,
"logprob": -0.003501892,
"special": false,
"text": "Name"
},
{
"id": 4710,
"logprob": -0.0045661926,
"special": false,
"text": "\":\""
},
{
"id": 29950,
"logprob": -0.12512207,
"special": false,
"text": "H"
},
{
"id": 14339,
"logprob": -0.009552002,
"special": false,
"text": "olt"
},
{
"id": 29920,
"logprob": -0.00042438507,
"special": false,
"text": "z"
},
{
"id": 3284,
"logprob": -0.11651611,
"special": false,
"text": "\",\""
},
{
"id": 29876,
"logprob": -0.29736328,
"special": false,
"text": "n"
},
{
"id": 398,
"logprob": -0.003030777,
"special": false,
"text": "um"
},
{
"id": 29907,
"logprob": -0.3774414,
"special": false,
"text": "C"
},
{
"id": 1446,
"logprob": -0.0003130436,
"special": false,
"text": "ats"
},
{
"id": 1115,
"logprob": -0.0021514893,
"special": false,
"text": "\":"
},
{
"id": 29906,
"logprob": -0.071899414,
"special": false,
"text": "2"
},
{
"id": 29913,
"logprob": -0.018997192,
"special": false,
"text": "}"
},
{
"id": 2,
"logprob": 0.0,
"special": true,
"text": "</s>"
}
],
"top_tokens": null
},
"generated_text": "{\"firstName\":\"David\",\"hobby\":\"Trees\",\"lastName\":\"Holtz\",\"numCats\":2}"
}


@ -0,0 +1,478 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1024,
"logprob": -10.578125,
"text": "name"
},
{
"id": 29901,
"logprob": -3.0332031,
"text": ":"
},
{
"id": 13260,
"logprob": -9.171875,
"text": "dav"
},
{
"id": 333,
"logprob": -0.04257202,
"text": "id"
},
{
"id": 29889,
"logprob": -2.4785156,
"text": "."
},
{
"id": 4876,
"logprob": -10.7890625,
"text": "email"
},
{
"id": 29901,
"logprob": -0.32495117,
"text": ":"
},
{
"id": 259,
"logprob": -9.4921875,
"text": " "
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -0.7709961,
"special": false,
"text": "1"
},
{
"id": 29906,
"logprob": -0.33740234,
"special": false,
"text": "2"
},
{
"id": 29941,
"logprob": -0.00995636,
"special": false,
"text": "3"
},
{
"id": 29946,
"logprob": -0.64208984,
"special": false,
"text": "4"
},
{
"id": 29945,
"logprob": -0.4970703,
"special": false,
"text": "5"
},
{
"id": 29953,
"logprob": -0.46533203,
"special": false,
"text": "6"
},
{
"id": 29992,
"logprob": -0.5336914,
"special": false,
"text": "@"
},
{
"id": 21980,
"logprob": -0.5361328,
"special": false,
"text": "gmail"
},
{
"id": 29889,
"logprob": -0.00088739395,
"special": false,
"text": "."
},
{
"id": 510,
"logprob": -0.0022735596,
"special": false,
"text": "com"
}
],
"top_tokens": null
},
"generated_text": "123456@gmail.com"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1024,
"logprob": -10.578125,
"text": "name"
},
{
"id": 29901,
"logprob": -3.03125,
"text": ":"
},
{
"id": 13260,
"logprob": -9.171875,
"text": "dav"
},
{
"id": 333,
"logprob": -0.04244995,
"text": "id"
},
{
"id": 29889,
"logprob": -2.4863281,
"text": "."
},
{
"id": 4876,
"logprob": -10.7890625,
"text": "email"
},
{
"id": 29901,
"logprob": -0.32714844,
"text": ":"
},
{
"id": 259,
"logprob": -9.4921875,
"text": " "
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -0.7685547,
"special": false,
"text": "1"
},
{
"id": 29906,
"logprob": -0.33666992,
"special": false,
"text": "2"
},
{
"id": 29941,
"logprob": -0.01008606,
"special": false,
"text": "3"
},
{
"id": 29946,
"logprob": -0.64160156,
"special": false,
"text": "4"
},
{
"id": 29945,
"logprob": -0.5,
"special": false,
"text": "5"
},
{
"id": 29953,
"logprob": -0.46557617,
"special": false,
"text": "6"
},
{
"id": 29992,
"logprob": -0.5341797,
"special": false,
"text": "@"
},
{
"id": 21980,
"logprob": -0.5361328,
"special": false,
"text": "gmail"
},
{
"id": 29889,
"logprob": -0.00088739395,
"special": false,
"text": "."
},
{
"id": 510,
"logprob": -0.0022907257,
"special": false,
"text": "com"
}
],
"top_tokens": null
},
"generated_text": "123456@gmail.com"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1024,
"logprob": -10.578125,
"text": "name"
},
{
"id": 29901,
"logprob": -3.0332031,
"text": ":"
},
{
"id": 13260,
"logprob": -9.171875,
"text": "dav"
},
{
"id": 333,
"logprob": -0.04257202,
"text": "id"
},
{
"id": 29889,
"logprob": -2.4785156,
"text": "."
},
{
"id": 4876,
"logprob": -10.7890625,
"text": "email"
},
{
"id": 29901,
"logprob": -0.32495117,
"text": ":"
},
{
"id": 259,
"logprob": -9.4921875,
"text": " "
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -0.7709961,
"special": false,
"text": "1"
},
{
"id": 29906,
"logprob": -0.33740234,
"special": false,
"text": "2"
},
{
"id": 29941,
"logprob": -0.00995636,
"special": false,
"text": "3"
},
{
"id": 29946,
"logprob": -0.64208984,
"special": false,
"text": "4"
},
{
"id": 29945,
"logprob": -0.4970703,
"special": false,
"text": "5"
},
{
"id": 29953,
"logprob": -0.46533203,
"special": false,
"text": "6"
},
{
"id": 29992,
"logprob": -0.5336914,
"special": false,
"text": "@"
},
{
"id": 21980,
"logprob": -0.5361328,
"special": false,
"text": "gmail"
},
{
"id": 29889,
"logprob": -0.00088739395,
"special": false,
"text": "."
},
{
"id": 510,
"logprob": -0.0022735596,
"special": false,
"text": "com"
}
],
"top_tokens": null
},
"generated_text": "123456@gmail.com"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1024,
"logprob": -10.578125,
"text": "name"
},
{
"id": 29901,
"logprob": -3.0332031,
"text": ":"
},
{
"id": 13260,
"logprob": -9.171875,
"text": "dav"
},
{
"id": 333,
"logprob": -0.04257202,
"text": "id"
},
{
"id": 29889,
"logprob": -2.4785156,
"text": "."
},
{
"id": 4876,
"logprob": -10.7890625,
"text": "email"
},
{
"id": 29901,
"logprob": -0.32495117,
"text": ":"
},
{
"id": 259,
"logprob": -9.4921875,
"text": " "
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -0.7709961,
"special": false,
"text": "1"
},
{
"id": 29906,
"logprob": -0.33740234,
"special": false,
"text": "2"
},
{
"id": 29941,
"logprob": -0.00995636,
"special": false,
"text": "3"
},
{
"id": 29946,
"logprob": -0.64208984,
"special": false,
"text": "4"
},
{
"id": 29945,
"logprob": -0.4970703,
"special": false,
"text": "5"
},
{
"id": 29953,
"logprob": -0.46533203,
"special": false,
"text": "6"
},
{
"id": 29992,
"logprob": -0.5336914,
"special": false,
"text": "@"
},
{
"id": 21980,
"logprob": -0.5361328,
"special": false,
"text": "gmail"
},
{
"id": 29889,
"logprob": -0.00088739395,
"special": false,
"text": "."
},
{
"id": 510,
"logprob": -0.0022735596,
"special": false,
"text": "com"
}
],
"top_tokens": null
},
"generated_text": "123456@gmail.com"
}
]


@ -0,0 +1,109 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 806,
"logprob": -11.890625,
"text": "Wh"
},
{
"id": 1446,
"logprob": -3.6699219,
"text": "ats"
},
{
"id": 2921,
"logprob": -7.8203125,
"text": "Go"
},
{
"id": 468,
"logprob": -8.0703125,
"text": "og"
},
{
"id": 793,
"logprob": -2.1875,
"text": "les"
},
{
"id": 16332,
"logprob": -9.7109375,
"text": "DNS"
}
],
"seed": null,
"tokens": [
{
"id": 29946,
"logprob": -1.4765625,
"special": false,
"text": "4"
},
{
"id": 29906,
"logprob": -0.9199219,
"special": false,
"text": "2"
},
{
"id": 29889,
"logprob": 0.0,
"special": false,
"text": "."
},
{
"id": 29896,
"logprob": -1.1367188,
"special": false,
"text": "1"
},
{
"id": 29889,
"logprob": -1.4648438,
"special": false,
"text": "."
},
{
"id": 29896,
"logprob": -0.40722656,
"special": false,
"text": "1"
},
{
"id": 29889,
"logprob": -0.17419434,
"special": false,
"text": "."
},
{
"id": 29896,
"logprob": -0.20251465,
"special": false,
"text": "1"
},
{
"id": 29900,
"logprob": -1.5527344,
"special": false,
"text": "0"
},
{
"id": 29896,
"logprob": -1.3710938,
"special": false,
"text": "1"
}
],
"top_tokens": null
},
"generated_text": "42.1.1.101"
}


@ -0,0 +1,73 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -0.7685547,
"special": false,
"text": "1"
},
{
"id": 29906,
"logprob": -0.33666992,
"special": false,
"text": "2"
},
{
"id": 29941,
"logprob": -0.009979248,
"special": false,
"text": "3"
},
{
"id": 29946,
"logprob": -0.64208984,
"special": false,
"text": "4"
},
{
"id": 29945,
"logprob": -0.4970703,
"special": false,
"text": "5"
},
{
"id": 29953,
"logprob": -0.46533203,
"special": false,
"text": "6"
},
{
"id": 29992,
"logprob": -0.5336914,
"special": false,
"text": "@"
},
{
"id": 21980,
"logprob": -0.53759766,
"special": false,
"text": "gmail"
},
{
"id": 29889,
"logprob": -0.0008878708,
"special": false,
"text": "."
},
{
"id": 510,
"logprob": -0.002275467,
"special": false,
"text": "com"
}
],
"top_tokens": null
},
"generated_text": "123456@gmail.com"
}


@ -11,78 +11,79 @@
},
{
"id": 4321,
"logprob": -9.59375,
"logprob": -9.7890625,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"logprob": -9.625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3867188,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8183594,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6367188,
"logprob": -2.3359375,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"logprob": -1.8779297,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6542969,
"logprob": -1.2744141,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056121826,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.01600647,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.87939453,
"logprob": -1.6933594,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7529297,
"logprob": -1.4648438,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.2980957,
"logprob": -0.15600586,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.8027344,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.23022461,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.0069885254,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.02218628,
"special": false,
"text": "\n"
}
]
],
"top_tokens": null
},
"generated_text": "_uri\nTest request_uri\nTest request"
"generated_text": "\nTest request\nTest request\nTest request\n"
}


@ -11,12 +11,12 @@
},
{
"id": 4321,
"logprob": -9.6015625,
"logprob": -9.84375,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"logprob": -9.6015625,
"text": "request"
}
],
@ -24,13 +24,13 @@
"tokens": [
{
"id": 29899,
"logprob": -1.1640625,
"logprob": -1.5625,
"special": false,
"text": "-"
},
{
"id": 1454,
"logprob": -0.07543945,
"logprob": -0.20410156,
"special": false,
"text": "for"
},
@ -54,19 +54,19 @@
},
{
"id": 396,
"logprob": -0.2956543,
"logprob": -0.27685547,
"special": false,
"text": " #"
},
{
"id": 29906,
"logprob": -0.52734375,
"logprob": -0.4970703,
"special": false,
"text": "2"
},
{
"id": 29900,
"logprob": -0.6899414,
"logprob": -0.80615234,
"special": false,
"text": "0"
},
@ -77,12 +77,13 @@
"text": "1"
},
{
"id": 29946,
"logprob": -1.5068359,
"id": 29955,
"logprob": -1.0751953,
"special": false,
"text": "4"
"text": "7"
}
]
],
"top_tokens": null
},
"generated_text": "Test request-for-comment: #2014"
"generated_text": "Test request-for-comment: #2017"
}


@ -12,80 +12,81 @@
},
{
"id": 4321,
"logprob": -9.6015625,
"logprob": -9.828125,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.671875,
"logprob": -9.609375,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3828125,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8105469,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"logprob": -2.3300781,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0546875,
"logprob": -1.8740234,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6513672,
"logprob": -1.2646484,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056365967,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016082764,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.87841797,
"logprob": -1.7158203,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7548828,
"logprob": -1.4667969,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29711914,
"logprob": -0.15344238,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.81591797,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.22973633,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.007045746,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.021957397,
"special": false,
"text": "\n"
}
]
],
"top_tokens": null
},
"generated_text": "_uri\nTest request_uri\nTest request"
"generated_text": "\nTest request\nTest request\nTest request\n"
},
{
"details": {
@ -100,80 +101,81 @@
},
{
"id": 4321,
"logprob": -9.6015625,
"logprob": -9.84375,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"logprob": -9.59375,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3828125,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.828125,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6386719,
"logprob": -2.3378906,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"logprob": -1.8779297,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.6542969,
"logprob": -1.2636719,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.055877686,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016021729,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8769531,
"logprob": -1.6992188,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7583008,
"logprob": -1.4589844,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29833984,
"logprob": -0.15344238,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.79052734,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.22937012,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.007041931,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.022140503,
"special": false,
"text": "\n"
}
]
],
"top_tokens": null
},
"generated_text": "_uri\nTest request_uri\nTest request"
"generated_text": "\nTest request\nTest request\nTest request\n"
},
{
"details": {
@ -188,80 +190,81 @@
},
{
"id": 4321,
"logprob": -9.6015625,
"logprob": -9.84375,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.671875,
"logprob": -9.609375,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3847656,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8144531,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"logprob": -2.3261719,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0527344,
"logprob": -1.8730469,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.65478516,
"logprob": -1.2587891,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.056243896,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.016143799,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8808594,
"logprob": -1.6894531,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.75341797,
"logprob": -1.46875,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.2956543,
"logprob": -0.1541748,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.80322266,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.22912598,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.0070495605,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.021606445,
"special": false,
"text": "\n"
}
]
],
"top_tokens": null
},
"generated_text": "_uri\nTest request_uri\nTest request"
"generated_text": "\nTest request\nTest request\nTest request\n"
},
{
"details": {
@ -276,79 +279,80 @@
},
{
"id": 4321,
"logprob": -9.6015625,
"logprob": -9.84375,
"text": "Test"
},
{
"id": 2009,
"logprob": -9.6640625,
"logprob": -9.6015625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29918,
"logprob": -2.3769531,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -2.8183594,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -1.6396484,
"logprob": -2.3320312,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -1.0546875,
"logprob": -1.875,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.65478516,
"logprob": -1.2646484,
"special": false,
"text": " request"
},
{
"id": 29918,
"logprob": -0.05557251,
"special": false,
"text": "_"
},
{
"id": 5338,
"logprob": -0.01612854,
"special": false,
"text": "uri"
},
{
"id": 13,
"logprob": -0.8730469,
"logprob": -1.6884766,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.7519531,
"logprob": -1.4589844,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.29785156,
"logprob": -0.15185547,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.79833984,
"special": false,
"text": "\n"
},
{
"id": 3057,
"logprob": -0.22827148,
"special": false,
"text": "Test"
},
{
"id": 2009,
"logprob": -0.006996155,
"special": false,
"text": " request"
},
{
"id": 13,
"logprob": -0.021560669,
"special": false,
"text": "\n"
}
]
],
"top_tokens": null
},
"generated_text": "_uri\nTest request_uri\nTest request"
"generated_text": "\nTest request\nTest request\nTest request\n"
}
]


@ -0,0 +1,98 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 338,
"logprob": -10.0078125,
"text": "is"
},
{
"id": 21784,
"logprob": -15.515625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -2.8847656,
"text": "Learning"
},
{
"id": 29973,
"logprob": -4.140625,
"text": "?"
}
],
"seed": 0,
"tokens": [
{
"id": 13,
"logprob": -1.1582031,
"special": false,
"text": "\n"
},
{
"id": 2772,
"logprob": -0.23083496,
"special": false,
"text": "De"
},
{
"id": 1022,
"logprob": 0.0,
"special": false,
"text": "ep"
},
{
"id": 6509,
"logprob": 0.0,
"special": false,
"text": " learning"
},
{
"id": 29892,
"logprob": -0.61816406,
"special": false,
"text": ","
},
{
"id": 607,
"logprob": -0.7089844,
"special": false,
"text": " which"
},
{
"id": 508,
"logprob": -1.7724609,
"special": false,
"text": " can"
},
{
"id": 367,
"logprob": 0.0,
"special": false,
"text": " be"
},
{
"id": 5545,
"logprob": 0.0,
"special": false,
"text": " considered"
},
{
"id": 408,
"logprob": -0.3869629,
"special": false,
"text": " as"
}
]
},
"generated_text": "What is Deep Learning?\nDeep learning, which can be considered as"
}


@ -0,0 +1,414 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -10.734375,
"text": "What"
},
{
"id": 338,
"logprob": -1.5488281,
"text": "is"
},
{
"id": 21784,
"logprob": -9.2890625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.2753906,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.48046875,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 13,
"logprob": -1.1845703,
"special": false,
"text": "\n"
},
{
"id": 2772,
"logprob": -0.5727539,
"special": false,
"text": "De"
},
{
"id": 1022,
"logprob": -0.00010967255,
"special": false,
"text": "ep"
},
{
"id": 6509,
"logprob": -0.1239624,
"special": false,
"text": " learning"
},
{
"id": 338,
"logprob": -0.04510498,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.018295288,
"special": false,
"text": " a"
},
{
"id": 11306,
"logprob": -0.45922852,
"special": false,
"text": " subset"
},
{
"id": 310,
"logprob": -0.00020992756,
"special": false,
"text": " of"
},
{
"id": 4933,
"logprob": -0.0046539307,
"special": false,
"text": " machine"
},
{
"id": 6509,
"logprob": -0.00025844574,
"special": false,
"text": " learning"
}
]
},
"generated_text": "\nDeep learning is a subset of machine learning"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -10.734375,
"text": "What"
},
{
"id": 338,
"logprob": -1.5488281,
"text": "is"
},
{
"id": 21784,
"logprob": -9.2890625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.2724609,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.47729492,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 13,
"logprob": -1.1826172,
"special": false,
"text": "\n"
},
{
"id": 2772,
"logprob": -0.56689453,
"special": false,
"text": "De"
},
{
"id": 1022,
"logprob": -0.000108003616,
"special": false,
"text": "ep"
},
{
"id": 6509,
"logprob": -0.1239624,
"special": false,
"text": " learning"
},
{
"id": 338,
"logprob": -0.044433594,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.018295288,
"special": false,
"text": " a"
},
{
"id": 11306,
"logprob": -0.45922852,
"special": false,
"text": " subset"
},
{
"id": 310,
"logprob": -0.0002104044,
"special": false,
"text": " of"
},
{
"id": 4933,
"logprob": -0.004711151,
"special": false,
"text": " machine"
},
{
"id": 6509,
"logprob": -0.00025892258,
"special": false,
"text": " learning"
}
]
},
"generated_text": "\nDeep learning is a subset of machine learning"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -10.734375,
"text": "What"
},
{
"id": 338,
"logprob": -1.5488281,
"text": "is"
},
{
"id": 21784,
"logprob": -9.2890625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.2724609,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.47729492,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 13,
"logprob": -1.1826172,
"special": false,
"text": "\n"
},
{
"id": 2772,
"logprob": -0.56689453,
"special": false,
"text": "De"
},
{
"id": 1022,
"logprob": -0.000108003616,
"special": false,
"text": "ep"
},
{
"id": 6509,
"logprob": -0.1239624,
"special": false,
"text": " learning"
},
{
"id": 338,
"logprob": -0.044433594,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.018295288,
"special": false,
"text": " a"
},
{
"id": 11306,
"logprob": -0.45922852,
"special": false,
"text": " subset"
},
{
"id": 310,
"logprob": -0.0002104044,
"special": false,
"text": " of"
},
{
"id": 4933,
"logprob": -0.004711151,
"special": false,
"text": " machine"
},
{
"id": 6509,
"logprob": -0.00025892258,
"special": false,
"text": " learning"
}
]
},
"generated_text": "\nDeep learning is a subset of machine learning"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -10.734375,
"text": "What"
},
{
"id": 338,
"logprob": -1.5488281,
"text": "is"
},
{
"id": 21784,
"logprob": -9.2890625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.2724609,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.47729492,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 13,
"logprob": -1.1826172,
"special": false,
"text": "\n"
},
{
"id": 2772,
"logprob": -0.56689453,
"special": false,
"text": "De"
},
{
"id": 1022,
"logprob": -0.000108003616,
"special": false,
"text": "ep"
},
{
"id": 6509,
"logprob": -0.1239624,
"special": false,
"text": " learning"
},
{
"id": 338,
"logprob": -0.044433594,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.018295288,
"special": false,
"text": " a"
},
{
"id": 11306,
"logprob": -0.45922852,
"special": false,
"text": " subset"
},
{
"id": 310,
"logprob": -0.0002104044,
"special": false,
"text": " of"
},
{
"id": 4933,
"logprob": -0.004711151,
"special": false,
"text": " machine"
},
{
"id": 6509,
"logprob": -0.00025892258,
"special": false,
"text": " learning"
}
]
},
"generated_text": "\nDeep learning is a subset of machine learning"
}
]


@ -0,0 +1,103 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 1724,
"logprob": -10.734375,
"text": "What"
},
{
"id": 338,
"logprob": -1.5488281,
"text": "is"
},
{
"id": 21784,
"logprob": -9.2890625,
"text": "Deep"
},
{
"id": 29257,
"logprob": -1.2753906,
"text": "Learning"
},
{
"id": 29973,
"logprob": -0.48046875,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 13,
"logprob": -1.1845703,
"special": false,
"text": "\n"
},
{
"id": 2772,
"logprob": -0.5727539,
"special": false,
"text": "De"
},
{
"id": 1022,
"logprob": -0.000108122826,
"special": false,
"text": "ep"
},
{
"id": 6509,
"logprob": -0.1239624,
"special": false,
"text": " learning"
},
{
"id": 338,
"logprob": -0.044433594,
"special": false,
"text": " is"
},
{
"id": 263,
"logprob": -0.01852417,
"special": false,
"text": " a"
},
{
"id": 11306,
"logprob": -0.45922852,
"special": false,
"text": " subset"
},
{
"id": 310,
"logprob": -0.0002104044,
"special": false,
"text": " of"
},
{
"id": 4933,
"logprob": -0.004787445,
"special": false,
"text": " machine"
},
{
"id": 6509,
"logprob": -0.00026226044,
"special": false,
"text": " learning"
}
]
},
"generated_text": "\nDeep learning is a subset of machine learning"
}


@ -0,0 +1,84 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 14402,
"logprob": null,
"text": "Test"
},
{
"id": 2581,
"logprob": -11.6171875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 25,
"logprob": -2.3203125,
"special": false,
"text": ":"
},
{
"id": 1391,
"logprob": -0.98779297,
"special": false,
"text": " {"
},
{
"id": 25927,
"logprob": -0.76660156,
"special": false,
"text": "request"
},
{
"id": 92,
"logprob": -0.7246094,
"special": false,
"text": "}"
},
{
"id": 4943,
"logprob": -0.41333008,
"special": false,
"text": "\")"
},
{
"id": 198,
"logprob": -0.11785889,
"special": false,
"text": "\n"
},
{
"id": 50280,
"logprob": -0.97265625,
"special": false,
"text": " "
},
{
"id": 26209,
"logprob": -1.4414062,
"special": false,
"text": "response"
},
{
"id": 796,
"logprob": -0.0569458,
"special": false,
"text": " ="
},
{
"id": 2116,
"logprob": -1.1533203,
"special": false,
"text": " self"
}
],
"top_tokens": null
},
"generated_text": ": {request}\")\n response = self"
}


@ -0,0 +1,60 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "stop_sequence",
"generated_tokens": 6,
"prefill": [
{
"id": 14402,
"logprob": null,
"text": "Test"
},
{
"id": 2581,
"logprob": -11.6171875,
"text": " request"
}
],
"seed": 0,
"tokens": [
{
"id": 284,
"logprob": -0.19421387,
"special": false,
"text": " to"
},
{
"id": 3758,
"logprob": -0.62597656,
"special": false,
"text": " send"
},
{
"id": 1366,
"logprob": -0.87060547,
"special": false,
"text": " data"
},
{
"id": 625,
"logprob": -0.88427734,
"special": false,
"text": " over"
},
{
"id": 257,
"logprob": -1.0830078,
"special": false,
"text": " a"
},
{
"id": 3127,
"logprob": -1.9462891,
"special": false,
"text": " network"
}
],
"top_tokens": null
},
"generated_text": "Test request to send data over a network"
}


@ -0,0 +1,338 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 14402,
"logprob": null,
"text": "Test"
},
{
"id": 2581,
"logprob": -11.6171875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 25,
"logprob": -2.3203125,
"special": false,
"text": ":"
},
{
"id": 1391,
"logprob": -0.98779297,
"special": false,
"text": " {"
},
{
"id": 25927,
"logprob": -0.7729492,
"special": false,
"text": "request"
},
{
"id": 92,
"logprob": -0.7241211,
"special": false,
"text": "}"
},
{
"id": 4943,
"logprob": -0.4091797,
"special": false,
"text": "\")"
},
{
"id": 198,
"logprob": -0.119018555,
"special": false,
"text": "\n"
},
{
"id": 50280,
"logprob": -0.9707031,
"special": false,
"text": " "
},
{
"id": 26209,
"logprob": -1.4414062,
"special": false,
"text": "response"
},
{
"id": 796,
"logprob": -0.056854248,
"special": false,
"text": " ="
},
{
"id": 2116,
"logprob": -1.1533203,
"special": false,
"text": " self"
}
],
"top_tokens": null
},
"generated_text": ": {request}\")\n response = self"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 14402,
"logprob": null,
"text": "Test"
},
{
"id": 2581,
"logprob": -11.6171875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 25,
"logprob": -2.3203125,
"special": false,
"text": ":"
},
{
"id": 1391,
"logprob": -0.98779297,
"special": false,
"text": " {"
},
{
"id": 25927,
"logprob": -0.7729492,
"special": false,
"text": "request"
},
{
"id": 92,
"logprob": -0.7241211,
"special": false,
"text": "}"
},
{
"id": 4943,
"logprob": -0.4091797,
"special": false,
"text": "\")"
},
{
"id": 198,
"logprob": -0.119018555,
"special": false,
"text": "\n"
},
{
"id": 50280,
"logprob": -0.9707031,
"special": false,
"text": " "
},
{
"id": 26209,
"logprob": -1.4414062,
"special": false,
"text": "response"
},
{
"id": 796,
"logprob": -0.056854248,
"special": false,
"text": " ="
},
{
"id": 2116,
"logprob": -1.1533203,
"special": false,
"text": " self"
}
],
"top_tokens": null
},
"generated_text": ": {request}\")\n response = self"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 14402,
"logprob": null,
"text": "Test"
},
{
"id": 2581,
"logprob": -11.6171875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 25,
"logprob": -2.3203125,
"special": false,
"text": ":"
},
{
"id": 1391,
"logprob": -0.98779297,
"special": false,
"text": " {"
},
{
"id": 25927,
"logprob": -0.7729492,
"special": false,
"text": "request"
},
{
"id": 92,
"logprob": -0.7241211,
"special": false,
"text": "}"
},
{
"id": 4943,
"logprob": -0.4091797,
"special": false,
"text": "\")"
},
{
"id": 198,
"logprob": -0.119018555,
"special": false,
"text": "\n"
},
{
"id": 50280,
"logprob": -0.9707031,
"special": false,
"text": " "
},
{
"id": 26209,
"logprob": -1.4414062,
"special": false,
"text": "response"
},
{
"id": 796,
"logprob": -0.056854248,
"special": false,
"text": " ="
},
{
"id": 2116,
"logprob": -1.1533203,
"special": false,
"text": " self"
}
],
"top_tokens": null
},
"generated_text": ": {request}\")\n response = self"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 14402,
"logprob": null,
"text": "Test"
},
{
"id": 2581,
"logprob": -11.6171875,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 25,
"logprob": -2.3203125,
"special": false,
"text": ":"
},
{
"id": 1391,
"logprob": -0.98779297,
"special": false,
"text": " {"
},
{
"id": 25927,
"logprob": -0.7729492,
"special": false,
"text": "request"
},
{
"id": 92,
"logprob": -0.7241211,
"special": false,
"text": "}"
},
{
"id": 4943,
"logprob": -0.4091797,
"special": false,
"text": "\")"
},
{
"id": 198,
"logprob": -0.119018555,
"special": false,
"text": "\n"
},
{
"id": 50280,
"logprob": -0.9707031,
"special": false,
"text": " "
},
{
"id": 26209,
"logprob": -1.4414062,
"special": false,
"text": "response"
},
{
"id": 796,
"logprob": -0.056854248,
"special": false,
"text": " ="
},
{
"id": 2116,
"logprob": -1.1533203,
"special": false,
"text": " self"
}
],
"top_tokens": null
},
"generated_text": ": {request}\")\n response = self"
}
]


@ -0,0 +1,84 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2271,
"logprob": null,
"text": "Test"
},
{
"id": 1681,
"logprob": -8.8515625,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 198,
"logprob": -2.9023438,
"special": false,
"text": "\n"
},
{
"id": 2,
"logprob": -2.9160156,
"special": false,
"text": "#"
},
{
"id": 4230,
"logprob": -3.1035156,
"special": false,
"text": " Create"
},
{
"id": 264,
"logprob": -1.1025391,
"special": false,
"text": " a"
},
{
"id": 1681,
"logprob": -1.6914062,
"special": false,
"text": " request"
},
{
"id": 198,
"logprob": -1.1953125,
"special": false,
"text": "\n"
},
{
"id": 2035,
"logprob": -1.3203125,
"special": false,
"text": "request"
},
{
"id": 284,
"logprob": -0.13537598,
"special": false,
"text": " ="
},
{
"id": 7388,
"logprob": -1.2402344,
"special": false,
"text": " requests"
},
{
"id": 670,
"logprob": -0.2775879,
"special": false,
"text": ".get"
}
],
"top_tokens": null
},
"generated_text": "\n# Create a request\nrequest = requests.get"
}


@ -0,0 +1,84 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2271,
"logprob": null,
"text": "Test"
},
{
"id": 1681,
"logprob": -8.8515625,
"text": " request"
}
],
"seed": 0,
"tokens": [
{
"id": 311,
"logprob": -1.4277344,
"special": false,
"text": " to"
},
{
"id": 279,
"logprob": -0.65478516,
"special": false,
"text": " the"
},
{
"id": 2473,
"logprob": -1.8300781,
"special": false,
"text": " service"
},
{
"id": 382,
"logprob": -0.75,
"special": false,
"text": ".\n\n"
},
{
"id": 286,
"logprob": -0.11621094,
"special": false,
"text": " "
},
{
"id": 549,
"logprob": 0.0,
"special": false,
"text": " :"
},
{
"id": 689,
"logprob": -0.48608398,
"special": false,
"text": "return"
},
{
"id": 25,
"logprob": 0.0,
"special": false,
"text": ":"
},
{
"id": 5949,
"logprob": -0.5756836,
"special": false,
"text": " Response"
},
{
"id": 504,
"logprob": -0.24499512,
"special": false,
"text": " from"
}
],
"top_tokens": null
},
"generated_text": "Test request to the service.\n\n :return: Response from"
}


@ -0,0 +1,338 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2271,
"logprob": null,
"text": "Test"
},
{
"id": 1681,
"logprob": -8.8515625,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 198,
"logprob": -2.9023438,
"special": false,
"text": "\n"
},
{
"id": 2,
"logprob": -2.9140625,
"special": false,
"text": "#"
},
{
"id": 4230,
"logprob": -3.1054688,
"special": false,
"text": " Create"
},
{
"id": 264,
"logprob": -1.0966797,
"special": false,
"text": " a"
},
{
"id": 1681,
"logprob": -1.6914062,
"special": false,
"text": " request"
},
{
"id": 198,
"logprob": -1.1923828,
"special": false,
"text": "\n"
},
{
"id": 2035,
"logprob": -1.3193359,
"special": false,
"text": "request"
},
{
"id": 284,
"logprob": -0.13586426,
"special": false,
"text": " ="
},
{
"id": 7388,
"logprob": -1.2412109,
"special": false,
"text": " requests"
},
{
"id": 670,
"logprob": -0.2775879,
"special": false,
"text": ".get"
}
],
"top_tokens": null
},
"generated_text": "\n# Create a request\nrequest = requests.get"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2271,
"logprob": null,
"text": "Test"
},
{
"id": 1681,
"logprob": -8.8515625,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 198,
"logprob": -2.9023438,
"special": false,
"text": "\n"
},
{
"id": 2,
"logprob": -2.9140625,
"special": false,
"text": "#"
},
{
"id": 4230,
"logprob": -3.1054688,
"special": false,
"text": " Create"
},
{
"id": 264,
"logprob": -1.0966797,
"special": false,
"text": " a"
},
{
"id": 1681,
"logprob": -1.6914062,
"special": false,
"text": " request"
},
{
"id": 198,
"logprob": -1.1923828,
"special": false,
"text": "\n"
},
{
"id": 2035,
"logprob": -1.3193359,
"special": false,
"text": "request"
},
{
"id": 284,
"logprob": -0.13586426,
"special": false,
"text": " ="
},
{
"id": 7388,
"logprob": -1.2412109,
"special": false,
"text": " requests"
},
{
"id": 670,
"logprob": -0.2775879,
"special": false,
"text": ".get"
}
],
"top_tokens": null
},
"generated_text": "\n# Create a request\nrequest = requests.get"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2271,
"logprob": null,
"text": "Test"
},
{
"id": 1681,
"logprob": -8.8515625,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 198,
"logprob": -2.9023438,
"special": false,
"text": "\n"
},
{
"id": 2,
"logprob": -2.9140625,
"special": false,
"text": "#"
},
{
"id": 4230,
"logprob": -3.1054688,
"special": false,
"text": " Create"
},
{
"id": 264,
"logprob": -1.0966797,
"special": false,
"text": " a"
},
{
"id": 1681,
"logprob": -1.6914062,
"special": false,
"text": " request"
},
{
"id": 198,
"logprob": -1.1923828,
"special": false,
"text": "\n"
},
{
"id": 2035,
"logprob": -1.3193359,
"special": false,
"text": "request"
},
{
"id": 284,
"logprob": -0.13586426,
"special": false,
"text": " ="
},
{
"id": 7388,
"logprob": -1.2412109,
"special": false,
"text": " requests"
},
{
"id": 670,
"logprob": -0.2775879,
"special": false,
"text": ".get"
}
],
"top_tokens": null
},
"generated_text": "\n# Create a request\nrequest = requests.get"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2271,
"logprob": null,
"text": "Test"
},
{
"id": 1681,
"logprob": -8.8515625,
"text": " request"
}
],
"seed": null,
"tokens": [
{
"id": 198,
"logprob": -2.9023438,
"special": false,
"text": "\n"
},
{
"id": 2,
"logprob": -2.9140625,
"special": false,
"text": "#"
},
{
"id": 4230,
"logprob": -3.1054688,
"special": false,
"text": " Create"
},
{
"id": 264,
"logprob": -1.0966797,
"special": false,
"text": " a"
},
{
"id": 1681,
"logprob": -1.6914062,
"special": false,
"text": " request"
},
{
"id": 198,
"logprob": -1.1923828,
"special": false,
"text": "\n"
},
{
"id": 2035,
"logprob": -1.3193359,
"special": false,
"text": "request"
},
{
"id": 284,
"logprob": -0.13586426,
"special": false,
"text": " ="
},
{
"id": 7388,
"logprob": -1.2412109,
"special": false,
"text": " requests"
},
{
"id": 670,
"logprob": -0.2775879,
"special": false,
"text": ".get"
}
],
"top_tokens": null
},
"generated_text": "\n# Create a request\nrequest = requests.get"
}
]


@ -0,0 +1,94 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 610,
"logprob": null,
"text": "def"
},
{
"id": 1489,
"logprob": -5.2617188,
"text": " print"
},
{
"id": 100,
"logprob": -0.38476562,
"text": "_"
},
{
"id": 7670,
"logprob": -7.640625,
"text": "hello"
}
],
"seed": null,
"tokens": [
{
"id": 2284,
"logprob": -0.92626953,
"special": false,
"text": "():"
},
{
"id": 303,
"logprob": -0.40844727,
"special": false,
"text": "\n "
},
{
"id": 1489,
"logprob": -0.27905273,
"special": false,
"text": " print"
},
{
"id": 459,
"logprob": -0.6118164,
"special": false,
"text": "(\""
},
{
"id": 8302,
"logprob": -0.68652344,
"special": false,
"text": "Hello"
},
{
"id": 10914,
"logprob": -1.4619141,
"special": false,
"text": " World"
},
{
"id": 16013,
"logprob": -0.7993164,
"special": false,
"text": "!\")"
},
{
"id": 222,
"logprob": -0.63134766,
"special": false,
"text": "\n"
},
{
"id": 222,
"logprob": -0.23278809,
"special": false,
"text": "\n"
},
{
"id": 610,
"logprob": -1.2294922,
"special": false,
"text": "def"
}
],
"top_tokens": null
},
"generated_text": "():\n print(\"Hello World!\")\n\ndef"
}


@ -0,0 +1,394 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 60,
"prefill": [
{
"id": 610,
"logprob": null,
"text": "def"
},
{
"id": 1489,
"logprob": -5.2617188,
"text": " print"
},
{
"id": 100,
"logprob": -0.38476562,
"text": "_"
},
{
"id": 7670,
"logprob": -7.640625,
"text": "hello"
}
],
"seed": 0,
"tokens": [
{
"id": 2284,
"logprob": -0.296875,
"special": false,
"text": "():"
},
{
"id": 303,
"logprob": 0.0,
"special": false,
"text": "\n "
},
{
"id": 1489,
"logprob": 0.0,
"special": false,
"text": " print"
},
{
"id": 459,
"logprob": 0.0,
"special": false,
"text": "(\""
},
{
"id": 8302,
"logprob": -0.28125,
"special": false,
"text": "Hello"
},
{
"id": 10914,
"logprob": -0.79248047,
"special": false,
"text": " World"
},
{
"id": 16013,
"logprob": -0.61816406,
"special": false,
"text": "!\")"
},
{
"id": 222,
"logprob": -0.0619812,
"special": false,
"text": "\n"
},
{
"id": 222,
"logprob": 0.0,
"special": false,
"text": "\n"
},
{
"id": 610,
"logprob": -0.4091797,
"special": false,
"text": "def"
},
{
"id": 1489,
"logprob": 0.0,
"special": false,
"text": " print"
},
{
"id": 100,
"logprob": 0.0,
"special": false,
"text": "_"
},
{
"id": 7670,
"logprob": 0.0,
"special": false,
"text": "hello"
},
{
"id": 100,
"logprob": 0.0,
"special": false,
"text": "_"
},
{
"id": 444,
"logprob": -0.21655273,
"special": false,
"text": "name"
},
{
"id": 45,
"logprob": 0.0,
"special": false,
"text": "("
},
{
"id": 444,
"logprob": 0.0,
"special": false,
"text": "name"
},
{
"id": 731,
"logprob": 0.0,
"special": false,
"text": "):"
},
{
"id": 303,
"logprob": 0.0,
"special": false,
"text": "\n "
},
{
"id": 1489,
"logprob": 0.0,
"special": false,
"text": " print"
},
{
"id": 459,
"logprob": 0.0,
"special": false,
"text": "(\""
},
{
"id": 8302,
"logprob": 0.0,
"special": false,
"text": "Hello"
},
{
"id": 332,
"logprob": -0.034698486,
"special": false,
"text": " \""
},
{
"id": 494,
"logprob": 0.0,
"special": false,
"text": " +"
},
{
"id": 655,
"logprob": 0.0,
"special": false,
"text": " name"
},
{
"id": 494,
"logprob": -0.20141602,
"special": false,
"text": " +"
},
{
"id": 332,
"logprob": 0.0,
"special": false,
"text": " \""
},
{
"id": 16013,
"logprob": 0.0,
"special": false,
"text": "!\")"
},
{
"id": 222,
"logprob": 0.0,
"special": false,
"text": "\n"
},
{
"id": 222,
"logprob": 0.0,
"special": false,
"text": "\n"
},
{
"id": 610,
"logprob": 0.0,
"special": false,
"text": "def"
},
{
"id": 1489,
"logprob": 0.0,
"special": false,
"text": " print"
},
{
"id": 100,
"logprob": 0.0,
"special": false,
"text": "_"
},
{
"id": 7670,
"logprob": 0.0,
"special": false,
"text": "hello"
},
{
"id": 100,
"logprob": 0.0,
"special": false,
"text": "_"
},
{
"id": 444,
"logprob": 0.0,
"special": false,
"text": "name"
},
{
"id": 100,
"logprob": 0.0,
"special": false,
"text": "_"
},
{
"id": 400,
"logprob": 0.0,
"special": false,
"text": "age"
},
{
"id": 45,
"logprob": 0.0,
"special": false,
"text": "("
},
{
"id": 444,
"logprob": 0.0,
"special": false,
"text": "name"
},
{
"id": 49,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 11505,
"logprob": 0.0,
"special": false,
"text": " age"
},
{
"id": 731,
"logprob": 0.0,
"special": false,
"text": "):"
},
{
"id": 303,
"logprob": 0.0,
"special": false,
"text": "\n "
},
{
"id": 1489,
"logprob": 0.0,
"special": false,
"text": " print"
},
{
"id": 459,
"logprob": 0.0,
"special": false,
"text": "(\""
},
{
"id": 8302,
"logprob": 0.0,
"special": false,
"text": "Hello"
},
{
"id": 332,
"logprob": 0.0,
"special": false,
"text": " \""
},
{
"id": 494,
"logprob": 0.0,
"special": false,
"text": " +"
},
{
"id": 655,
"logprob": 0.0,
"special": false,
"text": " name"
},
{
"id": 494,
"logprob": 0.0,
"special": false,
"text": " +"
},
{
"id": 3021,
"logprob": -0.5761719,
"special": false,
"text": " \","
},
{
"id": 863,
"logprob": 0.0,
"special": false,
"text": " you"
},
{
"id": 904,
"logprob": 0.0,
"special": false,
"text": " are"
},
{
"id": 332,
"logprob": 0.0,
"special": false,
"text": " \""
},
{
"id": 494,
"logprob": 0.0,
"special": false,
"text": " +"
},
{
"id": 615,
"logprob": 0.0,
"special": false,
"text": " str"
},
{
"id": 45,
"logprob": 0.0,
"special": false,
"text": "("
},
{
"id": 400,
"logprob": 0.0,
"special": false,
"text": "age"
},
{
"id": 46,
"logprob": 0.0,
"special": false,
"text": ")"
}
],
"top_tokens": null
},
"generated_text": "():\n print(\"Hello World!\")\n\ndef print_hello_name(name):\n print(\"Hello \" + name + \"!\")\n\ndef print_hello_name_age(name, age):\n print(\"Hello \" + name + \", you are \" + str(age)"
}


@ -0,0 +1,378 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 610,
"logprob": null,
"text": "def"
},
{
"id": 1489,
"logprob": -5.2617188,
"text": " print"
},
{
"id": 100,
"logprob": -0.38476562,
"text": "_"
},
{
"id": 7670,
"logprob": -7.640625,
"text": "hello"
}
],
"seed": null,
"tokens": [
{
"id": 2284,
"logprob": -0.92626953,
"special": false,
"text": "():"
},
{
"id": 303,
"logprob": -0.40722656,
"special": false,
"text": "\n "
},
{
"id": 1489,
"logprob": -0.27954102,
"special": false,
"text": " print"
},
{
"id": 459,
"logprob": -0.6142578,
"special": false,
"text": "(\""
},
{
"id": 8302,
"logprob": -0.68310547,
"special": false,
"text": "Hello"
},
{
"id": 10914,
"logprob": -1.4570312,
"special": false,
"text": " World"
},
{
"id": 16013,
"logprob": -0.80126953,
"special": false,
"text": "!\")"
},
{
"id": 222,
"logprob": -0.6303711,
"special": false,
"text": "\n"
},
{
"id": 222,
"logprob": -0.23327637,
"special": false,
"text": "\n"
},
{
"id": 610,
"logprob": -1.2304688,
"special": false,
"text": "def"
}
],
"top_tokens": null
},
"generated_text": "():\n print(\"Hello World!\")\n\ndef"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 610,
"logprob": null,
"text": "def"
},
{
"id": 1489,
"logprob": -5.2617188,
"text": " print"
},
{
"id": 100,
"logprob": -0.38476562,
"text": "_"
},
{
"id": 7670,
"logprob": -7.640625,
"text": "hello"
}
],
"seed": null,
"tokens": [
{
"id": 2284,
"logprob": -0.92626953,
"special": false,
"text": "():"
},
{
"id": 303,
"logprob": -0.40722656,
"special": false,
"text": "\n "
},
{
"id": 1489,
"logprob": -0.27954102,
"special": false,
"text": " print"
},
{
"id": 459,
"logprob": -0.6142578,
"special": false,
"text": "(\""
},
{
"id": 8302,
"logprob": -0.68310547,
"special": false,
"text": "Hello"
},
{
"id": 10914,
"logprob": -1.4570312,
"special": false,
"text": " World"
},
{
"id": 16013,
"logprob": -0.80126953,
"special": false,
"text": "!\")"
},
{
"id": 222,
"logprob": -0.6303711,
"special": false,
"text": "\n"
},
{
"id": 222,
"logprob": -0.23327637,
"special": false,
"text": "\n"
},
{
"id": 610,
"logprob": -1.2304688,
"special": false,
"text": "def"
}
],
"top_tokens": null
},
"generated_text": "():\n print(\"Hello World!\")\n\ndef"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 610,
"logprob": null,
"text": "def"
},
{
"id": 1489,
"logprob": -5.2617188,
"text": " print"
},
{
"id": 100,
"logprob": -0.38476562,
"text": "_"
},
{
"id": 7670,
"logprob": -7.640625,
"text": "hello"
}
],
"seed": null,
"tokens": [
{
"id": 2284,
"logprob": -0.92626953,
"special": false,
"text": "():"
},
{
"id": 303,
"logprob": -0.40722656,
"special": false,
"text": "\n "
},
{
"id": 1489,
"logprob": -0.27954102,
"special": false,
"text": " print"
},
{
"id": 459,
"logprob": -0.6142578,
"special": false,
"text": "(\""
},
{
"id": 8302,
"logprob": -0.68310547,
"special": false,
"text": "Hello"
},
{
"id": 10914,
"logprob": -1.4570312,
"special": false,
"text": " World"
},
{
"id": 16013,
"logprob": -0.80126953,
"special": false,
"text": "!\")"
},
{
"id": 222,
"logprob": -0.6303711,
"special": false,
"text": "\n"
},
{
"id": 222,
"logprob": -0.23327637,
"special": false,
"text": "\n"
},
{
"id": 610,
"logprob": -1.2304688,
"special": false,
"text": "def"
}
],
"top_tokens": null
},
"generated_text": "():\n print(\"Hello World!\")\n\ndef"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 610,
"logprob": null,
"text": "def"
},
{
"id": 1489,
"logprob": -5.2617188,
"text": " print"
},
{
"id": 100,
"logprob": -0.38476562,
"text": "_"
},
{
"id": 7670,
"logprob": -7.640625,
"text": "hello"
}
],
"seed": null,
"tokens": [
{
"id": 2284,
"logprob": -0.92626953,
"special": false,
"text": "():"
},
{
"id": 303,
"logprob": -0.40722656,
"special": false,
"text": "\n "
},
{
"id": 1489,
"logprob": -0.27954102,
"special": false,
"text": " print"
},
{
"id": 459,
"logprob": -0.6142578,
"special": false,
"text": "(\""
},
{
"id": 8302,
"logprob": -0.68310547,
"special": false,
"text": "Hello"
},
{
"id": 10914,
"logprob": -1.4570312,
"special": false,
"text": " World"
},
{
"id": 16013,
"logprob": -0.80126953,
"special": false,
"text": "!\")"
},
{
"id": 222,
"logprob": -0.6303711,
"special": false,
"text": "\n"
},
{
"id": 222,
"logprob": -0.23327637,
"special": false,
"text": "\n"
},
{
"id": 610,
"logprob": -1.2304688,
"special": false,
"text": "def"
}
],
"top_tokens": null
},
"generated_text": "():\n print(\"Hello World!\")\n\ndef"
}
]


@ -1,193 +1,194 @@
{
"generated_text": "\n return sum(L) / len(L)\n\n\ndef geometric_mean(L",
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 20,
"seed": null,
"prefill": [
{
"id": 589,
"text": "def",
"logprob": null
"logprob": null,
"text": "def"
},
{
"id": 3226,
"text": " ge",
"logprob": -9.0234375
"logprob": -8.5859375,
"text": " ge"
},
{
"id": 21017,
"text": "ometric",
"logprob": -9.0859375
"logprob": -7.5859375,
"text": "ometric"
},
{
"id": 81,
"text": "_",
"logprob": -0.25878906
"logprob": -0.2668457,
"text": "_"
},
{
"id": 6009,
"text": "mean",
"logprob": -2.2109375
"logprob": -1.6416016,
"text": "mean"
},
{
"id": 26,
"text": "(",
"logprob": -0.30371094
"logprob": -0.22705078,
"text": "("
},
{
"id": 62,
"text": "L",
"logprob": -5.6054688
"logprob": -5.2304688,
"text": "L"
},
{
"id": 44,
"text": ":",
"logprob": -3.0722656
"logprob": -3.0976562,
"text": ":"
},
{
"id": 1682,
"text": " List",
"logprob": -0.6879883
"logprob": -1.1044922,
"text": " List"
},
{
"id": 77,
"text": "[",
"logprob": -0.38500977
"logprob": -0.14294434,
"text": "["
},
{
"id": 1808,
"text": "float",
"logprob": -0.984375
"logprob": -0.32299805,
"text": "float"
},
{
"id": 10794,
"text": "]):",
"logprob": -2.5351562
"logprob": -2.8164062,
"text": "]):"
}
],
"seed": null,
"tokens": [
{
"id": 284,
"text": "\n ",
"logprob": -1.1738281,
"special": false
"logprob": -0.1282959,
"special": false,
"text": "\n "
},
{
"id": 442,
"text": " return",
"logprob": -0.95947266,
"special": false
"id": 1524,
"logprob": -0.97998047,
"special": false,
"text": " \"\"\""
},
{
"id": 3632,
"text": " sum",
"logprob": -1.4199219,
"special": false
"id": 284,
"logprob": -0.7006836,
"special": false,
"text": "\n "
},
{
"id": 26,
"text": "(",
"logprob": -0.085876465,
"special": false
"id": 14883,
"logprob": -2.1933594,
"special": false,
"text": " Calculate"
},
{
"id": 62,
"text": "L",
"logprob": -0.09875488,
"special": false
},
{
"id": 27,
"text": ")",
"logprob": -0.30517578,
"special": false
},
{
"id": 517,
"text": " /",
"logprob": -0.42089844,
"special": false
},
{
"id": 2069,
"text": " len",
"logprob": -0.042053223,
"special": false
},
{
"id": 26,
"text": "(",
"logprob": -0.0011806488,
"special": false
},
{
"id": 62,
"text": "L",
"logprob": -0.0005259514,
"special": false
},
{
"id": 27,
"text": ")",
"logprob": -0.0017633438,
"special": false
},
{
"id": 478,
"text": "\n\n",
"logprob": -0.69189453,
"special": false
},
{
"id": 203,
"text": "\n",
"logprob": -0.041870117,
"special": false
},
{
"id": 589,
"text": "def",
"logprob": -0.27856445,
"special": false
"id": 322,
"logprob": -0.2697754,
"special": false,
"text": " the"
},
{
"id": 3226,
"text": " ge",
"logprob": -1.7255859,
"special": false
"logprob": -0.0836792,
"special": false,
"text": " ge"
},
{
"id": 21017,
"text": "ometric",
"logprob": -0.011291504,
"special": false
"logprob": -0.018737793,
"special": false,
"text": "ometric"
},
{
"id": 81,
"text": "_",
"logprob": -0.008430481,
"special": false
"id": 5651,
"logprob": -0.028640747,
"special": false,
"text": " mean"
},
{
"id": 6009,
"text": "mean",
"logprob": -0.025787354,
"special": false
"id": 432,
"logprob": -0.29467773,
"special": false,
"text": " of"
},
{
"id": 26,
"text": "(",
"logprob": -0.073913574,
"special": false
"id": 312,
"logprob": -0.31518555,
"special": false,
"text": " a"
},
{
"id": 62,
"text": "L",
"logprob": -0.09967041,
"special": false
"id": 1149,
"logprob": -0.20605469,
"special": false,
"text": " list"
},
{
"id": 432,
"logprob": -0.23254395,
"special": false,
"text": " of"
},
{
"id": 7515,
"logprob": -0.4489746,
"special": false,
"text": " numbers"
},
{
"id": 32,
"logprob": -0.6044922,
"special": false,
"text": "."
},
{
"id": 446,
"logprob": -0.63964844,
"special": false,
"text": "\n\n "
},
{
"id": 499,
"logprob": -1.1953125,
"special": false,
"text": " :"
},
{
"id": 753,
"logprob": -0.03515625,
"special": false,
"text": "param"
},
{
"id": 498,
"logprob": -0.06311035,
"special": false,
"text": " L"
},
{
"id": 44,
"logprob": -0.003414154,
"special": false,
"text": ":"
},
{
"id": 1682,
"logprob": -1.3310547,
"special": false,
"text": " List"
}
]
}
],
"top_tokens": null
},
"generated_text": "\n \"\"\"\n Calculate the geometric mean of a list of numbers.\n\n :param L: List"
}

View File

@ -11,57 +11,57 @@
},
{
"id": 3226,
"logprob": -9.0234375,
"logprob": -8.5859375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.09375,
"logprob": -7.5898438,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25976562,
"logprob": -0.26586914,
"text": "_"
},
{
"id": 6009,
"logprob": -2.2148438,
"logprob": -1.6347656,
"text": "mean"
},
{
"id": 26,
"logprob": -0.3010254,
"logprob": -0.22705078,
"text": "("
},
{
"id": 62,
"logprob": -5.6757812,
"logprob": -5.2382812,
"text": "L"
},
{
"id": 44,
"logprob": -3.0898438,
"logprob": -3.0996094,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6791992,
"logprob": -1.1025391,
"text": " List"
},
{
"id": 77,
"logprob": -0.38891602,
"logprob": -0.14294434,
"text": "["
},
{
"id": 1808,
"logprob": -0.92041016,
"logprob": -0.32226562,
"text": "float"
},
{
"id": 10794,
"logprob": -2.5390625,
"logprob": -2.8164062,
"text": "]):"
}
],
@ -75,13 +75,13 @@
},
{
"id": 442,
"logprob": 0.0,
"logprob": -1.3134766,
"special": false,
"text": " return"
},
{
"id": 11665,
"logprob": -1.6005859,
"logprob": -0.10021973,
"special": false,
"text": " reduce"
},
@ -129,7 +129,7 @@
},
{
"id": 319,
"logprob": 0.0,
"logprob": -0.42871094,
"special": false,
"text": " *"
},
@ -158,36 +158,37 @@
"text": ")"
},
{
"id": 203,
"logprob": -0.11968994,
"special": false,
"text": "\n"
},
{
"id": 203,
"id": 1115,
"logprob": 0.0,
"special": false,
"text": "\n"
"text": " **"
},
{
"id": 589,
"id": 308,
"logprob": 0.0,
"special": false,
"text": "def"
"text": " ("
},
{
"id": 3226,
"id": 35,
"logprob": 0.0,
"special": false,
"text": " ge"
"text": "1"
},
{
"id": 21017,
"id": 32,
"logprob": -0.31323242,
"special": false,
"text": "."
},
{
"id": 34,
"logprob": 0.0,
"special": false,
"text": "ometric"
"text": "0"
}
]
],
"top_tokens": null
},
"generated_text": "\n return reduce(lambda x, y: x * y, L)\n\ndef geometric"
"generated_text": "\n return reduce(lambda x, y: x * y, L) ** (1.0"
}

View File

@ -12,57 +12,57 @@
},
{
"id": 3226,
"logprob": -9.0234375,
"logprob": -8.5859375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"logprob": -7.5820312,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"logprob": -0.26708984,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"logprob": -1.6386719,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"logprob": -0.22717285,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"logprob": -5.234375,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"logprob": -3.1015625,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"logprob": -1.1083984,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"logprob": -0.14294434,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"logprob": -0.32592773,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"logprob": -2.8164062,
"text": "]):"
}
],
@ -70,67 +70,68 @@
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"logprob": -0.12817383,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.91796875,
"id": 1524,
"logprob": -0.9863281,
"special": false,
"text": " return"
"text": " \"\"\""
},
{
"id": 3632,
"logprob": -1.3291016,
"id": 284,
"logprob": -0.7011719,
"special": false,
"text": " sum"
"text": "\n "
},
{
"id": 26,
"logprob": -0.08062744,
"id": 14883,
"logprob": -2.2050781,
"special": false,
"text": "("
"text": " Calculate"
},
{
"id": 62,
"logprob": -0.097717285,
"id": 322,
"logprob": -0.2668457,
"special": false,
"text": "L"
"text": " the"
},
{
"id": 27,
"logprob": -0.29003906,
"id": 3226,
"logprob": -0.08465576,
"special": false,
"text": ")"
"text": " ge"
},
{
"id": 517,
"logprob": -0.34958984,
"id": 21017,
"logprob": -0.019012451,
"special": false,
"text": " /"
"text": "ometric"
},
{
"id": 2069,
"logprob": -0.03829956,
"id": 5651,
"logprob": -0.028625488,
"special": false,
"text": " len"
"text": " mean"
},
{
"id": 26,
"logprob": -0.0011987686,
"id": 432,
"logprob": -0.29418945,
"special": false,
"text": "("
"text": " of"
},
{
"id": 62,
"logprob": -0.00050878525,
"id": 312,
"logprob": -0.3161621,
"special": false,
"text": "L"
"text": " a"
}
]
],
"top_tokens": null
},
"generated_text": "\n return sum(L) / len(L"
"generated_text": "\n \"\"\"\n Calculate the geometric mean of a"
},
{
"details": {
@ -145,57 +146,57 @@
},
{
"id": 3226,
"logprob": -9.0234375,
"logprob": -8.5859375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"logprob": -7.59375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25878906,
"logprob": -0.26953125,
"text": "_"
},
{
"id": 6009,
"logprob": -2.2109375,
"logprob": -1.640625,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30371094,
"logprob": -0.22705078,
"text": "("
},
{
"id": 62,
"logprob": -5.6054688,
"logprob": -5.234375,
"text": "L"
},
{
"id": 44,
"logprob": -3.0722656,
"logprob": -3.1132812,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6879883,
"logprob": -1.1123047,
"text": " List"
},
{
"id": 77,
"logprob": -0.38500977,
"logprob": -0.14294434,
"text": "["
},
{
"id": 1808,
"logprob": -0.984375,
"logprob": -0.32299805,
"text": "float"
},
{
"id": 10794,
"logprob": -2.5351562,
"logprob": -2.8164062,
"text": "]):"
}
],
@ -203,67 +204,68 @@
"tokens": [
{
"id": 284,
"logprob": -1.1738281,
"logprob": -0.12854004,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.9584961,
"id": 1524,
"logprob": -0.9897461,
"special": false,
"text": " return"
"text": " \"\"\""
},
{
"id": 3632,
"logprob": -1.4169922,
"id": 284,
"logprob": -0.69970703,
"special": false,
"text": " sum"
"text": "\n "
},
{
"id": 26,
"logprob": -0.085876465,
"id": 14883,
"logprob": -2.2050781,
"special": false,
"text": "("
"text": " Calculate"
},
{
"id": 62,
"logprob": -0.0982666,
"id": 322,
"logprob": -0.2668457,
"special": false,
"text": "L"
"text": " the"
},
{
"id": 27,
"logprob": -0.3022461,
"id": 3226,
"logprob": -0.08496094,
"special": false,
"text": ")"
"text": " ge"
},
{
"id": 517,
"logprob": -0.40504883,
"id": 21017,
"logprob": -0.019012451,
"special": false,
"text": " /"
"text": "ometric"
},
{
"id": 2069,
"logprob": -0.041656494,
"id": 5651,
"logprob": -0.029037476,
"special": false,
"text": " len"
"text": " mean"
},
{
"id": 26,
"logprob": -0.0011844635,
"id": 432,
"logprob": -0.2939453,
"special": false,
"text": "("
"text": " of"
},
{
"id": 62,
"logprob": -0.0005264282,
"id": 312,
"logprob": -0.31591797,
"special": false,
"text": "L"
"text": " a"
}
]
],
"top_tokens": null
},
"generated_text": "\n return sum(L) / len(L"
"generated_text": "\n \"\"\"\n Calculate the geometric mean of a"
},
{
"details": {
@ -278,57 +280,57 @@
},
{
"id": 3226,
"logprob": -9.0234375,
"logprob": -8.5859375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"logprob": -7.5859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"logprob": -0.26586914,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"logprob": -1.6347656,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"logprob": -0.22766113,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"logprob": -5.2265625,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"logprob": -3.0976562,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"logprob": -1.1025391,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"logprob": -0.1427002,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"logprob": -0.32592773,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"logprob": -2.8164062,
"text": "]):"
}
],
@ -336,67 +338,68 @@
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"logprob": -0.13012695,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.9165039,
"id": 1524,
"logprob": -0.98046875,
"special": false,
"text": " return"
"text": " \"\"\""
},
{
"id": 3632,
"logprob": -1.328125,
"id": 284,
"logprob": -0.69921875,
"special": false,
"text": " sum"
"text": "\n "
},
{
"id": 26,
"logprob": -0.07946777,
"id": 14883,
"logprob": -2.1992188,
"special": false,
"text": "("
"text": " Calculate"
},
{
"id": 62,
"logprob": -0.09820557,
"id": 322,
"logprob": -0.2668457,
"special": false,
"text": "L"
"text": " the"
},
{
"id": 27,
"logprob": -0.28930664,
"id": 3226,
"logprob": -0.083496094,
"special": false,
"text": ")"
"text": " ge"
},
{
"id": 517,
"logprob": -0.34592773,
"id": 21017,
"logprob": -0.01902771,
"special": false,
"text": " /"
"text": "ometric"
},
{
"id": 2069,
"logprob": -0.038330078,
"id": 5651,
"logprob": -0.029006958,
"special": false,
"text": " len"
"text": " mean"
},
{
"id": 26,
"logprob": -0.0011940002,
"id": 432,
"logprob": -0.29248047,
"special": false,
"text": "("
"text": " of"
},
{
"id": 62,
"logprob": -0.00050878525,
"id": 312,
"logprob": -0.3161621,
"special": false,
"text": "L"
"text": " a"
}
]
],
"top_tokens": null
},
"generated_text": "\n return sum(L) / len(L"
"generated_text": "\n \"\"\"\n Calculate the geometric mean of a"
},
{
"details": {
@ -411,57 +414,57 @@
},
{
"id": 3226,
"logprob": -9.0234375,
"logprob": -8.5859375,
"text": " ge"
},
{
"id": 21017,
"logprob": -9.0859375,
"logprob": -7.5859375,
"text": "ometric"
},
{
"id": 81,
"logprob": -0.25927734,
"logprob": -0.26904297,
"text": "_"
},
{
"id": 6009,
"logprob": -2.25,
"logprob": -1.6386719,
"text": "mean"
},
{
"id": 26,
"logprob": -0.30126953,
"logprob": -0.22705078,
"text": "("
},
{
"id": 62,
"logprob": -5.7539062,
"logprob": -5.234375,
"text": "L"
},
{
"id": 44,
"logprob": -3.0878906,
"logprob": -3.1132812,
"text": ":"
},
{
"id": 1682,
"logprob": -0.6845703,
"logprob": -1.1074219,
"text": " List"
},
{
"id": 77,
"logprob": -0.3918457,
"logprob": -0.14477539,
"text": "["
},
{
"id": 1808,
"logprob": -0.8798828,
"logprob": -0.3256836,
"text": "float"
},
{
"id": 10794,
"logprob": -2.4980469,
"logprob": -2.8027344,
"text": "]):"
}
],
@ -469,66 +472,67 @@
"tokens": [
{
"id": 284,
"logprob": -1.1533203,
"logprob": -0.12915039,
"special": false,
"text": "\n "
},
{
"id": 442,
"logprob": -0.91259766,
"id": 1524,
"logprob": -0.98535156,
"special": false,
"text": " return"
"text": " \"\"\""
},
{
"id": 3632,
"logprob": -1.3251953,
"id": 284,
"logprob": -0.69921875,
"special": false,
"text": " sum"
"text": "\n "
},
{
"id": 26,
"logprob": -0.08062744,
"id": 14883,
"logprob": -2.2011719,
"special": false,
"text": "("
"text": " Calculate"
},
{
"id": 62,
"logprob": -0.09906006,
"id": 322,
"logprob": -0.26708984,
"special": false,
"text": "L"
"text": " the"
},
{
"id": 27,
"logprob": -0.28979492,
"id": 3226,
"logprob": -0.08502197,
"special": false,
"text": ")"
"text": " ge"
},
{
"id": 517,
"logprob": -0.35958984,
"id": 21017,
"logprob": -0.019012451,
"special": false,
"text": " /"
"text": "ometric"
},
{
"id": 2069,
"logprob": -0.038604736,
"id": 5651,
"logprob": -0.028625488,
"special": false,
"text": " len"
"text": " mean"
},
{
"id": 26,
"logprob": -0.0011901855,
"id": 432,
"logprob": -0.29589844,
"special": false,
"text": "("
"text": " of"
},
{
"id": 62,
"logprob": -0.0005078316,
"id": 312,
"logprob": -0.31591797,
"special": false,
"text": "L"
"text": " a"
}
]
],
"top_tokens": null
},
"generated_text": "\n return sum(L) / len(L"
"generated_text": "\n \"\"\"\n Calculate the geometric mean of a"
}
]

View File

@ -0,0 +1,274 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 30,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 5235,
"logprob": -10.0625,
"text": "info"
},
{
"id": 29901,
"logprob": -3.2324219,
"text": ":"
},
{
"id": 13260,
"logprob": -10.625,
"text": "dav"
},
{
"id": 333,
"logprob": -0.08276367,
"text": "id"
},
{
"id": 8753,
"logprob": -7.5273438,
"text": "hol"
},
{
"id": 17559,
"logprob": -3.8476562,
"text": "tz"
},
{
"id": 763,
"logprob": -10.140625,
"text": "like"
},
{
"id": 10697,
"logprob": -10.1953125,
"text": "trees"
},
{
"id": 322,
"logprob": -2.5742188,
"text": "and"
},
{
"id": 756,
"logprob": -7.4882812,
"text": "has"
},
{
"id": 1023,
"logprob": -5.0507812,
"text": "two"
},
{
"id": 274,
"logprob": -5.3164062,
"text": "c"
},
{
"id": 1446,
"logprob": -0.6694336,
"text": "ats"
},
{
"id": 29889,
"logprob": -0.9995117,
"text": "."
},
{
"id": 29871,
"logprob": -4.2421875,
"text": ""
}
],
"seed": null,
"tokens": [
{
"id": 6377,
"logprob": -0.14916992,
"special": false,
"text": "{\""
},
{
"id": 29888,
"logprob": -0.13598633,
"special": false,
"text": "f"
},
{
"id": 12935,
"logprob": -0.017669678,
"special": false,
"text": "irs"
},
{
"id": 29873,
"logprob": -0.00085639954,
"special": false,
"text": "t"
},
{
"id": 1170,
"logprob": -0.0054016113,
"special": false,
"text": "Name"
},
{
"id": 4710,
"logprob": -0.13549805,
"special": false,
"text": "\":\""
},
{
"id": 19504,
"logprob": -0.8852539,
"special": false,
"text": "David"
},
{
"id": 3284,
"logprob": -0.16394043,
"special": false,
"text": "\",\""
},
{
"id": 29882,
"logprob": -0.08862305,
"special": false,
"text": "h"
},
{
"id": 711,
"logprob": -0.66259766,
"special": false,
"text": "ob"
},
{
"id": 1609,
"logprob": -5.51939e-05,
"special": false,
"text": "by"
},
{
"id": 4710,
"logprob": -0.23120117,
"special": false,
"text": "\":\""
},
{
"id": 29911,
"logprob": -2.3730469,
"special": false,
"text": "T"
},
{
"id": 11003,
"logprob": -0.032104492,
"special": false,
"text": "rees"
},
{
"id": 3284,
"logprob": -0.22021484,
"special": false,
"text": "\",\""
},
{
"id": 4230,
"logprob": -0.06726074,
"special": false,
"text": "last"
},
{
"id": 1170,
"logprob": -0.003501892,
"special": false,
"text": "Name"
},
{
"id": 4710,
"logprob": -0.0045661926,
"special": false,
"text": "\":\""
},
{
"id": 29950,
"logprob": -0.12512207,
"special": false,
"text": "H"
},
{
"id": 14339,
"logprob": -0.009552002,
"special": false,
"text": "olt"
},
{
"id": 29920,
"logprob": -0.00042438507,
"special": false,
"text": "z"
},
{
"id": 3284,
"logprob": -0.11651611,
"special": false,
"text": "\",\""
},
{
"id": 29876,
"logprob": -0.29736328,
"special": false,
"text": "n"
},
{
"id": 398,
"logprob": -0.003030777,
"special": false,
"text": "um"
},
{
"id": 29907,
"logprob": -0.3774414,
"special": false,
"text": "C"
},
{
"id": 1446,
"logprob": -0.0003130436,
"special": false,
"text": "ats"
},
{
"id": 1115,
"logprob": -0.0021514893,
"special": false,
"text": "\":"
},
{
"id": 29906,
"logprob": -0.071899414,
"special": false,
"text": "2"
},
{
"id": 29913,
"logprob": -0.018997192,
"special": false,
"text": "}"
},
{
"id": 2,
"logprob": 0.0,
"special": true,
"text": "</s>"
}
],
"top_tokens": null
},
"generated_text": "{\"firstName\":\"David\",\"hobby\":\"Trees\",\"lastName\":\"Holtz\",\"numCats\":2}"
}

View File

@ -11,92 +11,92 @@
},
{
"id": 4911,
"logprob": -5.7851562,
"logprob": -6.9765625,
"text": "User"
},
{
"id": 29901,
"logprob": -0.006996155,
"logprob": -0.0059432983,
"text": ":"
},
{
"id": 32000,
"logprob": -0.81347656,
"logprob": -0.8408203,
"text": "<fake_token_around_image>"
},
{
"id": 32001,
"logprob": -6.687641e-05,
"logprob": -9.906292e-05,
"text": "<image>"
},
{
"id": 32000,
"logprob": -3.5762787e-07,
"logprob": -2.3841858e-07,
"text": "<fake_token_around_image>"
},
{
"id": 1815,
"logprob": -4.2148438,
"logprob": -4.1679688,
"text": "Can"
},
{
"id": 366,
"logprob": -0.014137268,
"logprob": -0.014099121,
"text": "you"
},
{
"id": 2649,
"logprob": -4.4335938,
"logprob": -4.4609375,
"text": "tell"
},
{
"id": 592,
"logprob": -0.2919922,
"logprob": -0.29882812,
"text": "me"
},
{
"id": 263,
"logprob": -4.2070312,
"logprob": -4.1445312,
"text": "a"
},
{
"id": 1407,
"logprob": -9.421875,
"logprob": -9.3828125,
"text": "very"
},
{
"id": 3273,
"logprob": -1.8720703,
"logprob": -1.9736328,
"text": "short"
},
{
"id": 5828,
"logprob": -0.26489258,
"logprob": -0.2800293,
"text": "story"
},
{
"id": 2729,
"logprob": -3.7441406,
"logprob": -3.5625,
"text": "based"
},
{
"id": 373,
"logprob": -0.0005393028,
"logprob": -0.0006427765,
"text": "on"
},
{
"id": 278,
"logprob": -0.140625,
"logprob": -0.13952637,
"text": "the"
},
{
"id": 1967,
"logprob": -0.06756592,
"logprob": -0.068115234,
"text": "image"
},
{
"id": 29973,
"logprob": -0.15454102,
"logprob": -0.16357422,
"text": "?"
}
],
@ -104,25 +104,25 @@
"tokens": [
{
"id": 32002,
"logprob": -0.0019140244,
"logprob": -0.0026474,
"special": true,
"text": "<end_of_utterance>"
},
{
"id": 29871,
"logprob": -8.404255e-05,
"logprob": -8.547306e-05,
"special": false,
"text": " "
},
{
"id": 13,
"logprob": -1.7642975e-05,
"logprob": -1.7881393e-05,
"special": false,
"text": "\n"
},
{
"id": 7900,
"logprob": -2.9802322e-06,
"logprob": -3.0994415e-06,
"special": false,
"text": "Ass"
},
@ -140,30 +140,29 @@
},
{
"id": 319,
"logprob": -0.91064453,
"logprob": -0.92529297,
"special": false,
"text": " A"
},
{
"id": 696,
"logprob": -1.2412109,
"logprob": -1.1269531,
"special": false,
"text": " ro"
},
{
"id": 15664,
"logprob": -0.0002439022,
"logprob": -0.00029492378,
"special": false,
"text": "oster"
},
{
"id": 15028,
"logprob": -1.1630859,
"logprob": -1.1855469,
"special": false,
"text": " stands"
}
],
"top_tokens": null
]
},
"generated_text": " \nAssistant: A rooster stands"
}

View File

@ -12,92 +12,92 @@
},
{
"id": 4911,
"logprob": -5.7851562,
"logprob": -6.9804688,
"text": "User"
},
{
"id": 29901,
"logprob": -0.006996155,
"logprob": -0.006122589,
"text": ":"
},
{
"id": 32000,
"logprob": -0.81347656,
"logprob": -0.8417969,
"text": "<fake_token_around_image>"
},
{
"id": 32001,
"logprob": -6.687641e-05,
"logprob": -9.918213e-05,
"text": "<image>"
},
{
"id": 32000,
"logprob": -3.5762787e-07,
"logprob": -2.3841858e-07,
"text": "<fake_token_around_image>"
},
{
"id": 1815,
"logprob": -4.2148438,
"logprob": -4.1679688,
"text": "Can"
},
{
"id": 366,
"logprob": -0.014137268,
"logprob": -0.014091492,
"text": "you"
},
{
"id": 2649,
"logprob": -4.4335938,
"logprob": -4.4726562,
"text": "tell"
},
{
"id": 592,
"logprob": -0.2919922,
"logprob": -0.2998047,
"text": "me"
},
{
"id": 263,
"logprob": -4.2070312,
"logprob": -4.15625,
"text": "a"
},
{
"id": 1407,
"logprob": -9.421875,
"logprob": -9.3828125,
"text": "very"
},
{
"id": 3273,
"logprob": -1.8720703,
"logprob": -1.9716797,
"text": "short"
},
{
"id": 5828,
"logprob": -0.26489258,
"logprob": -0.27734375,
"text": "story"
},
{
"id": 2729,
"logprob": -3.7441406,
"logprob": -3.5605469,
"text": "based"
},
{
"id": 373,
"logprob": -0.0005393028,
"logprob": -0.00064468384,
"text": "on"
},
{
"id": 278,
"logprob": -0.140625,
"logprob": -0.14160156,
"text": "the"
},
{
"id": 1967,
"logprob": -0.06756592,
"logprob": -0.06915283,
"text": "image"
},
{
"id": 29973,
"logprob": -0.15454102,
"logprob": -0.16381836,
"text": "?"
}
],
@ -105,19 +105,19 @@
"tokens": [
{
"id": 32002,
"logprob": -0.0019140244,
"logprob": -0.0026664734,
"special": true,
"text": "<end_of_utterance>"
},
{
"id": 29871,
"logprob": -8.392334e-05,
"logprob": -8.583069e-05,
"special": false,
"text": " "
},
{
"id": 13,
"logprob": -1.7881393e-05,
"logprob": -1.8119812e-05,
"special": false,
"text": "\n"
},
@ -135,36 +135,35 @@
},
{
"id": 29901,
"logprob": -3.0994415e-06,
"logprob": -3.2186508e-06,
"special": false,
"text": ":"
},
{
"id": 319,
"logprob": -0.9057617,
"logprob": -0.9301758,
"special": false,
"text": " A"
},
{
"id": 696,
"logprob": -1.2294922,
"logprob": -1.1279297,
"special": false,
"text": " ro"
},
{
"id": 15664,
"logprob": -0.00024533272,
"logprob": -0.0002939701,
"special": false,
"text": "oster"
},
{
"id": 15028,
"logprob": -1.1640625,
"logprob": -1.1865234,
"special": false,
"text": " stands"
}
],
"top_tokens": null
]
},
"generated_text": " \nAssistant: A rooster stands"
},
@ -181,92 +180,92 @@
},
{
"id": 4911,
"logprob": -5.7773438,
"logprob": -6.9804688,
"text": "User"
},
{
"id": 29901,
"logprob": -0.0070114136,
"logprob": -0.006122589,
"text": ":"
},
{
"id": 32000,
"logprob": -0.8208008,
"logprob": -0.8417969,
"text": "<fake_token_around_image>"
},
{
"id": 32001,
"logprob": -6.699562e-05,
"logprob": -9.942055e-05,
"text": "<image>"
},
{
"id": 32000,
"logprob": -3.5762787e-07,
"logprob": -2.3841858e-07,
"text": "<fake_token_around_image>"
},
{
"id": 1815,
"logprob": -4.2265625,
"logprob": -4.1679688,
"text": "Can"
},
{
"id": 366,
"logprob": -0.014175415,
"logprob": -0.014091492,
"text": "you"
},
{
"id": 2649,
"logprob": -4.4296875,
"logprob": -4.4726562,
"text": "tell"
},
{
"id": 592,
"logprob": -0.29516602,
"logprob": -0.2998047,
"text": "me"
},
{
"id": 263,
"logprob": -4.2109375,
"logprob": -4.15625,
"text": "a"
},
{
"id": 1407,
"logprob": -9.4296875,
"logprob": -9.3828125,
"text": "very"
},
{
"id": 3273,
"logprob": -1.8720703,
"logprob": -1.9716797,
"text": "short"
},
{
"id": 5828,
"logprob": -0.26879883,
"logprob": -0.27734375,
"text": "story"
},
{
"id": 2729,
"logprob": -3.7675781,
"logprob": -3.5605469,
"text": "based"
},
{
"id": 373,
"logprob": -0.0005354881,
"logprob": -0.0006451607,
"text": "on"
},
{
"id": 278,
"logprob": -0.13671875,
"logprob": -0.14160156,
"text": "the"
},
{
"id": 1967,
"logprob": -0.06719971,
"logprob": -0.06915283,
"text": "image"
},
{
"id": 29973,
"logprob": -0.15551758,
"logprob": -0.16381836,
"text": "?"
}
],
@ -274,19 +273,19 @@
"tokens": [
{
"id": 32002,
"logprob": -0.0019130707,
"logprob": -0.0026664734,
"special": true,
"text": "<end_of_utterance>"
},
{
"id": 29871,
"logprob": -8.392334e-05,
"logprob": -8.571148e-05,
"special": false,
"text": " "
},
{
"id": 13,
"logprob": -1.7881393e-05,
"logprob": -1.8119812e-05,
"special": false,
"text": "\n"
},
@ -310,30 +309,29 @@
},
{
"id": 319,
"logprob": -0.9013672,
"logprob": -0.9301758,
"special": false,
"text": " A"
},
{
"id": 696,
"logprob": -1.2324219,
"logprob": -1.1279297,
"special": false,
"text": " ro"
},
{
"id": 15664,
"logprob": -0.0002477169,
"logprob": -0.0002939701,
"special": false,
"text": "oster"
},
{
"id": 15028,
"logprob": -1.1660156,
"logprob": -1.1865234,
"special": false,
"text": " stands"
}
],
"top_tokens": null
]
},
"generated_text": " \nAssistant: A rooster stands"
},
@ -350,92 +348,92 @@
},
{
"id": 4911,
"logprob": -5.7773438,
"logprob": -6.9804688,
"text": "User"
},
{
"id": 29901,
"logprob": -0.0070114136,
"logprob": -0.006122589,
"text": ":"
},
{
"id": 32000,
"logprob": -0.8208008,
"logprob": -0.8417969,
"text": "<fake_token_around_image>"
},
{
"id": 32001,
"logprob": -6.699562e-05,
"logprob": -9.918213e-05,
"text": "<image>"
},
{
"id": 32000,
"logprob": -3.5762787e-07,
"logprob": -2.3841858e-07,
"text": "<fake_token_around_image>"
},
{
"id": 1815,
"logprob": -4.2265625,
"logprob": -4.1679688,
"text": "Can"
},
{
"id": 366,
"logprob": -0.014175415,
"logprob": -0.014091492,
"text": "you"
},
{
"id": 2649,
"logprob": -4.4296875,
"logprob": -4.4726562,
"text": "tell"
},
{
"id": 592,
"logprob": -0.29516602,
"logprob": -0.2998047,
"text": "me"
},
{
"id": 263,
"logprob": -4.2109375,
"logprob": -4.15625,
"text": "a"
},
{
"id": 1407,
"logprob": -9.4296875,
"logprob": -9.3828125,
"text": "very"
},
{
"id": 3273,
"logprob": -1.8720703,
"logprob": -1.9716797,
"text": "short"
},
{
"id": 5828,
"logprob": -0.26879883,
"logprob": -0.27734375,
"text": "story"
},
{
"id": 2729,
"logprob": -3.7675781,
"logprob": -3.5605469,
"text": "based"
},
{
"id": 373,
"logprob": -0.0005354881,
"logprob": -0.00064468384,
"text": "on"
},
{
"id": 278,
"logprob": -0.13671875,
"logprob": -0.14160156,
"text": "the"
},
{
"id": 1967,
"logprob": -0.06719971,
"logprob": -0.06915283,
"text": "image"
},
{
"id": 29973,
"logprob": -0.15551758,
"logprob": -0.16381836,
"text": "?"
}
],
@ -443,19 +441,19 @@
"tokens": [
{
"id": 32002,
"logprob": -0.001912117,
"logprob": -0.0026664734,
"special": true,
"text": "<end_of_utterance>"
},
{
"id": 29871,
"logprob": -8.392334e-05,
"logprob": -8.59499e-05,
"special": false,
"text": " "
},
{
"id": 13,
"logprob": -1.7762184e-05,
"logprob": -1.8119812e-05,
"special": false,
"text": "\n"
},
@ -479,30 +477,29 @@
},
{
"id": 319,
"logprob": -0.9013672,
"logprob": -0.9301758,
"special": false,
"text": " A"
},
{
"id": 696,
"logprob": -1.2324219,
"logprob": -1.1279297,
"special": false,
"text": " ro"
},
{
"id": 15664,
"logprob": -0.0002477169,
"logprob": -0.0002939701,
"special": false,
"text": "oster"
},
{
"id": 15028,
"logprob": -1.1660156,
"logprob": -1.1865234,
"special": false,
"text": " stands"
}
],
"top_tokens": null
]
},
"generated_text": " \nAssistant: A rooster stands"
},
@ -519,92 +516,92 @@
},
{
"id": 4911,
"logprob": -5.7773438,
"logprob": -6.9804688,
"text": "User"
},
{
"id": 29901,
"logprob": -0.0070114136,
"logprob": -0.006122589,
"text": ":"
},
{
"id": 32000,
"logprob": -0.8208008,
"logprob": -0.8417969,
"text": "<fake_token_around_image>"
},
{
"id": 32001,
"logprob": -6.699562e-05,
"logprob": -9.942055e-05,
"text": "<image>"
},
{
"id": 32000,
"logprob": -3.5762787e-07,
"logprob": -2.3841858e-07,
"text": "<fake_token_around_image>"
},
{
"id": 1815,
"logprob": -4.2265625,
"logprob": -4.1679688,
"text": "Can"
},
{
"id": 366,
"logprob": -0.014175415,
"logprob": -0.014091492,
"text": "you"
},
{
"id": 2649,
"logprob": -4.4296875,
"logprob": -4.4726562,
"text": "tell"
},
{
"id": 592,
"logprob": -0.29516602,
"logprob": -0.2998047,
"text": "me"
},
{
"id": 263,
"logprob": -4.2109375,
"logprob": -4.15625,
"text": "a"
},
{
"id": 1407,
"logprob": -9.4296875,
"logprob": -9.3828125,
"text": "very"
},
{
"id": 3273,
"logprob": -1.8720703,
"logprob": -1.9716797,
"text": "short"
},
{
"id": 5828,
"logprob": -0.26879883,
"logprob": -0.27734375,
"text": "story"
},
{
"id": 2729,
"logprob": -3.7675781,
"logprob": -3.5605469,
"text": "based"
},
{
"id": 373,
"logprob": -0.0005354881,
"logprob": -0.0006451607,
"text": "on"
},
{
"id": 278,
"logprob": -0.13671875,
"logprob": -0.14160156,
"text": "the"
},
{
"id": 1967,
"logprob": -0.06719971,
"logprob": -0.06915283,
"text": "image"
},
{
"id": 29973,
"logprob": -0.15551758,
"logprob": -0.16381836,
"text": "?"
}
],
@ -612,19 +609,19 @@
"tokens": [
{
"id": 32002,
"logprob": -0.001912117,
"logprob": -0.0026664734,
"special": true,
"text": "<end_of_utterance>"
},
{
"id": 29871,
"logprob": -8.392334e-05,
"logprob": -8.571148e-05,
"special": false,
"text": " "
},
{
"id": 13,
"logprob": -1.7762184e-05,
"logprob": -1.8119812e-05,
"special": false,
"text": "\n"
},
@ -648,30 +645,29 @@
},
{
"id": 319,
"logprob": -0.9013672,
"logprob": -0.9301758,
"special": false,
"text": " A"
},
{
"id": 696,
"logprob": -1.2324219,
"logprob": -1.1279297,
"special": false,
"text": " ro"
},
{
"id": 15664,
"logprob": -0.0002477169,
"logprob": -0.0002939701,
"special": false,
"text": "oster"
},
{
"id": 15028,
"logprob": -1.1660156,
"logprob": -1.1865234,
"special": false,
"text": " stands"
}
],
"top_tokens": null
]
},
"generated_text": " \nAssistant: A rooster stands"
}

View File

@ -0,0 +1,65 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "stop_sequence",
"generated_tokens": 6,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 3735,
"logprob": -10.5,
"text": "Test"
},
{
"id": 2159,
"logprob": -12.140625,
"text": "request"
}
],
"seed": 0,
"tokens": [
{
"id": 13,
"logprob": -1.0654297,
"special": false,
"text": "\n"
},
{
"id": 1014,
"logprob": -2.7460938,
"special": false,
"text": "The"
},
{
"id": 6032,
"logprob": -1.359375,
"special": false,
"text": " purpose"
},
{
"id": 302,
"logprob": 0.0,
"special": false,
"text": " of"
},
{
"id": 456,
"logprob": 0.0,
"special": false,
"text": " this"
},
{
"id": 1369,
"logprob": -0.40063477,
"special": false,
"text": " test"
}
],
"top_tokens": null
},
"generated_text": "Test request\nThe purpose of this test"
}

View File

@ -0,0 +1,73 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [],
"seed": null,
"tokens": [
{
"id": 13,
"logprob": -0.00756073,
"special": false,
"text": "\n"
},
{
"id": 13,
"logprob": -0.20117188,
"special": false,
"text": "\n"
},
{
"id": 16114,
"logprob": -1.2597656,
"special": false,
"text": "Once"
},
{
"id": 3714,
"logprob": -0.20825195,
"special": false,
"text": " upon"
},
{
"id": 264,
"logprob": -0.00178051,
"special": false,
"text": " a"
},
{
"id": 727,
"logprob": -0.011955261,
"special": false,
"text": " time"
},
{
"id": 28725,
"logprob": -0.17541504,
"special": false,
"text": ","
},
{
"id": 736,
"logprob": -0.91308594,
"special": false,
"text": " there"
},
{
"id": 403,
"logprob": -0.058410645,
"special": false,
"text": " was"
},
{
"id": 264,
"logprob": -0.009689331,
"special": false,
"text": " a"
}
],
"top_tokens": null
},
"generated_text": "\n\nOnce upon a time, there was a"
}

View File

@ -0,0 +1,73 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [],
"seed": null,
"tokens": [
{
"id": 187,
"logprob": -0.37890625,
"special": false,
"text": "\n"
},
{
"id": 187,
"logprob": -0.26953125,
"special": false,
"text": "\n"
},
{
"id": 30763,
"logprob": -1.1953125,
"special": false,
"text": "Deep"
},
{
"id": 4715,
"logprob": -0.53515625,
"special": false,
"text": " learning"
},
{
"id": 310,
"logprob": -0.625,
"special": false,
"text": " is"
},
{
"id": 247,
"logprob": -0.6796875,
"special": false,
"text": " a"
},
{
"id": 747,
"logprob": -2.0,
"special": false,
"text": " new"
},
{
"id": 1511,
"logprob": -2.3125,
"special": false,
"text": " type"
},
{
"id": 273,
"logprob": -0.0028533936,
"special": false,
"text": " of"
},
{
"id": 5145,
"logprob": -1.265625,
"special": false,
"text": " machine"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a new type of machine"
}

View File

@ -0,0 +1,99 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2502,
"logprob": null,
"text": " red"
},
{
"id": 13,
"logprob": -2.734375,
"text": ","
},
{
"id": 8862,
"logprob": -3.6875,
"text": " yellow"
},
{
"id": 13,
"logprob": -0.40234375,
"text": ","
},
{
"id": 209,
"logprob": -8.25,
"text": " "
}
],
"seed": 0,
"tokens": [
{
"id": 187,
"logprob": 0.0,
"special": false,
"text": "\n"
},
{
"id": 395,
"logprob": -0.3125,
"special": false,
"text": "and"
},
{
"id": 4797,
"logprob": 0.0,
"special": false,
"text": " blue"
},
{
"id": 9830,
"logprob": -1.65625,
"special": false,
"text": " colors"
},
{
"id": 15,
"logprob": 0.0,
"special": false,
"text": "."
},
{
"id": 329,
"logprob": -2.4375,
"special": false,
"text": " A"
},
{
"id": 1180,
"logprob": -1.953125,
"special": false,
"text": " number"
},
{
"id": 273,
"logprob": 0.0,
"special": false,
"text": " of"
},
{
"id": 1027,
"logprob": -1.5546875,
"special": false,
"text": " different"
},
{
"id": 3295,
"logprob": -0.97265625,
"special": false,
"text": " color"
}
],
"top_tokens": null
},
"generated_text": "blue, red, yellow, \nand blue colors. A number of different color"
}

View File

@ -0,0 +1,398 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1276,
"logprob": null,
"text": "What"
},
{
"id": 310,
"logprob": -0.83984375,
"text": " is"
},
{
"id": 18147,
"logprob": -12.8125,
"text": " Deep"
},
{
"id": 20727,
"logprob": -2.84375,
"text": " Learning"
},
{
"id": 32,
"logprob": -1.25,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 187,
"logprob": -0.37890625,
"special": false,
"text": "\n"
},
{
"id": 187,
"logprob": -0.4296875,
"special": false,
"text": "\n"
},
{
"id": 30763,
"logprob": -1.078125,
"special": false,
"text": "Deep"
},
{
"id": 4715,
"logprob": -0.515625,
"special": false,
"text": " learning"
},
{
"id": 310,
"logprob": -0.6015625,
"special": false,
"text": " is"
},
{
"id": 247,
"logprob": -0.65625,
"special": false,
"text": " a"
},
{
"id": 747,
"logprob": -2.109375,
"special": false,
"text": " new"
},
{
"id": 1511,
"logprob": -2.328125,
"special": false,
"text": " type"
},
{
"id": 273,
"logprob": -0.0032653809,
"special": false,
"text": " of"
},
{
"id": 5145,
"logprob": -1.28125,
"special": false,
"text": " machine"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a new type of machine"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1276,
"logprob": null,
"text": "What"
},
{
"id": 310,
"logprob": -0.80078125,
"text": " is"
},
{
"id": 18147,
"logprob": -13.25,
"text": " Deep"
},
{
"id": 20727,
"logprob": -2.828125,
"text": " Learning"
},
{
"id": 32,
"logprob": -1.1953125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 187,
"logprob": -0.296875,
"special": false,
"text": "\n"
},
{
"id": 187,
"logprob": -0.3359375,
"special": false,
"text": "\n"
},
{
"id": 30763,
"logprob": -1.2578125,
"special": false,
"text": "Deep"
},
{
"id": 4715,
"logprob": -0.5546875,
"special": false,
"text": " learning"
},
{
"id": 310,
"logprob": -0.62890625,
"special": false,
"text": " is"
},
{
"id": 247,
"logprob": -0.64453125,
"special": false,
"text": " a"
},
{
"id": 747,
"logprob": -2.078125,
"special": false,
"text": " new"
},
{
"id": 1511,
"logprob": -2.28125,
"special": false,
"text": " type"
},
{
"id": 273,
"logprob": -0.0030670166,
"special": false,
"text": " of"
},
{
"id": 5145,
"logprob": -1.3125,
"special": false,
"text": " machine"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a new type of machine"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1276,
"logprob": null,
"text": "What"
},
{
"id": 310,
"logprob": -0.80078125,
"text": " is"
},
{
"id": 18147,
"logprob": -13.25,
"text": " Deep"
},
{
"id": 20727,
"logprob": -2.828125,
"text": " Learning"
},
{
"id": 32,
"logprob": -1.1953125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 187,
"logprob": -0.296875,
"special": false,
"text": "\n"
},
{
"id": 187,
"logprob": -0.3359375,
"special": false,
"text": "\n"
},
{
"id": 30763,
"logprob": -1.2578125,
"special": false,
"text": "Deep"
},
{
"id": 4715,
"logprob": -0.5546875,
"special": false,
"text": " learning"
},
{
"id": 310,
"logprob": -0.62890625,
"special": false,
"text": " is"
},
{
"id": 247,
"logprob": -0.64453125,
"special": false,
"text": " a"
},
{
"id": 747,
"logprob": -2.078125,
"special": false,
"text": " new"
},
{
"id": 1511,
"logprob": -2.28125,
"special": false,
"text": " type"
},
{
"id": 273,
"logprob": -0.0030670166,
"special": false,
"text": " of"
},
{
"id": 5145,
"logprob": -1.3125,
"special": false,
"text": " machine"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a new type of machine"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1276,
"logprob": null,
"text": "What"
},
{
"id": 310,
"logprob": -0.80078125,
"text": " is"
},
{
"id": 18147,
"logprob": -13.25,
"text": " Deep"
},
{
"id": 20727,
"logprob": -2.828125,
"text": " Learning"
},
{
"id": 32,
"logprob": -1.1953125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 187,
"logprob": -0.296875,
"special": false,
"text": "\n"
},
{
"id": 187,
"logprob": -0.3359375,
"special": false,
"text": "\n"
},
{
"id": 30763,
"logprob": -1.2578125,
"special": false,
"text": "Deep"
},
{
"id": 4715,
"logprob": -0.5546875,
"special": false,
"text": " learning"
},
{
"id": 310,
"logprob": -0.62890625,
"special": false,
"text": " is"
},
{
"id": 247,
"logprob": -0.64453125,
"special": false,
"text": " a"
},
{
"id": 747,
"logprob": -2.078125,
"special": false,
"text": " new"
},
{
"id": 1511,
"logprob": -2.28125,
"special": false,
"text": " type"
},
{
"id": 273,
"logprob": -0.0030670166,
"special": false,
"text": " of"
},
{
"id": 5145,
"logprob": -1.3125,
"special": false,
"text": " machine"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a new type of machine"
}
]

View File

@ -1,8 +1,8 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "eos_token",
"generated_tokens": 9,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 0,
@ -14,7 +14,7 @@
"tokens": [
{
"id": 16017,
"logprob": -0.30908203,
"logprob": 0.0,
"special": false,
"text": " blue"
},
@ -26,39 +26,45 @@
},
{
"id": 259,
"logprob": -0.28271484,
"logprob": -0.4716797,
"special": false,
"text": " "
},
{
"id": 15484,
"logprob": -1.7929688,
"id": 261,
"logprob": -0.044677734,
"special": false,
"text": "appear"
"text": ","
},
{
"id": 345,
"logprob": -0.8935547,
"id": 35622,
"logprob": -0.79589844,
"special": false,
"text": "ed"
"text": " cloud"
},
{
"id": 281,
"id": 263,
"logprob": -1.2958984,
"special": false,
"text": "s"
},
{
"id": 305,
"logprob": 0.0,
"special": false,
"text": " in"
"text": " and"
},
{
"id": 287,
"id": 35622,
"logprob": -1.1630859,
"special": false,
"text": " cloud"
},
{
"id": 263,
"logprob": 0.0,
"special": false,
"text": " the"
},
{
"id": 20495,
"logprob": -0.32299805,
"special": false,
"text": " sky"
"text": "s"
},
{
"id": 1,
@ -66,7 +72,8 @@
"special": true,
"text": "</s>"
}
]
],
"top_tokens": null
},
"generated_text": "Why is the sky blue?blue sky appeared in the sky"
"generated_text": "Why is the sky blue?blue sky, clouds and clouds"
}

View File

@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "As of today, there is a Update available for the Brooklyn, New York, area. According to the latest forecast, it's warm with high temperatures throughout the day. It's forecasted at 75°F for today and 77°F for tomorrow. However, in autumn, the weather typically changes drastically, becoming cooler and wetter. You can find the current weather forecast for the area through your local weather service. Additionally",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1710795556,
"id": "",
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"object": "text_completion",
"system_fingerprint": "2.0.0-native",
"usage": {
"completion_tokens": 100,
"prompt_tokens": 60,
"total_tokens": 160
}
}

View File

@ -0,0 +1,40 @@
{
"choices": [
{
"finish_reason": "eos_token",
"index": 0,
"logprobs": null,
"message": {
"content": null,
"name": null,
"role": "assistant",
"tool_calls": [
{
"function": {
"description": null,
"name": "tools",
"parameters": {
"format": "celsius",
"location": "New York, NY",
"num_days": 14
}
},
"id": 0,
"type": "function"
}
]
},
"usage": null
}
],
"created": 1710795556,
"id": "",
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"object": "text_completion",
"system_fingerprint": "2.0.0-native",
"usage": {
"completion_tokens": 29,
"prompt_tokens": 316,
"total_tokens": 345
}
}

View File

@ -0,0 +1,40 @@
{
"choices": [
{
"finish_reason": "eos_token",
"index": 0,
"logprobs": null,
"message": {
"content": null,
"name": null,
"role": "assistant",
"tool_calls": [
{
"function": {
"description": null,
"name": "tools",
"parameters": {
"format": "celsius",
"location": "New York, NY",
"num_days": 14
}
},
"id": 0,
"type": "function"
}
]
},
"usage": null
}
],
"created": 1710795557,
"id": "",
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"object": "text_completion",
"system_fingerprint": "2.0.0-native",
"usage": {
"completion_tokens": 29,
"prompt_tokens": 316,
"total_tokens": 345
}
}

View File

@ -0,0 +1,39 @@
{
"choices": [
{
"finish_reason": "eos_token",
"index": 0,
"logprobs": null,
"message": {
"content": null,
"name": null,
"role": "assistant",
"tool_calls": [
{
"function": {
"description": null,
"name": "tools",
"parameters": {
"format": "celsius",
"location": "New York, NY"
}
},
"id": 0,
"type": "function"
}
]
},
"usage": null
}
],
"created": 1710795557,
"id": "",
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"object": "text_completion",
"system_fingerprint": "2.0.0-native",
"usage": {
"completion_tokens": 21,
"prompt_tokens": 187,
"total_tokens": 208
}
}

View File

@ -0,0 +1,27 @@
{
"choices": [
{
"delta": {
"content": null,
"role": "assistant",
"tool_calls": {
"function": {
"arguments": "</s>",
"name": null
},
"id": "",
"index": 0,
"type": "function"
}
},
"finish_reason": "eos_token",
"index": 0,
"logprobs": null
}
],
"created": 1710795499,
"id": "",
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"object": "text_completion",
"system_fingerprint": "2.0.0-native"
}

View File

@ -18,7 +18,6 @@ async def flash_llama_awq(flash_llama_awq_handle):
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
response = await flash_llama_awq.generate(
"What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
@ -33,7 +32,6 @@ async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
response = await flash_llama_awq.generate(
"What is Deep Learning?",
@ -55,7 +53,6 @@ async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_awq_load(flash_llama_awq, generate_load, response_snapshot):
responses = await generate_load(
flash_llama_awq, "What is Deep Learning?", max_new_tokens=10, n=4

View File

@ -18,7 +18,6 @@ async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded):
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapshot):
response = await flash_llama_awq_sharded.generate(
"What is Deep Learning?", max_new_tokens=10, decoder_input_details=True
@ -33,7 +32,6 @@ async def test_flash_llama_awq_sharded(flash_llama_awq_sharded, response_snapsho
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_llama_awq_load_sharded(
flash_llama_awq_sharded, generate_load, response_snapshot
):

View File

@ -0,0 +1,61 @@
import pytest


@pytest.fixture(scope="module")
def flash_gemma_handle(launcher):
    with launcher("gg-hf/gemma-2b", num_shard=1) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_gemma(flash_gemma_handle):
    await flash_gemma_handle.health(300)
    return flash_gemma_handle.client


@pytest.mark.skip
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma(flash_gemma, response_snapshot):
    response = await flash_gemma.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.skip
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma_all_params(flash_gemma, response_snapshot):
    response = await flash_gemma.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.skip
@pytest.mark.asyncio
@pytest.mark.private
async def test_flash_gemma_load(flash_gemma, generate_load, response_snapshot):
    responses = await generate_load(flash_gemma, "Test request", max_new_tokens=10, n=4)

    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot
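
The test_flash_gemma_load case above depends on a generate_load helper fixture whose implementation is not part of this diff. Below is a minimal sketch of what such a helper has to do, assuming only that the client exposes the async generate coroutine used in the tests above; the name generate_load_sketch and its signature are illustrative, not the repository's actual conftest code.

import asyncio
from typing import Any, List


async def generate_load_sketch(
    client: Any, prompt: str, max_new_tokens: int, n: int
) -> List[Any]:
    # Fire n identical generation requests concurrently and return the
    # responses in request order; each response is whatever the client's
    # async generate coroutine yields (assumed interface).
    requests = [
        client.generate(prompt, max_new_tokens=max_new_tokens) for _ in range(n)
    ]
    return await asyncio.gather(*requests)

The final assertions in test_flash_gemma_load then only need to check that all n responses decode to the same text and match the stored snapshot.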

Some files were not shown because too many files have changed in this diff.