diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8304c8d1..220b2fa3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -8,6 +8,15 @@ on: tags: - 'v*' pull_request: + paths: + - ".github/workflows/build.yaml" + - "server/**" + - "proto/**" + - "router/**" + - "launcher/**" + - "Cargo.lock" + - "rust-toolchain.toml" + - "Dockerfile" branches: - 'main' @@ -15,6 +24,10 @@ jobs: build-and-push-image: runs-on: ubuntu-latest steps: + - name: Initialize Docker Buildx + uses: docker/setup-buildx-action@v2.0.0 + with: + install: true - name: Tailscale uses: tailscale/github-action@v1 with: @@ -65,5 +78,5 @@ jobs: platforms: 'linux/amd64' tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - cache-from: type=registry,ref=ghcr.io/huggingface/text-generation-inference:latest - cache-to: type=inline \ No newline at end of file + cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max + cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max \ No newline at end of file diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 8b659fe2..1a45ad04 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -3,14 +3,23 @@ name: Server Tests on: pull_request: paths: + - ".github/workflows/tests.yaml" - "server/**" - "proto/**" - "router/**" - "launcher/**" + - "Cargo.lock" + - "rust-toolchain.toml" jobs: run_tests: runs-on: ubuntu-20.04 + + env: + SCCACHE_GHA_ENABLED: "on" + RUSTC_WRAPPER: /usr/local/bin/sccache + SCCACHE: 0.3.3 + steps: - uses: actions/checkout@v2 - name: Set up Python @@ -25,19 +34,38 @@ jobs: components: rustfmt, clippy - name: Install Protoc uses: arduino/setup-protoc@v1 - - name: Loading cache. 
- uses: actions/cache@v2 - id: model_cache + - name: Install sccache + run: | + curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache + chmod +x /usr/local/bin/sccache + - name: configure sccache + uses: actions/github-script@v6 with: - path: ~/.cache/huggingface/ - key: models + script: | + core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); + core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); + core.exportVariable('SCCACHE_GHA_CACHE_TO', 'sccache-${{runner.os}}-${{github.ref_name}}'); + core.exportVariable('SCCACHE_GHA_CACHE_FROM', 'sccache-${{runner.os}}-main,sccache-${{runner.os}}-'); + - name: cargo registry cache + uses: actions/cache@v3 + with: + key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-${{ github.sha }} + restore-keys: | + cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}- + cargo-${{ runner.os }}- + path: | + ~/.cargo/registry + ~/.cargo/git - name: Install run: | make install - name: Run server tests run: | pip install pytest - pytest -sv server/tests + HF_HUB_ENABLE_HF_TRANSFER=1 pytest -sv server/tests - name: Run Rust tests run: | cargo test + - name: sccache stats + run: | + /usr/local/bin/sccache --show-stats diff --git a/Cargo.lock b/Cargo.lock index e8a28bf9..9db5f213 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,17 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + [[package]] name = "aho-corasick" version = "0.7.20" @@ -34,19 +45,20 @@ checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" [[package]] name = "async-stream" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e" +checksum = "ad445822218ce64be7a341abfb0b1ea43b5c23aa83902542a4542e78309d8e5e" dependencies = [ "async-stream-impl", "futures-core", + "pin-project-lite", ] [[package]] name = "async-stream-impl" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27" +checksum = "e4655ae1a7b0cdf149156f780c5bf3f1352bc53cbd9e0a361a7ef7b22947e965" dependencies = [ "proc-macro2", "quote", @@ -83,9 +95,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "axum" -version = "0.6.4" +version = "0.6.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5694b64066a2459918d8074c2ce0d5a88f409431994c2356617c8ae0c4721fc" +checksum = "6137c6234afb339e75e764c866e3594900f0211e1315d33779f269bbe2ec6967" dependencies = [ "async-trait", "axum-core", @@ -109,7 +121,7 @@ dependencies = [ "sync_wrapper", "tokio", "tower", - "tower-http", + "tower-http 0.4.0", "tower-layer", "tower-service", ] @@ -142,7 +154,7 @@ dependencies = [ "http", "opentelemetry", "tower", - "tower-http", + "tower-http 0.3.5", "tracing", "tracing-opentelemetry", ] @@ -265,9 +277,9 @@ 
dependencies = [ [[package]] name = "clap" -version = "4.1.4" +version = "4.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76" +checksum = "c3d7ae14b20b94cb02149ed21a86c423859cbe18dc7ed69845cace50e52b40a5" dependencies = [ "bitflags", "clap_derive", @@ -280,9 +292,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.1.0" +version = "4.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" +checksum = "44bec8e5c9d09e439c4335b1af0abaab56dcf3b94999a936e1bb47b9134288f0" dependencies = [ "heck", "proc-macro-error", @@ -293,9 +305,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" +checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09" dependencies = [ "os_str_bytes", ] @@ -349,9 +361,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +checksum = "cf2b3e8478797446514c91ef04bafcb59faba183e621ad488df88983cc14128c" dependencies = [ "cfg-if", "crossbeam-utils", @@ -359,9 +371,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if", "crossbeam-epoch", @@ -370,9 +382,9 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.13" +version = "0.9.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" +checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" dependencies = [ "autocfg", "cfg-if", @@ -383,9 +395,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.14" +version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" +checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" dependencies = [ "cfg-if", ] @@ -575,9 +587,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" dependencies = [ "instant", ] @@ -774,7 +786,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grpc-metadata" -version = "0.1.0" +version = "0.4.0" dependencies = [ "opentelemetry", "tonic", @@ -784,9 +796,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.15" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" +checksum = "5be7b54589b581f624f566bf5d8eb2bab1db736c51528720b6bd36b96b55924d" dependencies = [ "bytes", "fnv", @@ -806,6 +818,9 @@ name = 
"hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash", +] [[package]] name = "heck" @@ -839,9 +854,9 @@ checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" [[package]] name = "http" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" +checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" dependencies = [ "bytes", "fnv", @@ -1004,9 +1019,9 @@ checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" [[package]] name = "is-terminal" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22e18b0a45d56fe973d6db23972bf5bc46f988a4a2385deac9cc29572f09daef" +checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857" dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", @@ -1093,6 +1108,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "mach" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b823e83b2affd8f40a9ee8c29dbc56404c1e34cd2710921f2801e2cf29527afa" +dependencies = [ + "libc", +] + [[package]] name = "macro_rules_attribute" version = "0.1.3" @@ -1132,13 +1156,71 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" -version = "0.7.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" dependencies = [ "autocfg", ] +[[package]] +name = "metrics" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b9b8653cec6897f73b519a43fba5ee3d50f62fe9af80b428accdcc093b4a849" +dependencies = [ + "ahash", + "metrics-macros", + "portable-atomic", +] + +[[package]] +name = "metrics-exporter-prometheus" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8603921e1f54ef386189335f288441af761e0fc61bcb552168d9cedfe63ebc70" +dependencies = [ + "hyper", + "indexmap", + "ipnet", + "metrics", + "metrics-util", + "parking_lot", + "portable-atomic", + "quanta", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "metrics-macros" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731f8ecebd9f3a4aa847dfe75455e4757a45da40a7793d2f0b1f9b6ed18b23f3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "metrics-util" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d24dc2dbae22bff6f1f9326ffce828c9f07ef9cc1e8002e5279f845432a30a" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown", + "metrics", + "num_cpus", + "parking_lot", + "portable-atomic", + "quanta", + "sketches-ddsketch", +] + [[package]] name = "mime" version = "0.3.16" @@ -1172,14 +1254,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" +checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" dependencies = 
[ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -1268,9 +1350,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "once_cell" -version = "1.17.0" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] name = "onig" @@ -1514,6 +1596,12 @@ version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +[[package]] +name = "portable-atomic" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26f6a7b87c2e435a3241addceeeff740ff8b7e76b74c13bf9acb17fa454ea00b" + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -1565,9 +1653,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21dc42e00223fc37204bd4aa177e69420c604ca4a183209a8f9de30c6d934698" +checksum = "e48e50df39172a3e7eb17e14642445da64996989bc212b583015435d39a58537" dependencies = [ "bytes", "prost-derive", @@ -1575,9 +1663,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f8ad728fb08fe212df3c05169e940fbb6d9d16a877ddde14644a983ba2012e" +checksum = "2c828f93f5ca4826f97fedcbd3f9a536c16b12cff3dbbb4a007f932bbad95b12" dependencies = [ "bytes", "heck", @@ -1597,9 +1685,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bda8c0881ea9f722eb9629376db3d0b903b462477c1aafcb0566610ac28ac5d" +checksum = "4ea9b0f8cbe5e15a8a042d030bd96668db28ecb567ec37d691971ff5731d2b1b" dependencies = [ "anyhow", "itertools 0.10.5", @@ -1610,14 +1698,29 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5e0526209433e96d83d750dd81a99118edbc55739e7e61a46764fd2ad537788" +checksum = "379119666929a1afd7a043aa6cf96fa67a6dce9af60c88095a4686dbce4c9c88" dependencies = [ - "bytes", "prost", ] +[[package]] +name = "quanta" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7e31331286705f455e56cca62e0e717158474ff02b7936c1fa596d983f4ae27" +dependencies = [ + "crossbeam-utils", + "libc", + "mach", + "once_cell", + "raw-cpuid", + "wasi 0.10.2+wasi-snapshot-preview1", + "web-sys", + "winapi", +] + [[package]] name = "quote" version = "1.0.23" @@ -1657,6 +1760,15 @@ dependencies = [ "getrandom", ] +[[package]] +name = "raw-cpuid" +version = "10.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c297679cb867470fa8c9f67dbba74a78d78e3e98d7cf2b08d6d71540f797332" +dependencies = [ + "bitflags", +] + [[package]] name = "rayon" version = "1.6.1" @@ -1736,15 +1848,6 @@ version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi", -] - [[package]] name = "reqwest" version = "0.11.14" @@ -1973,18 +2076,24 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" dependencies = [ "libc", ] [[package]] -name = "slab" -version = "0.4.7" +name = "sketches-ddsketch" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +checksum = "ceb945e54128e09c43d8e4f1277851bd5044c6fc540bbaa2ad888f60b3da9ae7" + +[[package]] +name = "slab" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" dependencies = [ "autocfg", ] @@ -1997,9 +2106,9 @@ checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" +checksum = "95a21dcece9b5991cfd1ece74654c8e3d0d5aab499d359b0395e38229c0bb5a3" dependencies = [ "libc", "winapi", @@ -2053,9 +2162,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.107" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", @@ -2081,16 +2190,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" dependencies = [ "cfg-if", "fastrand", - "libc", "redox_syscall", - "remove_dir_all", - "winapi", + "rustix", + "windows-sys 0.42.0", ] [[package]] @@ -2104,7 +2212,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "0.2.1" +version = "0.4.0" dependencies = [ "futures", "grpc-metadata", @@ -2121,9 +2229,9 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "0.2.1" +version = "0.4.0" dependencies = [ - "clap 4.1.4", + "clap 4.1.8", "ctrlc", "float_eq", "reqwest", @@ -2136,18 +2244,21 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "0.2.1" +version = "0.4.0" dependencies = [ "async-stream", "axum", "axum-tracing-opentelemetry", - "clap 4.1.4", + "clap 4.1.8", "futures", + "metrics", + "metrics-exporter-prometheus", "nohash-hasher", "opentelemetry", "opentelemetry-otlp", "parking_lot", "rand", + "reqwest", "serde", "serde_json", "text-generation-client", @@ -2155,6 +2266,7 @@ dependencies = [ "tokenizers", "tokio", "tokio-stream", + "tower-http 0.3.5", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -2193,9 +2305,9 @@ dependencies = [ [[package]] name = "thread_local" -version = "1.1.6" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f297120ff9d4efe680df143d5631bba9c75fa371992b7fcb33eb3453cb0a07" +checksum 
= "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ "cfg-if", "once_cell", @@ -2203,12 +2315,11 @@ dependencies = [ [[package]] name = "time" -version = "0.1.45" +version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" dependencies = [ "libc", - "wasi 0.10.0+wasi-snapshot-preview1", "winapi", ] @@ -2264,9 +2375,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.25.0" +version = "1.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e00990ebabbe4c14c08aca901caed183ecd5c09562a12c824bb53d3c3fd3af" +checksum = "03201d01c3c27a29c8a5cee5b55a93ddae1ccf6f08f65365c2c918f8c1b76f64" dependencies = [ "autocfg", "bytes", @@ -2279,7 +2390,7 @@ dependencies = [ "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -2315,9 +2426,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" +checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" dependencies = [ "futures-core", "pin-project-lite", @@ -2326,9 +2437,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6a3b08b64e6dfad376fa2432c7b1f01522e37a623c3050bc95db2d3ff21583" +checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" dependencies = [ "bytes", "futures-core", @@ -2417,12 +2528,30 @@ dependencies = [ "http-body", "http-range-header", "pin-project-lite", - "tower", "tower-layer", "tower-service", "tracing", ] +[[package]] +name = "tower-http" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d1d42a9b3f3ec46ba828e8d376aec14592ea199f70a06a548587ecd1c4ab658" +dependencies = [ + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.2" @@ -2627,9 +2756,9 @@ dependencies = [ [[package]] name = "utoipa" -version = "3.0.1" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3920fa753064b1be7842bea26175ffa0dfc4a8f30bcb52b8ff03fddf8889914c" +checksum = "a15f6da6a2b471134ca44b7d18e8a76d73035cf8b3ed24c4dd5ca6a63aa439c5" dependencies = [ "indexmap", "serde", @@ -2639,9 +2768,9 @@ dependencies = [ [[package]] name = "utoipa-gen" -version = "3.0.1" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "720298fac6efca20df9e457e67a1eab41a20d1c3101380b5c4dca1ca60ae0062" +checksum = "6f2e33027986a4707b3f5c37ed01b33d0e5a53da30204b52ff18f80600f1d0ec" dependencies = [ "proc-macro-error", "proc-macro2", @@ -2712,9 +2841,9 @@ dependencies = [ [[package]] name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" +version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasi" diff 
--git a/Dockerfile b/Dockerfile index 907379dc..5fbf8985 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,15 @@ -FROM rust:1.67 as router-builder +FROM lukemathwalker/cargo-chef:latest-rust-1.67 AS chef +WORKDIR /usr/src + +FROM chef as planner +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY proto proto +COPY router router +COPY launcher launcher +RUN cargo chef prepare --recipe-path recipe.json + +FROM chef AS builder RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ @@ -6,26 +17,15 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ rm -f $PROTOC_ZIP -WORKDIR /usr/src +COPY --from=planner /usr/src/recipe.json recipe.json +RUN cargo chef cook --release --recipe-path recipe.json +COPY Cargo.toml Cargo.toml COPY rust-toolchain.toml rust-toolchain.toml COPY proto proto COPY router router - -WORKDIR /usr/src/router - -RUN cargo install --path . - -FROM rust:1.67 as launcher-builder - -WORKDIR /usr/src - -COPY rust-toolchain.toml rust-toolchain.toml COPY launcher launcher - -WORKDIR /usr/src/launcher - -RUN cargo install --path . +RUN cargo build --release FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 @@ -33,6 +33,7 @@ ENV LANG=C.UTF-8 \ LC_ALL=C.UTF-8 \ DEBIAN_FRONTEND=noninteractive \ HUGGINGFACE_HUB_CACHE=/data \ + HF_HUB_ENABLE_HF_TRANSFER=1 \ MODEL_ID=bigscience/bloom-560m \ QUANTIZE=false \ NUM_SHARD=1 \ @@ -68,9 +69,9 @@ RUN cd server && \ /opt/miniconda/envs/text-generation/bin/pip install ".[bnb]" --no-cache-dir # Install router -COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/local/bin/text-generation-router +COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router # Install launcher -COPY --from=launcher-builder /usr/local/cargo/bin/text-generation-launcher /usr/local/bin/text-generation-launcher +COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher ENTRYPOINT ["text-generation-launcher"] CMD ["--json-output"] \ No newline at end of file diff --git a/Makefile b/Makefile index 08217730..3defd886 100644 --- a/Makefile +++ b/Makefile @@ -13,25 +13,25 @@ server-dev: cd server && make run-dev router-dev: - cd router && cargo run + cd router && cargo run -- --port 8080 integration-tests: install-router install-launcher cargo test python-tests: - cd server && pytest tests + cd server && HF_HUB_ENABLE_HF_TRANSFER=1 pytest tests run-bloom-560m: - text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 + text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --port 8080 run-bloom-560m-quantize: - text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize + text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize --port 8080 download-bloom: - text-generation-server download-weights bigscience/bloom + HF_HUB_ENABLE_HF_TRANSFER=1 text-generation-server download-weights bigscience/bloom run-bloom: - text-generation-launcher --model-id bigscience/bloom --num-shard 8 + text-generation-launcher --model-id bigscience/bloom --num-shard 8 --port 8080 run-bloom-quantize: - text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize \ No newline at end of file + text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080 \ No newline at end of file diff --git a/README.md 
b/README.md index 040d088c..ee724487 100644 --- a/README.md +++ b/README.md @@ -39,27 +39,30 @@ to power LLMs api-inference widgets. ## Features +- Serve the most popular Large Language Models with a simple launcher +- Tensor Parallelism for faster inference on multiple GPUs - Token streaming using Server-Sent Events (SSE) - [Dynamic batching of incoming requests](https://github.com/huggingface/text-generation-inference/blob/main/router/src/batcher.rs#L88) for increased total throughput - Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) - [Safetensors](https://github.com/huggingface/safetensors) weight loading -- 45ms per token generation for BLOOM with 8xA100 80GB +- Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) - Logits warpers (temperature scaling, topk, repetition penalty ...) - Stop sequences - Log probabilities -- Distributed tracing with Open Telemetry +- Production ready (distributed tracing with Open Telemetry, Prometheus metrics) -## Officially supported models +## Officially supported architectures - [BLOOM](https://huggingface.co/bigscience/bloom) - [BLOOMZ](https://huggingface.co/bigscience/bloomz) - [MT0-XXL](https://huggingface.co/bigscience/mt0-xxl) -- ~~[Galactica](https://huggingface.co/facebook/galactica-120b)~~ (deactivated) +- [Galactica](https://huggingface.co/facebook/galactica-120b) - [SantaCoder](https://huggingface.co/bigcode/santacoder) - [GPT-Neox 20B](https://huggingface.co/EleutherAI/gpt-neox-20b) - [FLAN-T5-XXL](https://huggingface.co/google/flan-t5-xxl) +- [FLAN-UL2](https://huggingface.co/google/flan-ul2) -Other models are supported on a best effort basis using: +Other architectures are supported on a best effort basis using: `AutoModelForCausalLM.from_pretrained(<model>, device_map="auto")` @@ -80,24 +83,42 @@ volume=$PWD/data # share a volume with the Docker container to avoid downloading docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --num-shard $num_shard ``` +**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher. You can then query the model using either the `/generate` or `/generate_stream` routes: ```shell curl 127.0.0.1:8080/generate \ -X POST \ - -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \ -H 'Content-Type: application/json' ``` ```shell curl 127.0.0.1:8080/generate_stream \ -X POST \ - -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \ -H 'Content-Type: application/json' ``` -**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
+or from Python: + +```shell +pip install text-generation +``` + +```python +from text_generation import Client + +client = Client("http://127.0.0.1:8080") +print(client.generate("What is Deep Learning?", max_new_tokens=17).generated_text) + +text = "" +for response in client.generate_stream("What is Deep Learning?", max_new_tokens=17): + if not response.token.special: + text += response.token.text +print(text) +``` ### API documentation @@ -191,7 +212,7 @@ Be aware that the official Docker image has them enabled by default. ### Download -First you need to download the weights: +It is advised to download the weights ahead of time with the following command: ```shell make download-bloom diff --git a/clients/python/.gitignore b/clients/python/.gitignore new file mode 100644 index 00000000..5758ba92 --- /dev/null +++ b/clients/python/.gitignore @@ -0,0 +1,158 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +text_generation/__pycache__/ +text_generation/pb/__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +transformers +safetensors \ No newline at end of file diff --git a/clients/python/Makefile b/clients/python/Makefile new file mode 100644 index 00000000..8b4334bd --- /dev/null +++ b/clients/python/Makefile @@ -0,0 +1,6 @@ +unit-tests: + python -m pytest --cov=text_generation tests + +install: + pip install pip --upgrade + pip install -e . \ No newline at end of file diff --git a/clients/python/README.md b/clients/python/README.md new file mode 100644 index 00000000..f509e65c --- /dev/null +++ b/clients/python/README.md @@ -0,0 +1,196 @@ +# Text Generation + +The Hugging Face Text Generation Python library provides a convenient way of interfacing with a +`text-generation-inference` instance running on +[Hugging Face Inference Endpoints](https://huggingface.co/inference-endpoints) or on the Hugging Face Hub. + +## Get Started + +### Install + +```shell +pip install text-generation +``` + +### Inference API Usage + +```python +from text_generation import InferenceAPIClient + +client = InferenceAPIClient("bigscience/bloomz") +text = client.generate("Why is the sky blue?").generated_text +print(text) +# ' Rayleigh scattering' + +# Token Streaming +text = "" +for response in client.generate_stream("Why is the sky blue?"): + if not response.token.special: + text += response.token.text + +print(text) +# ' Rayleigh scattering' +``` + +or with the asynchronous client: + +```python +from text_generation import InferenceAPIAsyncClient + +client = InferenceAPIAsyncClient("bigscience/bloomz") +response = await client.generate("Why is the sky blue?") +print(response.generated_text) +# ' Rayleigh scattering' + +# Token Streaming +text = "" +async for response in client.generate_stream("Why is the sky blue?"): + if not response.token.special: + text += response.token.text + +print(text) +# ' Rayleigh scattering' +``` + +### Hugging Face Inference Endpoint usage + +```python +from text_generation import Client + +endpoint_url = "https://YOUR_ENDPOINT.endpoints.huggingface.cloud" + +client = Client(endpoint_url) +text = client.generate("Why is the sky blue?").generated_text +print(text) +# ' Rayleigh scattering' + +# Token Streaming +text = "" +for response in client.generate_stream("Why is the sky blue?"): + if not response.token.special: + text += response.token.text + +print(text) +# ' Rayleigh scattering' +``` + +or with the asynchronous client: + +```python +from text_generation import AsyncClient + +endpoint_url = "https://YOUR_ENDPOINT.endpoints.huggingface.cloud" + +client = AsyncClient(endpoint_url) +response = await client.generate("Why is the sky blue?") +print(response.generated_text) +# ' Rayleigh scattering' + +# Token Streaming +text = "" +async for response in client.generate_stream("Why is the sky blue?"): + if not response.token.special: + text += response.token.text + +print(text) +# ' Rayleigh scattering' +``` + +### Types + +```python +# Prompt tokens +class PrefillToken: + # Token ID from the model tokenizer + id: int + # Token text + text: str + # 
Logprob + # Optional since the logprob of the first token cannot be computed + logprob: Optional[float] + + +# Generated tokens +class Token: + # Token ID from the model tokenizer + id: int + # Token text + text: str + # Logprob + logprob: float + # Is the token a special token + # Can be used to ignore tokens when concatenating + special: bool + + +# Generation finish reason +class FinishReason(Enum): + # number of generated tokens == `max_new_tokens` + Length = "length" + # the model generated its end of sequence token + EndOfSequenceToken = "eos_token" + # the model generated a text included in `stop_sequences` + StopSequence = "stop_sequence" + + +# Additional sequences when using the `best_of` parameter +class BestOfSequence: + # Generated text + generated_text: str + # Generation finish reason + finish_reason: FinishReason + # Number of generated tokens + generated_tokens: int + # Sampling seed if sampling was activated + seed: Optional[int] + # Prompt tokens + prefill: List[PrefillToken] + # Generated tokens + tokens: List[Token] + + +# `generate` details +class Details: + # Generation finish reason + finish_reason: FinishReason + # Number of generated tokens + generated_tokens: int + # Sampling seed if sampling was activated + seed: Optional[int] + # Prompt tokens + prefill: List[PrefillToken] + # Generated tokens + tokens: List[Token] + # Additional sequences when using the `best_of` parameter + best_of_sequences: Optional[List[BestOfSequence]] + + +# `generate` return value +class Response: + # Generated text + generated_text: str + # Generation details + details: Details + + +# `generate_stream` details +class StreamDetails: + # Generation finish reason + finish_reason: FinishReason + # Number of generated tokens + generated_tokens: int + # Sampling seed if sampling was activated + seed: Optional[int] + + +# `generate_stream` return value +class StreamResponse: + # Generated token + token: Token + # Complete generated text + # Only available when the generation is finished + generated_text: Optional[str] + # Generation details + # Only available when the generation is finished + details: Optional[StreamDetails] +``` \ No newline at end of file diff --git a/clients/python/poetry.lock b/clients/python/poetry.lock new file mode 100644 index 00000000..1503c40e --- /dev/null +++ b/clients/python/poetry.lock @@ -0,0 +1,1038 @@ +[[package]] +name = "aiohttp" +version = "3.8.4" +description = "Async http client/server framework (asyncio)" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +aiosignal = ">=1.1.2" +async-timeout = ">=4.0.0a3,<5.0" +asynctest = {version = "0.13.0", markers = "python_version < \"3.8\""} +attrs = ">=17.3.0" +charset-normalizer = ">=2.0,<4.0" +frozenlist = ">=1.1.1" +multidict = ">=4.5,<7.0" +typing-extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["Brotli", "aiodns", "cchardet"] + +[[package]] +name = "aiosignal" +version = "1.3.1" +description = "aiosignal: a list of registered asynchronous callbacks" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +frozenlist = ">=1.1.0" + +[[package]] +name = "async-timeout" +version = "4.0.2" +description = "Timeout context manager for asyncio programs" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +typing-extensions = {version = ">=3.6.5", markers = "python_version < \"3.8\""} + +[[package]] +name = "asynctest" +version = 
"0.13.0" +description = "Enhance the standard unittest package with features for testing asyncio libraries" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "atomicwrites" +version = "1.4.1" +description = "Atomic file writes." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "22.2.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] +dev = ["attrs[docs,tests]"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] +tests = ["attrs[tests-no-zope]", "zope.interface"] +tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=0.971,<0.990)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests_no_zope = ["cloudpickle", "hypothesis", "mypy (>=0.971,<0.990)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] + +[[package]] +name = "certifi" +version = "2022.12.7" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "charset-normalizer" +version = "3.1.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" +optional = false +python-versions = ">=3.7.0" + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" + +[[package]] +name = "coverage" +version = "7.2.1" +description = "Code coverage measurement for Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + +[package.extras] +toml = ["tomli"] + +[[package]] +name = "filelock" +version = "3.10.0" +description = "A platform independent file lock." 
+category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2022.12.7)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.2.1)", "pytest (>=7.2.2)", "pytest-cov (>=4)", "pytest-timeout (>=2.1)"] + +[[package]] +name = "frozenlist" +version = "1.3.3" +description = "A list-like structure which implements collections.abc.MutableSequence" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "huggingface-hub" +version = "0.13.2" +description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" +category = "main" +optional = false +python-versions = ">=3.7.0" + +[package.dependencies] +filelock = "*" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} +packaging = ">=20.9" +pyyaml = ">=5.1" +requests = "*" +tqdm = ">=4.42.1" +typing-extensions = ">=3.7.4.3" + +[package.extras] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +cli = ["InquirerPy (==0.3.4)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "black (>=23.1,<24.0)", "jedi", "mypy (==0.982)", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] +quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"] +tensorflow = ["graphviz", "pydot", "tensorflow"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "jedi", "pytest", "pytest-cov", "pytest-env", "pytest-xdist", "soundfile"] +torch = ["torch"] +typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "importlib-metadata" +version = "6.0.0" +description = "Read metadata from Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} +zipp = ">=0.5" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +perf = ["ipython"] +testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "packaging" +version = "23.0" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and 
hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "pydantic" +version = "1.10.6" +description = "Data validation and settings management using python type hints" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = ">=4.2.0" + +[package.extras] +dotenv = ["python-dotenv (>=0.10.4)"] +email = ["email-validator (>=1.0.3)"] + +[[package]] +name = "pytest" +version = "6.2.5" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +name = "pytest-asyncio" +version = "0.17.2" +description = "Pytest support for asyncio" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +pytest = ">=6.1.0" +typing-extensions = {version = ">=4.0", markers = "python_version < \"3.8\""} + +[package.extras] +testing = ["coverage (==6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (==0.931)"] + +[[package]] +name = "pytest-cov" +version = "3.0.0" +description = "Pytest plugin for measuring coverage." +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +coverage = {version = ">=5.2.1", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] + +[[package]] +name = "PyYAML" +version = "6.0" +description = "YAML parser and emitter for Python" +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "requests" +version = "2.28.2" +description = "Python HTTP for Humans." 
+category = "main" +optional = false +python-versions = ">=3.7, <4" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "tqdm" +version = "4.65.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "typing-extensions" +version = "4.5.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "urllib3" +version = "1.26.15" +description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[[package]] +name = "yarl" +version = "1.8.2" +description = "Yet another URL library" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" +typing-extensions = {version = ">=3.7.4", markers = "python_version < \"3.8\""} + +[[package]] +name = "zipp" +version = "3.15.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[metadata] +lock-version = "1.1" +python-versions = "^3.7" +content-hash = "0db2f97d52c557dd7f90c55b4ad5bbe308c957c5f7f99fec53c57e0a13822cb4" + +[metadata.files] +aiohttp = [ + {file = "aiohttp-3.8.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5ce45967538fb747370308d3145aa68a074bdecb4f3a300869590f725ced69c1"}, + {file = "aiohttp-3.8.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b744c33b6f14ca26b7544e8d8aadff6b765a80ad6164fb1a430bbadd593dfb1a"}, + {file = "aiohttp-3.8.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a45865451439eb320784918617ba54b7a377e3501fb70402ab84d38c2cd891b"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a86d42d7cba1cec432d47ab13b6637bee393a10f664c425ea7b305d1301ca1a3"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ee3c36df21b5714d49fc4580247947aa64bcbe2939d1b77b4c8dcb8f6c9faecc"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:176a64b24c0935869d5bbc4c96e82f89f643bcdf08ec947701b9dbb3c956b7dd"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c844fd628851c0bc309f3c801b3a3d58ce430b2ce5b359cd918a5a76d0b20cb5"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5393fb786a9e23e4799fec788e7e735de18052f83682ce2dfcabaf1c00c2c08e"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e4b09863aae0dc965c3ef36500d891a3ff495a2ea9ae9171e4519963c12ceefd"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:adfbc22e87365a6e564c804c58fc44ff7727deea782d175c33602737b7feadb6"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:147ae376f14b55f4f3c2b118b95be50a369b89b38a971e80a17c3fd623f280c9"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:eafb3e874816ebe2a92f5e155f17260034c8c341dad1df25672fb710627c6949"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c6cc15d58053c76eacac5fa9152d7d84b8d67b3fde92709195cb984cfb3475ea"}, + {file = "aiohttp-3.8.4-cp310-cp310-win32.whl", hash = "sha256:59f029a5f6e2d679296db7bee982bb3d20c088e52a2977e3175faf31d6fb75d1"}, + {file = "aiohttp-3.8.4-cp310-cp310-win_amd64.whl", hash = "sha256:fe7ba4a51f33ab275515f66b0a236bcde4fb5561498fe8f898d4e549b2e4509f"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3d8ef1a630519a26d6760bc695842579cb09e373c5f227a21b67dc3eb16cfea4"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b3f2e06a512e94722886c0827bee9807c86a9f698fac6b3aee841fab49bbfb4"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3a80464982d41b1fbfe3154e440ba4904b71c1a53e9cd584098cd41efdb188ef"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b631e26df63e52f7cce0cce6507b7a7f1bc9b0c501fcde69742130b32e8782f"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f43255086fe25e36fd5ed8f2ee47477408a73ef00e804cb2b5cba4bf2ac7f5e"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4d347a172f866cd1d93126d9b239fcbe682acb39b48ee0873c73c933dd23bd0f"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3fec6a4cb5551721cdd70473eb009d90935b4063acc5f40905d40ecfea23e05"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80a37fe8f7c1e6ce8f2d9c411676e4bc633a8462844e38f46156d07a7d401654"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d1e6a862b76f34395a985b3cd39a0d949ca80a70b6ebdea37d3ab39ceea6698a"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cd468460eefef601ece4428d3cf4562459157c0f6523db89365202c31b6daebb"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:618c901dd3aad4ace71dfa0f5e82e88b46ef57e3239fc7027773cb6d4ed53531"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:652b1bff4f15f6287550b4670546a2947f2a4575b6c6dff7760eafb22eacbf0b"}, + {file = 
"aiohttp-3.8.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80575ba9377c5171407a06d0196b2310b679dc752d02a1fcaa2bc20b235dbf24"}, + {file = "aiohttp-3.8.4-cp311-cp311-win32.whl", hash = "sha256:bbcf1a76cf6f6dacf2c7f4d2ebd411438c275faa1dc0c68e46eb84eebd05dd7d"}, + {file = "aiohttp-3.8.4-cp311-cp311-win_amd64.whl", hash = "sha256:6e74dd54f7239fcffe07913ff8b964e28b712f09846e20de78676ce2a3dc0bfc"}, + {file = "aiohttp-3.8.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:880e15bb6dad90549b43f796b391cfffd7af373f4646784795e20d92606b7a51"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb96fa6b56bb536c42d6a4a87dfca570ff8e52de2d63cabebfd6fb67049c34b6"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4a6cadebe132e90cefa77e45f2d2f1a4b2ce5c6b1bfc1656c1ddafcfe4ba8131"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f352b62b45dff37b55ddd7b9c0c8672c4dd2eb9c0f9c11d395075a84e2c40f75"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ab43061a0c81198d88f39aaf90dae9a7744620978f7ef3e3708339b8ed2ef01"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9cb1565a7ad52e096a6988e2ee0397f72fe056dadf75d17fa6b5aebaea05622"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:1b3ea7edd2d24538959c1c1abf97c744d879d4e541d38305f9bd7d9b10c9ec41"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:7c7837fe8037e96b6dd5cfcf47263c1620a9d332a87ec06a6ca4564e56bd0f36"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:3b90467ebc3d9fa5b0f9b6489dfb2c304a1db7b9946fa92aa76a831b9d587e99"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:cab9401de3ea52b4b4c6971db5fb5c999bd4260898af972bf23de1c6b5dd9d71"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:d1f9282c5f2b5e241034a009779e7b2a1aa045f667ff521e7948ea9b56e0c5ff"}, + {file = "aiohttp-3.8.4-cp36-cp36m-win32.whl", hash = "sha256:5e14f25765a578a0a634d5f0cd1e2c3f53964553a00347998dfdf96b8137f777"}, + {file = "aiohttp-3.8.4-cp36-cp36m-win_amd64.whl", hash = "sha256:4c745b109057e7e5f1848c689ee4fb3a016c8d4d92da52b312f8a509f83aa05e"}, + {file = "aiohttp-3.8.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:aede4df4eeb926c8fa70de46c340a1bc2c6079e1c40ccf7b0eae1313ffd33519"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ddaae3f3d32fc2cb4c53fab020b69a05c8ab1f02e0e59665c6f7a0d3a5be54f"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4eb3b82ca349cf6fadcdc7abcc8b3a50ab74a62e9113ab7a8ebc268aad35bb9"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bcb89336efa095ea21b30f9e686763f2be4478f1b0a616969551982c4ee4c3b"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c08e8ed6fa3d477e501ec9db169bfac8140e830aa372d77e4a43084d8dd91ab"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c6cd05ea06daca6ad6a4ca3ba7fe7dc5b5de063ff4daec6170ec0f9979f6c332"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = 
"sha256:b7a00a9ed8d6e725b55ef98b1b35c88013245f35f68b1b12c5cd4100dddac333"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:de04b491d0e5007ee1b63a309956eaed959a49f5bb4e84b26c8f5d49de140fa9"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:40653609b3bf50611356e6b6554e3a331f6879fa7116f3959b20e3528783e699"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dbf3a08a06b3f433013c143ebd72c15cac33d2914b8ea4bea7ac2c23578815d6"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:854f422ac44af92bfe172d8e73229c270dc09b96535e8a548f99c84f82dde241"}, + {file = "aiohttp-3.8.4-cp37-cp37m-win32.whl", hash = "sha256:aeb29c84bb53a84b1a81c6c09d24cf33bb8432cc5c39979021cc0f98c1292a1a"}, + {file = "aiohttp-3.8.4-cp37-cp37m-win_amd64.whl", hash = "sha256:db3fc6120bce9f446d13b1b834ea5b15341ca9ff3f335e4a951a6ead31105480"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fabb87dd8850ef0f7fe2b366d44b77d7e6fa2ea87861ab3844da99291e81e60f"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:91f6d540163f90bbaef9387e65f18f73ffd7c79f5225ac3d3f61df7b0d01ad15"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d265f09a75a79a788237d7f9054f929ced2e69eb0bb79de3798c468d8a90f945"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d89efa095ca7d442a6d0cbc755f9e08190ba40069b235c9886a8763b03785da"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4dac314662f4e2aa5009977b652d9b8db7121b46c38f2073bfeed9f4049732cd"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe11310ae1e4cd560035598c3f29d86cef39a83d244c7466f95c27ae04850f10"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ddb2a2026c3f6a68c3998a6c47ab6795e4127315d2e35a09997da21865757f8"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e75b89ac3bd27d2d043b234aa7b734c38ba1b0e43f07787130a0ecac1e12228a"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6e601588f2b502c93c30cd5a45bfc665faaf37bbe835b7cfd461753068232074"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a5d794d1ae64e7753e405ba58e08fcfa73e3fad93ef9b7e31112ef3c9a0efb52"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a1f4689c9a1462f3df0a1f7e797791cd6b124ddbee2b570d34e7f38ade0e2c71"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3032dcb1c35bc330134a5b8a5d4f68c1a87252dfc6e1262c65a7e30e62298275"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8189c56eb0ddbb95bfadb8f60ea1b22fcfa659396ea36f6adcc521213cd7b44d"}, + {file = "aiohttp-3.8.4-cp38-cp38-win32.whl", hash = "sha256:33587f26dcee66efb2fff3c177547bd0449ab7edf1b73a7f5dea1e38609a0c54"}, + {file = "aiohttp-3.8.4-cp38-cp38-win_amd64.whl", hash = "sha256:e595432ac259af2d4630008bf638873d69346372d38255774c0e286951e8b79f"}, + {file = "aiohttp-3.8.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5a7bdf9e57126dc345b683c3632e8ba317c31d2a41acd5800c10640387d193ed"}, + {file = "aiohttp-3.8.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:22f6eab15b6db242499a16de87939a342f5a950ad0abaf1532038e2ce7d31567"}, + {file = 
"aiohttp-3.8.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7235604476a76ef249bd64cb8274ed24ccf6995c4a8b51a237005ee7a57e8643"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea9eb976ffdd79d0e893869cfe179a8f60f152d42cb64622fca418cd9b18dc2a"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92c0cea74a2a81c4c76b62ea1cac163ecb20fb3ba3a75c909b9fa71b4ad493cf"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:493f5bc2f8307286b7799c6d899d388bbaa7dfa6c4caf4f97ef7521b9cb13719"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a63f03189a6fa7c900226e3ef5ba4d3bd047e18f445e69adbd65af433add5a2"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10c8cefcff98fd9168cdd86c4da8b84baaa90bf2da2269c6161984e6737bf23e"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bca5f24726e2919de94f047739d0a4fc01372801a3672708260546aa2601bf57"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:03baa76b730e4e15a45f81dfe29a8d910314143414e528737f8589ec60cf7391"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8c29c77cc57e40f84acef9bfb904373a4e89a4e8b74e71aa8075c021ec9078c2"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:03543dcf98a6619254b409be2d22b51f21ec66272be4ebda7b04e6412e4b2e14"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:17b79c2963db82086229012cff93ea55196ed31f6493bb1ccd2c62f1724324e4"}, + {file = "aiohttp-3.8.4-cp39-cp39-win32.whl", hash = "sha256:34ce9f93a4a68d1272d26030655dd1b58ff727b3ed2a33d80ec433561b03d67a"}, + {file = "aiohttp-3.8.4-cp39-cp39-win_amd64.whl", hash = "sha256:41a86a69bb63bb2fc3dc9ad5ea9f10f1c9c8e282b471931be0268ddd09430b04"}, + {file = "aiohttp-3.8.4.tar.gz", hash = "sha256:bf2e1a9162c1e441bf805a1fd166e249d574ca04e03b34f97e2928769e91ab5c"}, +] +aiosignal = [ + {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, + {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, +] +async-timeout = [ + {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, + {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, +] +asynctest = [ + {file = "asynctest-0.13.0-py3-none-any.whl", hash = "sha256:5da6118a7e6d6b54d83a8f7197769d046922a44d2a99c21382f0a6e4fadae676"}, + {file = "asynctest-0.13.0.tar.gz", hash = "sha256:c27862842d15d83e6a34eb0b2866c323880eb3a75e4485b079ea11748fd77fac"}, +] +atomicwrites = [ + {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, +] +attrs = [ + {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, + {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, +] +certifi = [ + {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, + {file = "certifi-2022.12.7.tar.gz", hash = 
"sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, +] +charset-normalizer = [ + {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, + 
{file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, + {file = 
"charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, + {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, +] +colorama = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] +coverage = [ + {file = "coverage-7.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:49567ec91fc5e0b15356da07a2feabb421d62f52a9fff4b1ec40e9e19772f5f8"}, + {file = "coverage-7.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d2ef6cae70168815ed91388948b5f4fcc69681480a0061114db737f957719f03"}, + {file = "coverage-7.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3004765bca3acd9e015794e5c2f0c9a05587f5e698127ff95e9cfba0d3f29339"}, + {file = "coverage-7.2.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cca7c0b7f5881dfe0291ef09ba7bb1582cb92ab0aeffd8afb00c700bf692415a"}, + {file = "coverage-7.2.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2167d116309f564af56f9aa5e75ef710ef871c5f9b313a83050035097b56820"}, + {file = "coverage-7.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:cb5f152fb14857cbe7f3e8c9a5d98979c4c66319a33cad6e617f0067c9accdc4"}, + {file = "coverage-7.2.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:87dc37f16fb5e3a28429e094145bf7c1753e32bb50f662722e378c5851f7fdc6"}, + {file = "coverage-7.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e191a63a05851f8bce77bc875e75457f9b01d42843f8bd7feed2fc26bbe60833"}, + {file = "coverage-7.2.1-cp310-cp310-win32.whl", hash = 
"sha256:e3ea04b23b114572b98a88c85379e9e9ae031272ba1fb9b532aa934c621626d4"}, + {file = "coverage-7.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:0cf557827be7eca1c38a2480484d706693e7bb1929e129785fe59ec155a59de6"}, + {file = "coverage-7.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:570c21a29493b350f591a4b04c158ce1601e8d18bdcd21db136fbb135d75efa6"}, + {file = "coverage-7.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9e872b082b32065ac2834149dc0adc2a2e6d8203080501e1e3c3c77851b466f9"}, + {file = "coverage-7.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fac6343bae03b176e9b58104a9810df3cdccd5cfed19f99adfa807ffbf43cf9b"}, + {file = "coverage-7.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:abacd0a738e71b20e224861bc87e819ef46fedba2fb01bc1af83dfd122e9c319"}, + {file = "coverage-7.2.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9256d4c60c4bbfec92721b51579c50f9e5062c21c12bec56b55292464873508"}, + {file = "coverage-7.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:80559eaf6c15ce3da10edb7977a1548b393db36cbc6cf417633eca05d84dd1ed"}, + {file = "coverage-7.2.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0bd7e628f6c3ec4e7d2d24ec0e50aae4e5ae95ea644e849d92ae4805650b4c4e"}, + {file = "coverage-7.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:09643fb0df8e29f7417adc3f40aaf379d071ee8f0350ab290517c7004f05360b"}, + {file = "coverage-7.2.1-cp311-cp311-win32.whl", hash = "sha256:1b7fb13850ecb29b62a447ac3516c777b0e7a09ecb0f4bb6718a8654c87dfc80"}, + {file = "coverage-7.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:617a94ada56bbfe547aa8d1b1a2b8299e2ec1ba14aac1d4b26a9f7d6158e1273"}, + {file = "coverage-7.2.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8649371570551d2fd7dee22cfbf0b61f1747cdfb2b7587bb551e4beaaa44cb97"}, + {file = "coverage-7.2.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d2b9b5e70a21474c105a133ba227c61bc95f2ac3b66861143ce39a5ea4b3f84"}, + {file = "coverage-7.2.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae82c988954722fa07ec5045c57b6d55bc1a0890defb57cf4a712ced65b26ddd"}, + {file = "coverage-7.2.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:861cc85dfbf55a7a768443d90a07e0ac5207704a9f97a8eb753292a7fcbdfcfc"}, + {file = "coverage-7.2.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:0339dc3237c0d31c3b574f19c57985fcbe494280153bbcad33f2cdf469f4ac3e"}, + {file = "coverage-7.2.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:5928b85416a388dd557ddc006425b0c37e8468bd1c3dc118c1a3de42f59e2a54"}, + {file = "coverage-7.2.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8d3843ca645f62c426c3d272902b9de90558e9886f15ddf5efe757b12dd376f5"}, + {file = "coverage-7.2.1-cp37-cp37m-win32.whl", hash = "sha256:6a034480e9ebd4e83d1aa0453fd78986414b5d237aea89a8fdc35d330aa13bae"}, + {file = "coverage-7.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:6fce673f79a0e017a4dc35e18dc7bb90bf6d307c67a11ad5e61ca8d42b87cbff"}, + {file = "coverage-7.2.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7f099da6958ddfa2ed84bddea7515cb248583292e16bb9231d151cd528eab657"}, + {file = "coverage-7.2.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:97a3189e019d27e914ecf5c5247ea9f13261d22c3bb0cfcfd2a9b179bb36f8b1"}, + {file = 
"coverage-7.2.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a81dbcf6c6c877986083d00b834ac1e84b375220207a059ad45d12f6e518a4e3"}, + {file = "coverage-7.2.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d2c3dde4c0b9be4b02067185136b7ee4681978228ad5ec1278fa74f5ca3e99"}, + {file = "coverage-7.2.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a209d512d157379cc9ab697cbdbb4cfd18daa3e7eebaa84c3d20b6af0037384"}, + {file = "coverage-7.2.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f3d07edb912a978915576a776756069dede66d012baa503022d3a0adba1b6afa"}, + {file = "coverage-7.2.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8dca3c1706670297851bca1acff9618455122246bdae623be31eca744ade05ec"}, + {file = "coverage-7.2.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b1991a6d64231a3e5bbe3099fb0dd7c9aeaa4275ad0e0aeff4cb9ef885c62ba2"}, + {file = "coverage-7.2.1-cp38-cp38-win32.whl", hash = "sha256:22c308bc508372576ffa3d2dbc4824bb70d28eeb4fcd79d4d1aed663a06630d0"}, + {file = "coverage-7.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:b0c0d46de5dd97f6c2d1b560bf0fcf0215658097b604f1840365296302a9d1fb"}, + {file = "coverage-7.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4dd34a935de268a133e4741827ae951283a28c0125ddcdbcbba41c4b98f2dfef"}, + {file = "coverage-7.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0f8318ed0f3c376cfad8d3520f496946977abde080439d6689d7799791457454"}, + {file = "coverage-7.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:834c2172edff5a08d78e2f53cf5e7164aacabeb66b369f76e7bb367ca4e2d993"}, + {file = "coverage-7.2.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4d70c853f0546855f027890b77854508bdb4d6a81242a9d804482e667fff6e6"}, + {file = "coverage-7.2.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a6450da4c7afc4534305b2b7d8650131e130610cea448ff240b6ab73d7eab63"}, + {file = "coverage-7.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:99f4dd81b2bb8fc67c3da68b1f5ee1650aca06faa585cbc6818dbf67893c6d58"}, + {file = "coverage-7.2.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bdd3f2f285ddcf2e75174248b2406189261a79e7fedee2ceeadc76219b6faa0e"}, + {file = "coverage-7.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f29351393eb05e6326f044a7b45ed8e38cb4dcc38570d12791f271399dc41431"}, + {file = "coverage-7.2.1-cp39-cp39-win32.whl", hash = "sha256:e2b50ebc2b6121edf352336d503357321b9d8738bb7a72d06fc56153fd3f4cd8"}, + {file = "coverage-7.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:bd5a12239c0006252244f94863f1c518ac256160cd316ea5c47fb1a11b25889a"}, + {file = "coverage-7.2.1-pp37.pp38.pp39-none-any.whl", hash = "sha256:436313d129db7cf5b4ac355dd2bd3f7c7e5294af077b090b85de75f8458b8616"}, + {file = "coverage-7.2.1.tar.gz", hash = "sha256:c77f2a9093ccf329dd523a9b2b3c854c20d2a3d968b6def3b820272ca6732242"}, +] +filelock = [ + {file = "filelock-3.10.0-py3-none-any.whl", hash = "sha256:e90b34656470756edf8b19656785c5fea73afa1953f3e1b0d645cef11cab3182"}, + {file = "filelock-3.10.0.tar.gz", hash = "sha256:3199fd0d3faea8b911be52b663dfccceb84c95949dd13179aa21436d1a79c4ce"}, +] +frozenlist = [ + {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff8bf625fe85e119553b5383ba0fb6aa3d0ec2ae980295aaefa552374926b3f4"}, + {file = 
"frozenlist-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dfbac4c2dfcc082fcf8d942d1e49b6aa0766c19d3358bd86e2000bf0fa4a9cf0"}, + {file = "frozenlist-1.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b1c63e8d377d039ac769cd0926558bb7068a1f7abb0f003e3717ee003ad85530"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7fdfc24dcfce5b48109867c13b4cb15e4660e7bd7661741a391f821f23dfdca7"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c926450857408e42f0bbc295e84395722ce74bae69a3b2aa2a65fe22cb14b99"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1841e200fdafc3d51f974d9d377c079a0694a8f06de2e67b48150328d66d5483"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f470c92737afa7d4c3aacc001e335062d582053d4dbe73cda126f2d7031068dd"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:783263a4eaad7c49983fe4b2e7b53fa9770c136c270d2d4bbb6d2192bf4d9caf"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:924620eef691990dfb56dc4709f280f40baee568c794b5c1885800c3ecc69816"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ae4dc05c465a08a866b7a1baf360747078b362e6a6dbeb0c57f234db0ef88ae0"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:bed331fe18f58d844d39ceb398b77d6ac0b010d571cba8267c2e7165806b00ce"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:02c9ac843e3390826a265e331105efeab489ffaf4dd86384595ee8ce6d35ae7f"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9545a33965d0d377b0bc823dcabf26980e77f1b6a7caa368a365a9497fb09420"}, + {file = "frozenlist-1.3.3-cp310-cp310-win32.whl", hash = "sha256:d5cd3ab21acbdb414bb6c31958d7b06b85eeb40f66463c264a9b343a4e238642"}, + {file = "frozenlist-1.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:b756072364347cb6aa5b60f9bc18e94b2f79632de3b0190253ad770c5df17db1"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b4395e2f8d83fbe0c627b2b696acce67868793d7d9750e90e39592b3626691b7"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14143ae966a6229350021384870458e4777d1eae4c28d1a7aa47f24d030e6678"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d8860749e813a6f65bad8285a0520607c9500caa23fea6ee407e63debcdbef6"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23d16d9f477bb55b6154654e0e74557040575d9d19fe78a161bd33d7d76808e8"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb82dbba47a8318e75f679690190c10a5e1f447fbf9df41cbc4c3afd726d88cb"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9309869032abb23d196cb4e4db574232abe8b8be1339026f489eeb34a4acfd91"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a97b4fe50b5890d36300820abd305694cb865ddb7885049587a5678215782a6b"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:c188512b43542b1e91cadc3c6c915a82a5eb95929134faf7fd109f14f9892ce4"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:303e04d422e9b911a09ad499b0368dc551e8c3cd15293c99160c7f1f07b59a48"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0771aed7f596c7d73444c847a1c16288937ef988dc04fb9f7be4b2aa91db609d"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:66080ec69883597e4d026f2f71a231a1ee9887835902dbe6b6467d5a89216cf6"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:41fe21dc74ad3a779c3d73a2786bdf622ea81234bdd4faf90b8b03cad0c2c0b4"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f20380df709d91525e4bee04746ba612a4df0972c1b8f8e1e8af997e678c7b81"}, + {file = "frozenlist-1.3.3-cp311-cp311-win32.whl", hash = "sha256:f30f1928162e189091cf4d9da2eac617bfe78ef907a761614ff577ef4edfb3c8"}, + {file = "frozenlist-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:a6394d7dadd3cfe3f4b3b186e54d5d8504d44f2d58dcc89d693698e8b7132b32"}, + {file = "frozenlist-1.3.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8df3de3a9ab8325f94f646609a66cbeeede263910c5c0de0101079ad541af332"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0693c609e9742c66ba4870bcee1ad5ff35462d5ffec18710b4ac89337ff16e27"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd4210baef299717db0a600d7a3cac81d46ef0e007f88c9335db79f8979c0d3d"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:394c9c242113bfb4b9aa36e2b80a05ffa163a30691c7b5a29eba82e937895d5e"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6327eb8e419f7d9c38f333cde41b9ae348bec26d840927332f17e887a8dcb70d"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e24900aa13212e75e5b366cb9065e78bbf3893d4baab6052d1aca10d46d944c"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3843f84a6c465a36559161e6c59dce2f2ac10943040c2fd021cfb70d58c4ad56"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:84610c1502b2461255b4c9b7d5e9c48052601a8957cd0aea6ec7a7a1e1fb9420"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c21b9aa40e08e4f63a2f92ff3748e6b6c84d717d033c7b3438dd3123ee18f70e"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:efce6ae830831ab6a22b9b4091d411698145cb9b8fc869e1397ccf4b4b6455cb"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:40de71985e9042ca00b7953c4f41eabc3dc514a2d1ff534027f091bc74416401"}, + {file = "frozenlist-1.3.3-cp37-cp37m-win32.whl", hash = "sha256:180c00c66bde6146a860cbb81b54ee0df350d2daf13ca85b275123bbf85de18a"}, + {file = "frozenlist-1.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9bbbcedd75acdfecf2159663b87f1bb5cfc80e7cd99f7ddd9d66eb98b14a8411"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:034a5c08d36649591be1cbb10e09da9f531034acfe29275fc5454a3b101ce41a"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ba64dc2b3b7b158c6660d49cdb1d872d1d0bf4e42043ad8d5006099479a194e5"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:47df36a9fe24054b950bbc2db630d508cca3aa27ed0566c0baf661225e52c18e"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:008a054b75d77c995ea26629ab3a0c0d7281341f2fa7e1e85fa6153ae29ae99c"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:841ea19b43d438a80b4de62ac6ab21cfe6827bb8a9dc62b896acc88eaf9cecba"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e235688f42b36be2b6b06fc37ac2126a73b75fb8d6bc66dd632aa35286238703"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca713d4af15bae6e5d79b15c10c8522859a9a89d3b361a50b817c98c2fb402a2"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ac5995f2b408017b0be26d4a1d7c61bce106ff3d9e3324374d66b5964325448"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a4ae8135b11652b08a8baf07631d3ebfe65a4c87909dbef5fa0cdde440444ee4"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4ea42116ceb6bb16dbb7d526e242cb6747b08b7710d9782aa3d6732bd8d27649"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:810860bb4bdce7557bc0febb84bbd88198b9dbc2022d8eebe5b3590b2ad6c842"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ee78feb9d293c323b59a6f2dd441b63339a30edf35abcb51187d2fc26e696d13"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0af2e7c87d35b38732e810befb9d797a99279cbb85374d42ea61c1e9d23094b3"}, + {file = "frozenlist-1.3.3-cp38-cp38-win32.whl", hash = "sha256:899c5e1928eec13fd6f6d8dc51be23f0d09c5281e40d9cf4273d188d9feeaf9b"}, + {file = "frozenlist-1.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:7f44e24fa70f6fbc74aeec3e971f60a14dde85da364aa87f15d1be94ae75aeef"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2b07ae0c1edaa0a36339ec6cce700f51b14a3fc6545fdd32930d2c83917332cf"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ebb86518203e12e96af765ee89034a1dbb0c3c65052d1b0c19bbbd6af8a145e1"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5cf820485f1b4c91e0417ea0afd41ce5cf5965011b3c22c400f6d144296ccbc0"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c11e43016b9024240212d2a65043b70ed8dfd3b52678a1271972702d990ac6d"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8fa3c6e3305aa1146b59a09b32b2e04074945ffcfb2f0931836d103a2c38f936"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:352bd4c8c72d508778cf05ab491f6ef36149f4d0cb3c56b1b4302852255d05d5"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65a5e4d3aa679610ac6e3569e865425b23b372277f89b5ef06cf2cdaf1ebf22b"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e2c1185858d7e10ff045c496bbf90ae752c28b365fef2c09cf0fa309291669"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f163d2fd041c630fed01bc48d28c3ed4a3b003c00acd396900e11ee5316b56bb"}, + {file = 
"frozenlist-1.3.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:05cdb16d09a0832eedf770cb7bd1fe57d8cf4eaf5aced29c4e41e3f20b30a784"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8bae29d60768bfa8fb92244b74502b18fae55a80eac13c88eb0b496d4268fd2d"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eedab4c310c0299961ac285591acd53dc6723a1ebd90a57207c71f6e0c2153ab"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3bbdf44855ed8f0fbcd102ef05ec3012d6a4fd7c7562403f76ce6a52aeffb2b1"}, + {file = "frozenlist-1.3.3-cp39-cp39-win32.whl", hash = "sha256:efa568b885bca461f7c7b9e032655c0c143d305bf01c30caf6db2854a4532b38"}, + {file = "frozenlist-1.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:cfe33efc9cb900a4c46f91a5ceba26d6df370ffddd9ca386eb1d4f0ad97b9ea9"}, + {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, +] +huggingface-hub = [ + {file = "huggingface_hub-0.13.2-py3-none-any.whl", hash = "sha256:745c4cbd97a27fc5c1c6c89cb477662004c88bc3dd89bafc1a27ef24af77f944"}, + {file = "huggingface_hub-0.13.2.tar.gz", hash = "sha256:246e8eb39b6e6e9d9d5846e4b56c265cdf1872f48ba5a13a1321295d371626f5"}, +] +idna = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] +importlib-metadata = [ + {file = "importlib_metadata-6.0.0-py3-none-any.whl", hash = "sha256:7efb448ec9a5e313a57655d35aa54cd3e01b7e1fbcf72dce1bf06119420f5bad"}, + {file = "importlib_metadata-6.0.0.tar.gz", hash = "sha256:e354bedeb60efa6affdcc8ae121b73544a7aa74156d047311948f6d711cd378d"}, +] +iniconfig = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] +multidict = [ + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, + {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, + {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, + {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, + {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, + {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, + {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, + {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, + 
{file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, + {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, + {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, + {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, + {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, + {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, +] +packaging = [ + {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, + {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, +] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +py = [ + {file = 
"py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] +pydantic = [ + {file = "pydantic-1.10.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f9289065611c48147c1dd1fd344e9d57ab45f1d99b0fb26c51f1cf72cd9bcd31"}, + {file = "pydantic-1.10.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c32b6bba301490d9bb2bf5f631907803135e8085b6aa3e5fe5a770d46dd0160"}, + {file = "pydantic-1.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd9b9e98068fa1068edfc9eabde70a7132017bdd4f362f8b4fd0abed79c33083"}, + {file = "pydantic-1.10.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c84583b9df62522829cbc46e2b22e0ec11445625b5acd70c5681ce09c9b11c4"}, + {file = "pydantic-1.10.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b41822064585fea56d0116aa431fbd5137ce69dfe837b599e310034171996084"}, + {file = "pydantic-1.10.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:61f1f08adfaa9cc02e0cbc94f478140385cbd52d5b3c5a657c2fceb15de8d1fb"}, + {file = "pydantic-1.10.6-cp310-cp310-win_amd64.whl", hash = "sha256:32937835e525d92c98a1512218db4eed9ddc8f4ee2a78382d77f54341972c0e7"}, + {file = "pydantic-1.10.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bbd5c531b22928e63d0cb1868dee76123456e1de2f1cb45879e9e7a3f3f1779b"}, + {file = "pydantic-1.10.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e277bd18339177daa62a294256869bbe84df1fb592be2716ec62627bb8d7c81d"}, + {file = "pydantic-1.10.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f15277d720aa57e173954d237628a8d304896364b9de745dcb722f584812c7"}, + {file = "pydantic-1.10.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b243b564cea2576725e77aeeda54e3e0229a168bc587d536cd69941e6797543d"}, + {file = "pydantic-1.10.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3ce13a558b484c9ae48a6a7c184b1ba0e5588c5525482681db418268e5f86186"}, + {file = "pydantic-1.10.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3ac1cd4deed871dfe0c5f63721e29debf03e2deefa41b3ed5eb5f5df287c7b70"}, + {file = "pydantic-1.10.6-cp311-cp311-win_amd64.whl", hash = "sha256:b1eb6610330a1dfba9ce142ada792f26bbef1255b75f538196a39e9e90388bf4"}, + {file = "pydantic-1.10.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4ca83739c1263a044ec8b79df4eefc34bbac87191f0a513d00dd47d46e307a65"}, + {file = "pydantic-1.10.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea4e2a7cb409951988e79a469f609bba998a576e6d7b9791ae5d1e0619e1c0f2"}, + {file = "pydantic-1.10.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53de12b4608290992a943801d7756f18a37b7aee284b9ffa794ee8ea8153f8e2"}, + {file = "pydantic-1.10.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:60184e80aac3b56933c71c48d6181e630b0fbc61ae455a63322a66a23c14731a"}, + {file = "pydantic-1.10.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:415a3f719ce518e95a92effc7ee30118a25c3d032455d13e121e3840985f2efd"}, + {file = "pydantic-1.10.6-cp37-cp37m-win_amd64.whl", hash = "sha256:72cb30894a34d3a7ab6d959b45a70abac8a2a93b6480fc5a7bfbd9c935bdc4fb"}, + {file = "pydantic-1.10.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3091d2eaeda25391405e36c2fc2ed102b48bac4b384d42b2267310abae350ca6"}, + {file = 
"pydantic-1.10.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:751f008cd2afe812a781fd6aa2fb66c620ca2e1a13b6a2152b1ad51553cb4b77"}, + {file = "pydantic-1.10.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12e837fd320dd30bd625be1b101e3b62edc096a49835392dcf418f1a5ac2b832"}, + {file = "pydantic-1.10.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:587d92831d0115874d766b1f5fddcdde0c5b6c60f8c6111a394078ec227fca6d"}, + {file = "pydantic-1.10.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:476f6674303ae7965730a382a8e8d7fae18b8004b7b69a56c3d8fa93968aa21c"}, + {file = "pydantic-1.10.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3a2be0a0f32c83265fd71a45027201e1278beaa82ea88ea5b345eea6afa9ac7f"}, + {file = "pydantic-1.10.6-cp38-cp38-win_amd64.whl", hash = "sha256:0abd9c60eee6201b853b6c4be104edfba4f8f6c5f3623f8e1dba90634d63eb35"}, + {file = "pydantic-1.10.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6195ca908045054dd2d57eb9c39a5fe86409968b8040de8c2240186da0769da7"}, + {file = "pydantic-1.10.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43cdeca8d30de9a897440e3fb8866f827c4c31f6c73838e3a01a14b03b067b1d"}, + {file = "pydantic-1.10.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c19eb5163167489cb1e0161ae9220dadd4fc609a42649e7e84a8fa8fff7a80f"}, + {file = "pydantic-1.10.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:012c99a9c0d18cfde7469aa1ebff922e24b0c706d03ead96940f5465f2c9cf62"}, + {file = "pydantic-1.10.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:528dcf7ec49fb5a84bf6fe346c1cc3c55b0e7603c2123881996ca3ad79db5bfc"}, + {file = "pydantic-1.10.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:163e79386c3547c49366e959d01e37fc30252285a70619ffc1b10ede4758250a"}, + {file = "pydantic-1.10.6-cp39-cp39-win_amd64.whl", hash = "sha256:189318051c3d57821f7233ecc94708767dd67687a614a4e8f92b4a020d4ffd06"}, + {file = "pydantic-1.10.6-py3-none-any.whl", hash = "sha256:acc6783751ac9c9bc4680379edd6d286468a1dc8d7d9906cd6f1186ed682b2b0"}, + {file = "pydantic-1.10.6.tar.gz", hash = "sha256:cf95adb0d1671fc38d8c43dd921ad5814a735e7d9b4d9e437c088002863854fd"}, +] +pytest = [ + {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, + {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, +] +pytest-asyncio = [ + {file = "pytest-asyncio-0.17.2.tar.gz", hash = "sha256:6d895b02432c028e6957d25fc936494e78c6305736e785d9fee408b1efbc7ff4"}, + {file = "pytest_asyncio-0.17.2-py3-none-any.whl", hash = "sha256:e0fe5dbea40516b661ef1bcfe0bd9461c2847c4ef4bb40012324f2454fb7d56d"}, +] +pytest-cov = [ + {file = "pytest-cov-3.0.0.tar.gz", hash = "sha256:e7f0f5b1617d2210a2cabc266dfe2f4c75a8d32fb89eafb7ad9d06f6d076d470"}, + {file = "pytest_cov-3.0.0-py3-none-any.whl", hash = "sha256:578d5d15ac4a25e5f961c938b85a05b09fdaae9deef3bb6de9a6e766622ca7a6"}, +] +PyYAML = [ + {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, + {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, + {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, + {file = 
"PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, + {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, + {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, + {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, + {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, + {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, + {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, + {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, + {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, + {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, + {file = 
"PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, + {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, + {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, + {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, + {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, + {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, + {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, + {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, +] +requests = [ + {file = "requests-2.28.2-py3-none-any.whl", hash = "sha256:64299f4909223da747622c030b781c0d7811e359c37124b4bd368fb8c6518baa"}, + {file = "requests-2.28.2.tar.gz", hash = "sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf"}, +] +toml = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] +tqdm = [ + {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, + {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, +] +typing-extensions = [ + {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, + {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, +] +urllib3 = [ + {file = "urllib3-1.26.15-py2.py3-none-any.whl", hash = 
"sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42"}, + {file = "urllib3-1.26.15.tar.gz", hash = "sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305"}, +] +yarl = [ + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, + {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, + {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, + {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, + {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, + {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, + {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, + {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, + {file = 
"yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, + {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, + {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, + {file = 
"yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, + {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, + {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, + {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, +] +zipp = [ + {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, + {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, +] diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml new file mode 100644 index 00000000..2aa396b8 --- /dev/null +++ b/clients/python/pyproject.toml @@ -0,0 +1,26 @@ +[tool.poetry] +name = "text-generation" +version = "0.3.1" +description = "Hugging Face Text Generation Python Client" +license = "Apache-2.0" +authors = ["Olivier Dehaene "] +maintainers = ["Olivier Dehaene "] +readme = "README.md" +homepage = "https://github.com/huggingface/text-generation-inference" +repository = "https://github.com/huggingface/text-generation-inference" + + +[tool.poetry.dependencies] +python = "^3.7" +pydantic = "^1.10" +aiohttp = "^3.8" +huggingface-hub = ">= 0.12, < 1.0" + +[tool.poetry.dev-dependencies] +pytest = "^6.2.5" +pytest-asyncio = "^0.17.2" +pytest-cov = "^3.0.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clients/python/tests/conftest.py b/clients/python/tests/conftest.py new file mode 100644 index 00000000..48734f0d --- /dev/null +++ b/clients/python/tests/conftest.py @@ -0,0 +1,51 @@ +import pytest + +from text_generation import __version__ +from huggingface_hub.utils import build_hf_headers + + +@pytest.fixture +def flan_t5_xxl(): + return "google/flan-t5-xxl" + + +@pytest.fixture +def fake_model(): + return "fake/model" + + +@pytest.fixture +def unsupported_model(): + return "gpt2" + + +@pytest.fixture +def base_url(): + return "https://api-inference.huggingface.co/models" + + +@pytest.fixture +def bloom_url(base_url, bloom_model): + return f"{base_url}/{bloom_model}" + + +@pytest.fixture +def flan_t5_xxl_url(base_url, flan_t5_xxl): + return f"{base_url}/{flan_t5_xxl}" + + +@pytest.fixture +def fake_url(base_url, fake_model): + return f"{base_url}/{fake_model}" + + +@pytest.fixture +def unsupported_url(base_url, unsupported_model): + return f"{base_url}/{unsupported_model}" + + +@pytest.fixture(scope="session") +def hf_headers(): + return build_hf_headers( + library_name="text-generation-tests", library_version=__version__ + ) diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py new file mode 100644 index 00000000..c998de41 --- /dev/null +++ b/clients/python/tests/test_client.py @@ -0,0 +1,133 @@ +import pytest + +from text_generation import Client, AsyncClient +from text_generation.errors import NotFoundError, ValidationError +from text_generation.types import FinishReason, PrefillToken, Token + + +def test_generate(flan_t5_xxl_url, hf_headers): + 
client = Client(flan_t5_xxl_url, hf_headers) + response = client.generate("test", max_new_tokens=1) + + assert response.generated_text == "" + assert response.details.finish_reason == FinishReason.Length + assert response.details.generated_tokens == 1 + assert response.details.seed is None + assert len(response.details.prefill) == 1 + assert response.details.prefill[0] == PrefillToken(id=0, text="", logprob=None) + assert len(response.details.tokens) == 1 + assert response.details.tokens[0] == Token( + id=3, text=" ", logprob=-1.984375, special=False + ) + + +def test_generate_best_of(flan_t5_xxl_url, hf_headers): + client = Client(flan_t5_xxl_url, hf_headers) + response = client.generate("test", max_new_tokens=1, best_of=2, do_sample=True) + + assert response.details.seed is not None + assert response.details.best_of_sequences is not None + assert len(response.details.best_of_sequences) == 1 + assert response.details.best_of_sequences[0].seed is not None + + +def test_generate_not_found(fake_url, hf_headers): + client = Client(fake_url, hf_headers) + with pytest.raises(NotFoundError): + client.generate("test") + + +def test_generate_validation_error(flan_t5_xxl_url, hf_headers): + client = Client(flan_t5_xxl_url, hf_headers) + with pytest.raises(ValidationError): + client.generate("test", max_new_tokens=10_000) + + +def test_generate_stream(flan_t5_xxl_url, hf_headers): + client = Client(flan_t5_xxl_url, hf_headers) + responses = [ + response for response in client.generate_stream("test", max_new_tokens=1) + ] + + assert len(responses) == 1 + response = responses[0] + + assert response.generated_text == "" + assert response.details.finish_reason == FinishReason.Length + assert response.details.generated_tokens == 1 + assert response.details.seed is None + + +def test_generate_stream_not_found(fake_url, hf_headers): + client = Client(fake_url, hf_headers) + with pytest.raises(NotFoundError): + list(client.generate_stream("test")) + + +def test_generate_stream_validation_error(flan_t5_xxl_url, hf_headers): + client = Client(flan_t5_xxl_url, hf_headers) + with pytest.raises(ValidationError): + list(client.generate_stream("test", max_new_tokens=10_000)) + + +@pytest.mark.asyncio +async def test_generate_async(flan_t5_xxl_url, hf_headers): + client = AsyncClient(flan_t5_xxl_url, hf_headers) + response = await client.generate("test", max_new_tokens=1) + + assert response.generated_text == "" + assert response.details.finish_reason == FinishReason.Length + assert response.details.generated_tokens == 1 + assert response.details.seed is None + assert len(response.details.prefill) == 1 + assert response.details.prefill[0] == PrefillToken(id=0, text="", logprob=None) + assert len(response.details.tokens) == 1 + assert response.details.tokens[0] == Token( + id=3, text=" ", logprob=-1.984375, special=False + ) + + +@pytest.mark.asyncio +async def test_generate_async_not_found(fake_url, hf_headers): + client = AsyncClient(fake_url, hf_headers) + with pytest.raises(NotFoundError): + await client.generate("test") + + +@pytest.mark.asyncio +async def test_generate_async_validation_error(flan_t5_xxl_url, hf_headers): + client = AsyncClient(flan_t5_xxl_url, hf_headers) + with pytest.raises(ValidationError): + await client.generate("test", max_new_tokens=10_000) + + +@pytest.mark.asyncio +async def test_generate_stream_async(flan_t5_xxl_url, hf_headers): + client = AsyncClient(flan_t5_xxl_url, hf_headers) + responses = [ + response async for response in client.generate_stream("test", max_new_tokens=1) + ] + 
+ assert len(responses) == 1 + response = responses[0] + + assert response.generated_text == "" + assert response.details.finish_reason == FinishReason.Length + assert response.details.generated_tokens == 1 + assert response.details.seed is None + + +@pytest.mark.asyncio +async def test_generate_stream_async_not_found(fake_url, hf_headers): + client = AsyncClient(fake_url, hf_headers) + with pytest.raises(NotFoundError): + async for _ in client.generate_stream("test"): + pass + + +@pytest.mark.asyncio +async def test_generate_stream_async_validation_error(flan_t5_xxl_url, hf_headers): + client = AsyncClient(flan_t5_xxl_url, hf_headers) + with pytest.raises(ValidationError): + async for _ in client.generate_stream("test", max_new_tokens=10_000): + pass diff --git a/clients/python/tests/test_errors.py b/clients/python/tests/test_errors.py new file mode 100644 index 00000000..8389ed31 --- /dev/null +++ b/clients/python/tests/test_errors.py @@ -0,0 +1,64 @@ +from text_generation.errors import ( + parse_error, + GenerationError, + IncompleteGenerationError, + OverloadedError, + ValidationError, + BadRequestError, + ShardNotReadyError, + ShardTimeoutError, + NotFoundError, + RateLimitExceededError, + UnknownError, +) + + +def test_generation_error(): + payload = {"error_type": "generation", "error": "test"} + assert isinstance(parse_error(400, payload), GenerationError) + + +def test_incomplete_generation_error(): + payload = {"error_type": "incomplete_generation", "error": "test"} + assert isinstance(parse_error(400, payload), IncompleteGenerationError) + + +def test_overloaded_error(): + payload = {"error_type": "overloaded", "error": "test"} + assert isinstance(parse_error(400, payload), OverloadedError) + + +def test_validation_error(): + payload = {"error_type": "validation", "error": "test"} + assert isinstance(parse_error(400, payload), ValidationError) + + +def test_bad_request_error(): + payload = {"error": "test"} + assert isinstance(parse_error(400, payload), BadRequestError) + + +def test_shard_not_ready_error(): + payload = {"error": "test"} + assert isinstance(parse_error(403, payload), ShardNotReadyError) + assert isinstance(parse_error(424, payload), ShardNotReadyError) + + +def test_shard_timeout_error(): + payload = {"error": "test"} + assert isinstance(parse_error(504, payload), ShardTimeoutError) + + +def test_not_found_error(): + payload = {"error": "test"} + assert isinstance(parse_error(404, payload), NotFoundError) + + +def test_rate_limit_exceeded_error(): + payload = {"error": "test"} + assert isinstance(parse_error(429, payload), RateLimitExceededError) + + +def test_unknown_error(): + payload = {"error": "test"} + assert isinstance(parse_error(500, payload), UnknownError) diff --git a/clients/python/tests/test_inference_api.py b/clients/python/tests/test_inference_api.py new file mode 100644 index 00000000..79e503a3 --- /dev/null +++ b/clients/python/tests/test_inference_api.py @@ -0,0 +1,34 @@ +import pytest + +from text_generation import ( + InferenceAPIClient, + InferenceAPIAsyncClient, + Client, + AsyncClient, +) +from text_generation.errors import NotSupportedError +from text_generation.inference_api import get_supported_models + + +def test_get_supported_models(): + assert isinstance(get_supported_models(), list) + + +def test_client(flan_t5_xxl): + client = InferenceAPIClient(flan_t5_xxl) + assert isinstance(client, Client) + + +def test_client_unsupported_model(unsupported_model): + with pytest.raises(NotSupportedError): + InferenceAPIClient(unsupported_model) 
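For reference, a minimal sketch of how a caller is expected to lean on the `parse_error` mapping exercised in test_errors.py above; only `parse_error` and the error classes come from the library, the wrapper function name is illustrative:

from text_generation.errors import parse_error

def raise_if_failed(status_code: int, payload: dict) -> None:
    # parse_error returns the typed exception rather than raising it,
    # so the caller decides when to raise
    if status_code != 200:
        raise parse_error(status_code, payload)

try:
    raise_if_failed(429, {"error": "test"})
except Exception as exc:
    print(type(exc).__name__)  # RateLimitExceededError, as in the test above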
+ + +def test_async_client(flan_t5_xxl): + client = InferenceAPIAsyncClient(flan_t5_xxl) + assert isinstance(client, AsyncClient) + + +def test_async_client_unsupported_model(unsupported_model): + with pytest.raises(NotSupportedError): + InferenceAPIAsyncClient(unsupported_model) diff --git a/clients/python/tests/test_types.py b/clients/python/tests/test_types.py new file mode 100644 index 00000000..4c9d4c89 --- /dev/null +++ b/clients/python/tests/test_types.py @@ -0,0 +1,82 @@ +import pytest + +from text_generation.types import Parameters, Request +from text_generation.errors import ValidationError + + +def test_parameters_validation(): + # Test best_of + Parameters(best_of=1) + with pytest.raises(ValidationError): + Parameters(best_of=0) + with pytest.raises(ValidationError): + Parameters(best_of=-1) + Parameters(best_of=2, do_sample=True) + with pytest.raises(ValidationError): + Parameters(best_of=2) + + # Test repetition_penalty + Parameters(repetition_penalty=1) + with pytest.raises(ValidationError): + Parameters(repetition_penalty=0) + with pytest.raises(ValidationError): + Parameters(repetition_penalty=-1) + + # Test seed + Parameters(seed=1) + with pytest.raises(ValidationError): + Parameters(seed=-1) + + # Test temperature + Parameters(temperature=1) + with pytest.raises(ValidationError): + Parameters(temperature=0) + with pytest.raises(ValidationError): + Parameters(temperature=-1) + + # Test top_k + Parameters(top_k=1) + with pytest.raises(ValidationError): + Parameters(top_k=0) + with pytest.raises(ValidationError): + Parameters(top_k=-1) + + # Test top_p + Parameters(top_p=0.5) + with pytest.raises(ValidationError): + Parameters(top_p=0) + with pytest.raises(ValidationError): + Parameters(top_p=-1) + with pytest.raises(ValidationError): + Parameters(top_p=1) + + # Test truncate + Parameters(truncate=1) + with pytest.raises(ValidationError): + Parameters(truncate=0) + with pytest.raises(ValidationError): + Parameters(truncate=-1) + + # Test typical_p + Parameters(typical_p=0.5) + with pytest.raises(ValidationError): + Parameters(typical_p=0) + with pytest.raises(ValidationError): + Parameters(typical_p=-1) + with pytest.raises(ValidationError): + Parameters(typical_p=1) + + +def test_request_validation(): + Request(inputs="test") + + with pytest.raises(ValidationError): + Request(inputs="") + + Request(inputs="test", stream=True) + Request(inputs="test", parameters=Parameters(best_of=2, do_sample=True)) + + with pytest.raises(ValidationError): + Request( + inputs="test", parameters=Parameters(best_of=2, do_sample=True), stream=True + ) diff --git a/clients/python/text_generation/__init__.py b/clients/python/text_generation/__init__.py new file mode 100644 index 00000000..46109833 --- /dev/null +++ b/clients/python/text_generation/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__version__ = "0.3.0" + +from text_generation.client import Client, AsyncClient +from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient diff --git a/clients/python/text_generation/client.py b/clients/python/text_generation/client.py new file mode 100644 index 00000000..03bc3888 --- /dev/null +++ b/clients/python/text_generation/client.py @@ -0,0 +1,487 @@ +import json +import requests + +from aiohttp import ClientSession, ClientTimeout +from pydantic import ValidationError +from typing import Dict, Optional, List, AsyncIterator, Iterator + +from text_generation.types import ( + StreamResponse, + Response, + Request, + Parameters, +) +from text_generation.errors import parse_error + + +class Client: + """Client to make calls to a text-generation-inference instance + + Example: + + ```python + >>> from text_generation import Client + + >>> client = Client("https://api-inference.huggingface.co/models/bigscience/bloomz") + >>> client.generate("Why is the sky blue?").generated_text + ' Rayleigh scattering' + + >>> result = "" + >>> for response in client.generate_stream("Why is the sky blue?"): + >>> if not response.token.special: + >>> result += response.token.text + >>> result + ' Rayleigh scattering' + ``` + """ + + def __init__( + self, + base_url: str, + headers: Optional[Dict[str, str]] = None, + cookies: Optional[Dict[str, str]] = None, + timeout: int = 10, + ): + """ + Args: + base_url (`str`): + text-generation-inference instance base url + headers (`Optional[Dict[str, str]]`): + Additional headers + cookies (`Optional[Dict[str, str]]`): + Cookies to include in the requests + timeout (`int`): + Timeout in seconds + """ + self.base_url = base_url + self.headers = headers + self.cookies = cookies + self.timeout = timeout + + def generate( + self, + prompt: str, + do_sample: bool = False, + max_new_tokens: int = 20, + best_of: Optional[int] = None, + repetition_penalty: Optional[float] = None, + return_full_text: bool = False, + seed: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, + watermark: bool = False, + ) -> Response: + """ + Given a prompt, generate the following text + + Args: + prompt (`str`): + Input text + do_sample (`bool`): + Activate logits sampling + max_new_tokens (`int`): + Maximum number of generated tokens + best_of (`int`): + Generate best_of sequences and return the one with the highest token logprobs + repetition_penalty (`float`): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + return_full_text (`bool`): + Whether to prepend the prompt to the generated text + seed (`int`): + Random sampling seed + stop_sequences (`List[str]`): + Stop generating tokens if a member of `stop_sequences` is generated + temperature (`float`): + The value used to modulate the logits distribution. + top_k (`int`): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation.
+ truncate (`int`): + Truncate input tokens to the given size + typical_p (`float`): + Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information + watermark (`bool`): + Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) + + Returns: + Response: generated response + """ + # Validate parameters + parameters = Parameters( + best_of=best_of, + details=True, + do_sample=do_sample, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + return_full_text=return_full_text, + seed=seed, + stop=stop_sequences if stop_sequences is not None else [], + temperature=temperature, + top_k=top_k, + top_p=top_p, + truncate=truncate, + typical_p=typical_p, + watermark=watermark, + ) + request = Request(inputs=prompt, stream=False, parameters=parameters) + + resp = requests.post( + self.base_url, + json=request.dict(), + headers=self.headers, + cookies=self.cookies, + timeout=self.timeout, + ) + payload = resp.json() + if resp.status_code != 200: + raise parse_error(resp.status_code, payload) + return Response(**payload[0]) + + def generate_stream( + self, + prompt: str, + do_sample: bool = False, + max_new_tokens: int = 20, + best_of: Optional[int] = None, + repetition_penalty: Optional[float] = None, + return_full_text: bool = False, + seed: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, + watermark: bool = False, + ) -> Iterator[StreamResponse]: + """ + Given a prompt, generate the following stream of tokens + + Args: + prompt (`str`): + Input text + do_sample (`bool`): + Activate logits sampling + max_new_tokens (`int`): + Maximum number of generated tokens + best_of (`int`): + Generate best_of sequences and return the one with the highest token logprobs + repetition_penalty (`float`): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + return_full_text (`bool`): + Whether to prepend the prompt to the generated text + seed (`int`): + Random sampling seed + stop_sequences (`List[str]`): + Stop generating tokens if a member of `stop_sequences` is generated + temperature (`float`): + The value used to modulate the logits distribution. + top_k (`int`): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation.
+ truncate (`int`): + Truncate input tokens to the given size + typical_p (`float`): + Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information + watermark (`bool`): + Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) + + Returns: + Iterator[StreamResponse]: stream of generated tokens + """ + # Validate parameters + parameters = Parameters( + best_of=best_of, + details=True, + do_sample=do_sample, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + return_full_text=return_full_text, + seed=seed, + stop=stop_sequences if stop_sequences is not None else [], + temperature=temperature, + top_k=top_k, + top_p=top_p, + truncate=truncate, + typical_p=typical_p, + watermark=watermark, + ) + request = Request(inputs=prompt, stream=True, parameters=parameters) + + resp = requests.post( + self.base_url, + json=request.dict(), + headers=self.headers, + cookies=self.cookies, + timeout=self.timeout, + stream=True, + ) + + if resp.status_code != 200: + raise parse_error(resp.status_code, resp.json()) + + # Parse ServerSentEvents + for byte_payload in resp.iter_lines(): + # Skip line + if byte_payload == b"\n": + continue + + payload = byte_payload.decode("utf-8") + + # Event data + if payload.startswith("data:"): + # Decode payload + json_payload = json.loads(payload.lstrip("data:").rstrip("\n")) + # Parse payload + try: + response = StreamResponse(**json_payload) + except ValidationError: + # If we failed to parse the payload, then it is an error payload + raise parse_error(resp.status_code, json_payload) + yield response + + +class AsyncClient: + """Asynchronous Client to make calls to a text-generation-inference instance + + Example: + + ```python + >>> from text_generation import AsyncClient + + >>> client = AsyncClient("https://api-inference.huggingface.co/models/bigscience/bloomz") + >>> response = await client.generate("Why is the sky blue?") + >>> response.generated_text + ' Rayleigh scattering' + + >>> result = "" + >>> async for response in client.generate_stream("Why is the sky blue?"): + >>> if not response.token.special: + >>> result += response.token.text + >>> result + ' Rayleigh scattering' + ``` + """ + + def __init__( + self, + base_url: str, + headers: Optional[Dict[str, str]] = None, + cookies: Optional[Dict[str, str]] = None, + timeout: int = 10, + ): + """ + Args: + base_url (`str`): + text-generation-inference instance base url + headers (`Optional[Dict[str, str]]`): + Additional headers + cookies (`Optional[Dict[str, str]]`): + Cookies to include in the requests + timeout (`int`): + Timeout in seconds + """ + self.base_url = base_url + self.headers = headers + self.cookies = cookies + self.timeout = ClientTimeout(timeout) + + async def generate( + self, + prompt: str, + do_sample: bool = False, + max_new_tokens: int = 20, + best_of: Optional[int] = None, + repetition_penalty: Optional[float] = None, + return_full_text: bool = False, + seed: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, + watermark: bool = False, + ) -> Response: + """ + Given a prompt, generate the following text asynchronously + + Args: + prompt (`str`): + Input text + do_sample (`bool`): + Activate logits sampling + max_new_tokens (`int`): + Maximum number of generated
tokens + best_of (`int`): + Generate best_of sequences and return the one with the highest token logprobs + repetition_penalty (`float`): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + return_full_text (`bool`): + Whether to prepend the prompt to the generated text + seed (`int`): + Random sampling seed + stop_sequences (`List[str]`): + Stop generating tokens if a member of `stop_sequences` is generated + temperature (`float`): + The value used to modulate the logits distribution. + top_k (`int`): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation. + truncate (`int`): + Truncate input tokens to the given size + typical_p (`float`): + Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information + watermark (`bool`): + Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) + + Returns: + Response: generated response + """ + # Validate parameters + parameters = Parameters( + best_of=best_of, + details=True, + do_sample=do_sample, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + return_full_text=return_full_text, + seed=seed, + stop=stop_sequences if stop_sequences is not None else [], + temperature=temperature, + top_k=top_k, + top_p=top_p, + truncate=truncate, + typical_p=typical_p, + watermark=watermark, + ) + request = Request(inputs=prompt, stream=False, parameters=parameters) + + async with ClientSession( + headers=self.headers, cookies=self.cookies, timeout=self.timeout + ) as session: + async with session.post(self.base_url, json=request.dict()) as resp: + payload = await resp.json() + + if resp.status != 200: + raise parse_error(resp.status, payload) + return Response(**payload[0]) + + async def generate_stream( + self, + prompt: str, + do_sample: bool = False, + max_new_tokens: int = 20, + best_of: Optional[int] = None, + repetition_penalty: Optional[float] = None, + return_full_text: bool = False, + seed: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, + watermark: bool = False, + ) -> AsyncIterator[StreamResponse]: + """ + Given a prompt, generate the following stream of tokens asynchronously + + Args: + prompt (`str`): + Input text + do_sample (`bool`): + Activate logits sampling + max_new_tokens (`int`): + Maximum number of generated tokens + best_of (`int`): + Generate best_of sequences and return the one with the highest token logprobs + repetition_penalty (`float`): + The parameter for repetition penalty. 1.0 means no penalty. See [this + paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + return_full_text (`bool`): + Whether to prepend the prompt to the generated text + seed (`int`): + Random sampling seed + stop_sequences (`List[str]`): + Stop generating tokens if a member of `stop_sequences` is generated + temperature (`float`): + The value used to modulate the logits distribution. + top_k (`int`): + The number of highest probability vocabulary tokens to keep for top-k-filtering.
+ top_p (`float`): + If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + higher are kept for generation. + truncate (`int`): + Truncate input tokens to the given size + typical_p (`float`): + Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information + watermark (`bool`): + Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) + + Returns: + AsyncIterator[StreamResponse]: stream of generated tokens + """ + # Validate parameters + parameters = Parameters( + best_of=best_of, + details=True, + do_sample=do_sample, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + return_full_text=return_full_text, + seed=seed, + stop=stop_sequences if stop_sequences is not None else [], + temperature=temperature, + top_k=top_k, + top_p=top_p, + truncate=truncate, + typical_p=typical_p, + watermark=watermark, + ) + request = Request(inputs=prompt, stream=True, parameters=parameters) + + async with ClientSession( + headers=self.headers, cookies=self.cookies, timeout=self.timeout + ) as session: + async with session.post(self.base_url, json=request.dict()) as resp: + + if resp.status != 200: + raise parse_error(resp.status, await resp.json()) + + # Parse ServerSentEvents + async for byte_payload in resp.content: + # Skip line + if byte_payload == b"\n": + continue + + payload = byte_payload.decode("utf-8") + + # Event data + if payload.startswith("data:"): + # Decode payload + json_payload = json.loads(payload.lstrip("data:").rstrip("\n")) + # Parse payload + try: + response = StreamResponse(**json_payload) + except ValidationError: + # If we failed to parse the payload, then it is an error payload + raise parse_error(resp.status, json_payload) + yield response diff --git a/clients/python/text_generation/errors.py b/clients/python/text_generation/errors.py new file mode 100644 index 00000000..dbf0b761 --- /dev/null +++ b/clients/python/text_generation/errors.py @@ -0,0 +1,106 @@ +from typing import Dict + + +# Text Generation Inference Errors +class ValidationError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class GenerationError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class OverloadedError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class IncompleteGenerationError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +# API Inference Errors +class BadRequestError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class ShardNotReadyError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class ShardTimeoutError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class NotFoundError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class RateLimitExceededError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class NotSupportedError(Exception): + def __init__(self, model_id: str): + message = ( + f"Model `{model_id}` is not available for inference with this client. \n" + "Use `huggingface_hub.inference_api.InferenceApi` instead."
+ ) + super(NotSupportedError, self).__init__(message) + + +# Unknown error +class UnknownError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +def parse_error(status_code: int, payload: Dict[str, str]) -> Exception: + """ + Parse error given an HTTP status code and a JSON payload + + Args: + status_code (`int`): + HTTP status code + payload (`Dict[str, str]`): + JSON payload + + Returns: + Exception: parsed exception + + """ + # Try to parse a Text Generation Inference error + message = payload["error"] + if "error_type" in payload: + error_type = payload["error_type"] + if error_type == "generation": + return GenerationError(message) + if error_type == "incomplete_generation": + return IncompleteGenerationError(message) + if error_type == "overloaded": + return OverloadedError(message) + if error_type == "validation": + return ValidationError(message) + + # Try to parse an API Inference error + if status_code == 400: + return BadRequestError(message) + if status_code == 403 or status_code == 424: + return ShardNotReadyError(message) + if status_code == 504: + return ShardTimeoutError(message) + if status_code == 404: + return NotFoundError(message) + if status_code == 429: + return RateLimitExceededError(message) + + # Fallback to an unknown error + return UnknownError(message) diff --git a/clients/python/text_generation/inference_api.py b/clients/python/text_generation/inference_api.py new file mode 100644 index 00000000..eb70b3d1 --- /dev/null +++ b/clients/python/text_generation/inference_api.py @@ -0,0 +1,154 @@ +import os +import requests +import base64 +import json +import warnings + +from typing import List, Optional +from huggingface_hub.utils import build_hf_headers + +from text_generation import Client, AsyncClient, __version__ +from text_generation.errors import NotSupportedError + +INFERENCE_ENDPOINT = os.environ.get( + "HF_INFERENCE_ENDPOINT", "https://api-inference.huggingface.co" +) + +SUPPORTED_MODELS = None + + +def get_supported_models() -> Optional[List[str]]: + """ + Get the list of supported text-generation models from GitHub + + Returns: + Optional[List[str]]: supported models list or None if unable to get the list from GitHub + """ + global SUPPORTED_MODELS + if SUPPORTED_MODELS is not None: + return SUPPORTED_MODELS + + response = requests.get( + "https://api.github.com/repos/huggingface/text-generation-inference/contents/supported_models.json", + timeout=5, + ) + if response.status_code == 200: + file_content = response.json()["content"] + SUPPORTED_MODELS = json.loads(base64.b64decode(file_content).decode("utf-8")) + return SUPPORTED_MODELS + + warnings.warn("Could not retrieve list of supported models.") + return None + + +class InferenceAPIClient(Client): + """Client to make calls to the HuggingFace Inference API.
+ + Only supports a subset of the available text-generation or text2text-generation models that are served using + text-generation-inference + + Example: + + ```python + >>> from text_generation import InferenceAPIClient + + >>> client = InferenceAPIClient("bigscience/bloomz") + >>> client.generate("Why is the sky blue?").generated_text + ' Rayleigh scattering' + + >>> result = "" + >>> for response in client.generate_stream("Why is the sky blue?"): + >>> if not response.token.special: + >>> result += response.token.text + >>> result + ' Rayleigh scattering' + ``` + """ + + def __init__(self, repo_id: str, token: Optional[str] = None, timeout: int = 10): + """ + Init headers and API information + + Args: + repo_id (`str`): + Id of repository (e.g. `bigscience/bloom`). + token (`str`, `optional`): + The API token to use as HTTP bearer authorization. This is not + the authentication token. You can find the token in + https://huggingface.co/settings/token. Alternatively, you can + find both your organizations and personal API tokens using + `HfApi().whoami(token)`. + timeout (`int`): + Timeout in seconds + """ + + # Text Generation Inference client only supports a subset of the available hub models + supported_models = get_supported_models() + if supported_models is not None and repo_id not in supported_models: + raise NotSupportedError(repo_id) + + headers = build_hf_headers( + token=token, library_name="text-generation", library_version=__version__ + ) + base_url = f"{INFERENCE_ENDPOINT}/models/{repo_id}" + + super(InferenceAPIClient, self).__init__( + base_url, headers=headers, timeout=timeout + ) + + +class InferenceAPIAsyncClient(AsyncClient): + """Asynchronous Client to make calls to the HuggingFace Inference API. + + Only supports a subset of the available text-generation or text2text-generation models that are served using + text-generation-inference + + Example: + + ```python + >>> from text_generation import InferenceAPIAsyncClient + + >>> client = InferenceAPIAsyncClient("bigscience/bloomz") + >>> response = await client.generate("Why is the sky blue?") + >>> response.generated_text + ' Rayleigh scattering' + + >>> result = "" + >>> async for response in client.generate_stream("Why is the sky blue?"): + >>> if not response.token.special: + >>> result += response.token.text + >>> result + ' Rayleigh scattering' + ``` + """ + + def __init__(self, repo_id: str, token: Optional[str] = None, timeout: int = 10): + """ + Init headers and API information + + Args: + repo_id (`str`): + Id of repository (e.g. `bigscience/bloom`). + token (`str`, `optional`): + The API token to use as HTTP bearer authorization. This is not + the authentication token. You can find the token in + https://huggingface.co/settings/token. Alternatively, you can + find both your organizations and personal API tokens using + `HfApi().whoami(token)`.
+ timeout (`int`): + Timeout in seconds + """ + + # Text Generation Inference client only supports a subset of the available hub models + supported_models = get_supported_models() + if supported_models is not None and repo_id not in supported_models: + raise NotSupportedError(repo_id) + + headers = build_hf_headers( + token=token, library_name="text-generation", library_version=__version__ + ) + base_url = f"{INFERENCE_ENDPOINT}/models/{repo_id}" + + super(InferenceAPIAsyncClient, self).__init__( + base_url, headers=headers, timeout=timeout + ) diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py new file mode 100644 index 00000000..ea2070b8 --- /dev/null +++ b/clients/python/text_generation/types.py @@ -0,0 +1,223 @@ +from enum import Enum +from pydantic import BaseModel, validator +from typing import Optional, List + +from text_generation.errors import ValidationError + + +class Parameters(BaseModel): + # Activate logits sampling + do_sample: bool = False + # Maximum number of generated tokens + max_new_tokens: int = 20 + # The parameter for repetition penalty. 1.0 means no penalty. + # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. + repetition_penalty: Optional[float] = None + # Whether to prepend the prompt to the generated text + return_full_text: bool = False + # Stop generating tokens if a member of `stop_sequences` is generated + stop: List[str] = [] + # Random sampling seed + seed: Optional[int] + # The value used to modulate the logits distribution. + temperature: Optional[float] + # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_k: Optional[int] + # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + # higher are kept for generation.
+ top_p: Optional[float] + # Truncate input tokens to the given size + truncate: Optional[int] + # Typical Decoding mass + # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information + typical_p: Optional[float] + # Generate best_of sequences and return the one with the highest token logprobs + best_of: Optional[int] + # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) + watermark: bool = False + # Get generation details + details: bool = False + + @validator("best_of") + def valid_best_of(cls, field_value, values): + if field_value is not None: + if field_value <= 0: + raise ValidationError("`best_of` must be strictly positive") + sampling = ( + values["do_sample"] + | (values["temperature"] is not None) + | (values["top_k"] is not None) + | (values["top_p"] is not None) + | (values["typical_p"] is not None) + ) + if field_value > 1 and not sampling: + raise ValidationError("you must use sampling when `best_of` is > 1") + + return field_value + + @validator("repetition_penalty") + def valid_repetition_penalty(cls, v): + if v is not None and v <= 0: + raise ValidationError("`repetition_penalty` must be strictly positive") + return v + + @validator("seed") + def valid_seed(cls, v): + if v is not None and v < 0: + raise ValidationError("`seed` must be positive") + return v + + @validator("temperature") + def valid_temp(cls, v): + if v is not None and v <= 0: + raise ValidationError("`temperature` must be strictly positive") + return v + + @validator("top_k") + def valid_top_k(cls, v): + if v is not None and v <= 0: + raise ValidationError("`top_k` must be strictly positive") + return v + + @validator("top_p") + def valid_top_p(cls, v): + if v is not None and (v <= 0 or v >= 1.0): + raise ValidationError("`top_p` must be > 0.0 and < 1.0") + return v + + @validator("truncate") + def valid_truncate(cls, v): + if v is not None and v <= 0: + raise ValidationError("`truncate` must be strictly positive") + return v + + @validator("typical_p") + def valid_typical_p(cls, v): + if v is not None and (v <= 0 or v >= 1.0): + raise ValidationError("`typical_p` must be > 0.0 and < 1.0") + return v + + +class Request(BaseModel): + # Prompt + inputs: str + # Generation parameters + parameters: Optional[Parameters] + # Whether to stream output tokens + stream: bool = False + + @validator("inputs") + def valid_input(cls, v): + if not v: + raise ValidationError("`inputs` cannot be empty") + return v + + @validator("stream") + def valid_best_of_stream(cls, field_value, values): + parameters = values["parameters"] + if ( + parameters is not None + and parameters.best_of is not None + and parameters.best_of > 1 + and field_value + ): + raise ValidationError( + "`best_of` != 1 is not supported when `stream` == True" + ) + return field_value + + +# Prompt tokens +class PrefillToken(BaseModel): + # Token ID from the model tokenizer + id: int + # Token text + text: str + # Logprob + # Optional since the logprob of the first token cannot be computed + logprob: Optional[float] + + +# Generated tokens +class Token(BaseModel): + # Token ID from the model tokenizer + id: int + # Token text + text: str + # Logprob + logprob: float + # Is the token a special token + # Can be used to ignore tokens when concatenating + special: bool + + +# Generation finish reason +class FinishReason(Enum): + # number of generated tokens == `max_new_tokens` + Length = "length" + # the model generated its end of sequence token + EndOfSequenceToken =
"eos_token" + # the model generated a text included in `stop_sequences` + StopSequence = "stop_sequence" + + +# Additional sequences when using the `best_of` parameter +class BestOfSequence(BaseModel): + # Generated text + generated_text: str + # Generation finish reason + finish_reason: FinishReason + # Number of generated tokens + generated_tokens: int + # Sampling seed if sampling was activated + seed: Optional[int] + # Prompt tokens + prefill: List[PrefillToken] + # Generated tokens + tokens: List[Token] + + +# `generate` details +class Details(BaseModel): + # Generation finish reason + finish_reason: FinishReason + # Number of generated tokens + generated_tokens: int + # Sampling seed if sampling was activated + seed: Optional[int] + # Prompt tokens + prefill: List[PrefillToken] + # Generated tokens + tokens: List[Token] + # Additional sequences when using the `best_of` parameter + best_of_sequences: Optional[List[BestOfSequence]] + + +# `generate` return value +class Response(BaseModel): + # Generated text + generated_text: str + # Generation details + details: Details + + +# `generate_stream` details +class StreamDetails(BaseModel): + # Generation finish reason + finish_reason: FinishReason + # Number of generated tokens + generated_tokens: int + # Sampling seed if sampling was activated + seed: Optional[int] + + +# `generate_stream` return value +class StreamResponse(BaseModel): + # Generated token + token: Token + # Complete generated text + # Only available when the generation is finished + generated_text: Optional[str] + # Generation details + # Only available when the generation is finished + details: Optional[StreamDetails] diff --git a/docs/openapi.json b/docs/openapi.json index b4ef3ba6..a2230a07 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -11,7 +11,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "0.2.1" + "version": "0.4.0" }, "paths": { "/generate": { @@ -38,10 +38,7 @@ "content": { "application/json": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/GenerateResponse" - } + "$ref": "#/components/schemas/GenerateResponse" } } } @@ -51,10 +48,7 @@ "content": { "application/json": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ErrorResponse" - } + "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" @@ -67,10 +61,7 @@ "content": { "application/json": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ErrorResponse" - } + "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" @@ -83,10 +74,7 @@ "content": { "application/json": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ErrorResponse" - } + "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" @@ -99,10 +87,7 @@ "content": { "application/json": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ErrorResponse" - } + "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" @@ -136,12 +121,9 @@ "200": { "description": "Generated Text", "content": { - "text/event-stream ": { + "text/event-stream": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/StreamResponse" - } + "$ref": "#/components/schemas/StreamResponse" } } } @@ -149,12 +131,9 @@ "422": { "description": "Input validation error", "content": { - "text/event-stream ": { + 
"text/event-stream": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ErrorResponse" - } + "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" @@ -165,12 +144,9 @@ "424": { "description": "Generation Error", "content": { - "text/event-stream ": { + "text/event-stream": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ErrorResponse" - } + "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" @@ -181,12 +157,9 @@ "429": { "description": "Model is overloaded", "content": { - "text/event-stream ": { + "text/event-stream": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ErrorResponse" - } + "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" @@ -197,12 +170,9 @@ "500": { "description": "Incomplete generation", "content": { - "text/event-stream ": { + "text/event-stream": { "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ErrorResponse" - } + "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" @@ -213,17 +183,90 @@ }, "deprecated": false } + }, + "/metrics": { + "get": { + "tags": [ + "Text Generation Inference" + ], + "summary": "Prometheus metrics scrape endpoint", + "description": "Prometheus metrics scrape endpoint", + "operationId": "metrics", + "responses": { + "200": { + "description": "Prometheus Metrics", + "content": { + "text/plain": { + "schema": { + "type": "string" + } + } + } + } + }, + "deprecated": false + } } }, "components": { "schemas": { + "BestOfSequence": { + "type": "object", + "required": [ + "generated_text", + "finish_reason", + "generated_tokens", + "prefill", + "tokens" + ], + "properties": { + "finish_reason": { + "$ref": "#/components/schemas/FinishReason" + }, + "generated_text": { + "type": "string", + "example": "test" + }, + "generated_tokens": { + "type": "integer", + "format": "int32", + "example": 1 + }, + "prefill": { + "type": "array", + "items": { + "$ref": "#/components/schemas/PrefillToken" + } + }, + "seed": { + "type": "integer", + "format": "int64", + "example": 42, + "nullable": true + }, + "tokens": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Token" + } + } + } + }, "Details": { "type": "object", "required": [ "finish_reason", - "generated_tokens" + "generated_tokens", + "prefill", + "tokens" ], "properties": { + "best_of_sequences": { + "type": "array", + "items": { + "$ref": "#/components/schemas/BestOfSequence" + } + }, "finish_reason": { "$ref": "#/components/schemas/FinishReason" }, @@ -235,13 +278,14 @@ "prefill": { "type": "array", "items": { - "$ref": "#/components/schemas/Token" + "$ref": "#/components/schemas/PrefillToken" } }, "seed": { "type": "integer", "format": "int64", - "example": 42 + "example": 42, + "nullable": true }, "tokens": { "type": "array", @@ -254,11 +298,15 @@ "ErrorResponse": { "type": "object", "required": [ - "error" + "error", + "error_type" ], "properties": { "error": { "type": "string" + }, + "error_type": { + "type": "string" } } }, @@ -273,6 +321,13 @@ "GenerateParameters": { "type": "object", "properties": { + "best_of": { + "type": "integer", + "default": "null", + "example": 1, + "nullable": true, + "exclusiveMinimum": 0.0 + }, "details": { "type": "boolean", "default": "true" @@ -297,9 +352,19 @@ "nullable": true, "exclusiveMinimum": 0.0 }, + "return_full_text": { + "type": "boolean", + "default": 
"null", + "example": false, + "nullable": true + }, "seed": { "type": "integer", - "format": "int64" + "format": "int64", + "default": "null", + "example": "null", + "nullable": true, + "exclusiveMinimum": 0.0 }, "stop": { "type": "array", @@ -335,6 +400,26 @@ "nullable": true, "maximum": 1.0, "exclusiveMinimum": 0.0 + }, + "truncate": { + "type": "integer", + "default": "null", + "example": "null", + "nullable": true + }, + "typical_p": { + "type": "number", + "format": "float", + "default": "null", + "example": 0.95, + "nullable": true, + "maximum": 1.0, + "exclusiveMinimum": 0.0 + }, + "watermark": { + "type": "boolean", + "default": "false", + "example": true } } }, @@ -368,6 +453,31 @@ } } }, + "PrefillToken": { + "type": "object", + "required": [ + "id", + "text", + "logprob" + ], + "properties": { + "id": { + "type": "integer", + "format": "int32", + "example": 0 + }, + "logprob": { + "type": "number", + "format": "float", + "example": -0.34, + "nullable": true + }, + "text": { + "type": "string", + "example": "test" + } + } + }, "StreamDetails": { "type": "object", "required": [ @@ -386,7 +496,8 @@ "seed": { "type": "integer", "format": "int64", - "example": 42 + "example": 42, + "nullable": true } } }, @@ -415,7 +526,8 @@ "required": [ "id", "text", - "logprob" + "logprob", + "special" ], "properties": { "id": { @@ -429,6 +541,10 @@ "example": -0.34, "nullable": true }, + "special": { + "type": "boolean", + "example": "false" + }, "text": { "type": "string", "example": "test" diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml index a0d2b089..49948b05 100644 --- a/launcher/Cargo.toml +++ b/launcher/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "text-generation-launcher" -version = "0.2.1" +version = "0.4.0" edition = "2021" authors = ["Olivier Dehaene"] description = "Text Generation Launcher" diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 3c8d9fcc..80466fe6 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -1,6 +1,7 @@ use clap::Parser; use serde_json::Value; use std::env; +use std::ffi::OsString; use std::io::{BufRead, BufReader, Read}; use std::path::Path; use std::process::ExitCode; @@ -12,7 +13,7 @@ use std::thread; use std::thread::sleep; use std::time::{Duration, Instant}; use std::{fs, io}; -use subprocess::{Popen, PopenConfig, PopenError, Redirection}; +use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection}; /// App Configuration #[derive(Parser, Debug)] @@ -23,13 +24,21 @@ struct Args { #[clap(long, env)] revision: Option, #[clap(long, env)] + sharded: Option, + #[clap(long, env)] num_shard: Option, #[clap(long, env)] quantize: bool, #[clap(default_value = "128", long, env)] max_concurrent_requests: usize, + #[clap(default_value = "2", long, env)] + max_best_of: usize, + #[clap(default_value = "4", long, env)] + max_stop_sequences: usize, #[clap(default_value = "1000", long, env)] max_input_length: usize, + #[clap(default_value = "1512", long, env)] + max_total_tokens: usize, #[clap(default_value = "32", long, env)] max_batch_size: usize, #[clap(default_value = "20", long, env)] @@ -43,38 +52,112 @@ struct Args { #[clap(default_value = "29500", long, env)] master_port: usize, #[clap(long, env)] + huggingface_hub_cache: Option, + #[clap(long, env)] + weights_cache_override: Option, + #[clap(long, env)] + disable_custom_kernels: bool, + #[clap(long, env)] json_output: bool, #[clap(long, env)] otlp_endpoint: Option, + #[clap(long, env)] + cors_allow_origin: Vec, + #[clap(long, env)] + watermark_gamma: Option, + #[clap(long, 
env)] + watermark_delta: Option, } fn main() -> ExitCode { // Pattern match configuration + let args = Args::parse(); + + if args.json_output { + tracing_subscriber::fmt().json().init(); + } else { + tracing_subscriber::fmt().compact().init(); + } + + tracing::info!("{:?}", args); + let Args { model_id, revision, + sharded, num_shard, quantize, max_concurrent_requests, + max_best_of, + max_stop_sequences, max_input_length, + max_total_tokens, max_batch_size, max_waiting_tokens, port, shard_uds_path, master_addr, master_port, + huggingface_hub_cache, + weights_cache_override, + disable_custom_kernels, json_output, otlp_endpoint, - } = Args::parse(); + cors_allow_origin, + watermark_gamma, + watermark_delta, + } = args; - if json_output { - tracing_subscriber::fmt().json().init(); + // get the number of shards given `sharded` and `num_shard` + let num_shard = if let Some(sharded) = sharded { + // sharded is set + match sharded { + // sharded is set and true + true => { + match num_shard { + None => { + // try to default to the number of available GPUs + tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES"); + let n_devices = num_cuda_devices() + .expect("--num-shard and CUDA_VISIBLE_DEVICES are not set"); + if n_devices <= 1 { + panic!("`sharded` is true but only found {n_devices} CUDA devices"); + } + n_devices + } + Some(num_shard) => { + // we can't have only one shard while sharded + if num_shard <= 1 { + panic!("`sharded` is true but `num_shard` <= 1"); + } + num_shard + } + } + } + // sharded is set and false + false => { + let num_shard = num_shard.unwrap_or(1); + // we can't have more than one shard while not sharded + if num_shard != 1 { + panic!("`sharded` is false but `num_shard` != 1"); + } + num_shard + } + } } else { - tracing_subscriber::fmt().compact().init(); + match num_shard { + // get num_shard from CUDA_VISIBLE_DEVICES or default to a single shard + None => num_cuda_devices().unwrap_or(1), + Some(num_shard) => num_shard, + } + }; + if num_shard < 1 { + panic!("`num_shard` cannot be < 1"); } - // By default we only have one master shard - let num_shard = num_shard.unwrap_or(1); + if num_shard > 1 { + tracing::info!("Sharding model on {num_shard} processes"); + } // Signal handler let running = Arc::new(AtomicBool::new(true)); @@ -84,6 +167,121 @@ fn main() -> ExitCode { }) .expect("Error setting Ctrl-C handler"); + // Check if model_id is a local model + let local_path = Path::new(&model_id); + let is_local_model = local_path.exists() && local_path.is_dir(); + + // Download weights for sharded models + if !is_local_model && weights_cache_override.is_none() && num_shard > 1 { + let mut download_argv = vec![ + "text-generation-server".to_string(), + "download-weights".to_string(), + model_id.clone(), + "--extension".to_string(), + ".safetensors".to_string(), + "--logger-level".to_string(), + "INFO".to_string(), + "--json-output".to_string(), + ]; + + // Model optional revision + if let Some(ref revision) = revision { + download_argv.push("--revision".to_string()); + download_argv.push(revision.to_string()) + } + + // Copy current process env + let mut env: Vec<(OsString, OsString)> = env::vars_os().collect(); + + // If huggingface_hub_cache is set, pass it to the shard + // Useful when running inside a docker container + if let Some(ref huggingface_hub_cache) = huggingface_hub_cache { + env.push(("HUGGINGFACE_HUB_CACHE".into(), huggingface_hub_cache.into())); + }; + + // Enable hf transfer for insane download speeds + env.push(("HF_HUB_ENABLE_HF_TRANSFER".into(), 
"1".into())); + + // Start process + tracing::info!("Starting download process."); + let mut download_process = match Popen::create( + &download_argv, + PopenConfig { + stdout: Redirection::Pipe, + stderr: Redirection::Pipe, + // Needed for the shutdown procedure + setpgid: true, + env: Some(env), + ..Default::default() + }, + ) { + Ok(p) => p, + Err(err) => { + if let PopenError::IoError(ref err) = err { + if err.kind() == io::ErrorKind::NotFound { + tracing::error!("text-generation-server not found in PATH"); + tracing::error!("Please install it with `make install-server`") + } + } + return ExitCode::FAILURE; + } + }; + + // Redirect STDOUT to the console + let download_stdout = download_process.stdout.take().unwrap(); + thread::spawn(move || { + // Enter download tracing span + let stdout = BufReader::new(download_stdout); + let _span = tracing::span!(tracing::Level::INFO, "download").entered(); + for line in stdout.lines() { + // Parse loguru logs + if let Ok(value) = serde_json::from_str::(&line.unwrap()) { + if let Some(text) = value.get("text") { + // Format escaped newlines + tracing::info!("{}", text.to_string().replace("\\n", "")); + } + } + } + }); + + loop { + if let Some(status) = download_process.poll() { + match status { + ExitStatus::Exited(exit_code) => { + if exit_code == 0 { + tracing::info!("Successfully downloaded weights."); + break; + } else { + let mut err = String::new(); + download_process + .stderr + .take() + .unwrap() + .read_to_string(&mut err) + .unwrap(); + tracing::error!("Download encountered an error: {err}"); + return ExitCode::FAILURE; + } + } + _ => { + tracing::error!("Download process exited with an unknown status."); + return ExitCode::FAILURE; + } + } + } + if !running.load(Ordering::SeqCst) { + download_process.terminate().unwrap(); + tracing::info!("Waiting for download process to gracefully shutdown"); + download_process + .wait_timeout(Duration::from_secs(90)) + .unwrap(); + tracing::info!("Download process terminated"); + return ExitCode::SUCCESS; + } + sleep(Duration::from_millis(100)); + } + } + // Shared shutdown bool let shutdown = Arc::new(Mutex::new(false)); // Shared shutdown channel @@ -99,6 +297,8 @@ fn main() -> ExitCode { let revision = revision.clone(); let uds_path = shard_uds_path.clone(); let master_addr = master_addr.clone(); + let huggingface_hub_cache = huggingface_hub_cache.clone(); + let weights_cache_override = weights_cache_override.clone(); let status_sender = status_sender.clone(); let shutdown = shutdown.clone(); let shutdown_sender = shutdown_sender.clone(); @@ -113,6 +313,11 @@ fn main() -> ExitCode { num_shard, master_addr, master_port, + huggingface_hub_cache, + weights_cache_override, + disable_custom_kernels, + watermark_gamma, + watermark_delta, otlp_endpoint, status_sender, shutdown, @@ -161,8 +366,14 @@ fn main() -> ExitCode { "text-generation-router".to_string(), "--max-concurrent-requests".to_string(), max_concurrent_requests.to_string(), + "--max-best-of".to_string(), + max_best_of.to_string(), + "--max-stop-sequences".to_string(), + max_stop_sequences.to_string(), "--max-input-length".to_string(), max_input_length.to_string(), + "--max-total-tokens".to_string(), + max_total_tokens.to_string(), "--max-batch-size".to_string(), max_batch_size.to_string(), "--max-waiting-tokens".to_string(), @@ -185,6 +396,12 @@ fn main() -> ExitCode { argv.push(otlp_endpoint); } + // CORS origins + for origin in cors_allow_origin.into_iter() { + argv.push("--cors-allow-origin".to_string()); + argv.push(origin); + } + let mut 
webserver = match Popen::create( &argv, PopenConfig { @@ -232,7 +449,7 @@ fn main() -> ExitCode { while running.load(Ordering::SeqCst) { if let Ok(ShardStatus::Failed((rank, err))) = status_receiver.try_recv() { - tracing::error!("Shard {} failed:\n{}", rank, err); + tracing::error!("Shard {rank} failed:\n{err}"); exit_code = ExitCode::FAILURE; break; }; @@ -275,6 +492,11 @@ fn shard_manager( world_size: usize, master_addr: String, master_port: usize, + huggingface_hub_cache: Option, + weights_cache_override: Option, + disable_custom_kernels: bool, + watermark_gamma: Option, + watermark_delta: Option, otlp_endpoint: Option, status_sender: mpsc::Sender, shutdown: Arc>, @@ -319,43 +541,54 @@ fn shard_manager( shard_argv.push(otlp_endpoint); } - let mut env = vec![ - ("RANK".into(), rank.to_string().into()), - ("WORLD_SIZE".into(), world_size.to_string().into()), - ("MASTER_ADDR".into(), master_addr.into()), - ("MASTER_PORT".into(), master_port.to_string().into()), - ("SAFETENSORS_FAST_GPU".into(), "1".into()), - ("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()), - ]; + // Copy current process env + let mut env: Vec<(OsString, OsString)> = env::vars_os().collect(); - // If the HUGGINGFACE_HUB_CACHE env var is set, pass it to the shard + // Torch Distributed Env vars + env.push(("RANK".into(), rank.to_string().into())); + env.push(("WORLD_SIZE".into(), world_size.to_string().into())); + env.push(("MASTER_ADDR".into(), master_addr.into())); + env.push(("MASTER_PORT".into(), master_port.to_string().into())); + env.push(("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into())); + + // Safetensors load fast + env.push(("SAFETENSORS_FAST_GPU".into(), "1".into())); + + // Enable hf transfer for insane download speeds + env.push(("HF_HUB_ENABLE_HF_TRANSFER".into(), "1".into())); + + // If huggingface_hub_cache is some, pass it to the shard // Useful when running inside a docker container - if let Ok(huggingface_hub_cache) = env::var("HUGGINGFACE_HUB_CACHE") { + if let Some(huggingface_hub_cache) = huggingface_hub_cache { env.push(("HUGGINGFACE_HUB_CACHE".into(), huggingface_hub_cache.into())); }; - // If the WEIGHTS_CACHE_OVERRIDE env var is set, pass it to the shard + // If weights_cache_override is some, pass it to the shard // Useful when running inside a HuggingFace Inference Endpoint - if let Ok(weights_cache_override) = env::var("WEIGHTS_CACHE_OVERRIDE") { + if let Some(weights_cache_override) = weights_cache_override { env.push(( "WEIGHTS_CACHE_OVERRIDE".into(), weights_cache_override.into(), )); }; - // If the NCCL_SHM_DISABLE env var is set, pass it to the shard - // needed when running NCCL inside a docker container and when you can't increase shm size - if let Ok(nccl_shm_disalbe) = env::var("NCCL_SHM_DISABLE") { - env.push(("NCCL_SHM_DISABLE".into(), nccl_shm_disalbe.into())); - }; + // If disable_custom_kernels is true, pass it to the shard as an env var + if disable_custom_kernels { + env.push(("DISABLE_CUSTOM_KERNELS".into(), "True".into())) + } - // If the CUDA_VISIBLE_DEVICES env var is set, pass it to the shard - if let Ok(cuda_visible_devices) = env::var("CUDA_VISIBLE_DEVICES") { - env.push(("CUDA_VISIBLE_DEVICES".into(), cuda_visible_devices.into())); - }; + // Watermark Gamma + if let Some(watermark_gamma) = watermark_gamma { + env.push(("WATERMARK_GAMMA".into(), watermark_gamma.to_string().into())) + } + + // Watermark Delta + if let Some(watermark_delta) = watermark_delta { + env.push(("WATERMARK_DELTA".into(), watermark_delta.to_string().into())) + } // Start process - 
tracing::info!("Starting shard {}", rank); + tracing::info!("Starting shard {rank}"); let mut p = match Popen::create( &shard_argv, PopenConfig { @@ -419,17 +652,17 @@ fn shard_manager( if *shutdown.lock().unwrap() { p.terminate().unwrap(); let _ = p.wait_timeout(Duration::from_secs(90)); - tracing::info!("Shard {} terminated", rank); + tracing::info!("Shard {rank} terminated"); return; } // Shard is ready if uds.exists() && !ready { - tracing::info!("Shard {} ready in {:?}", rank, start_time.elapsed()); + tracing::info!("Shard {rank} ready in {:?}", start_time.elapsed()); status_sender.send(ShardStatus::Ready).unwrap(); ready = true; } else if !ready && wait_time.elapsed() > Duration::from_secs(10) { - tracing::info!("Waiting for shard {} to be ready...", rank); + tracing::info!("Waiting for shard {rank} to be ready..."); wait_time = Instant::now(); } sleep(Duration::from_millis(100)); @@ -449,3 +682,11 @@ fn shutdown_shards(shutdown: Arc>, shutdown_receiver: &mpsc::Receive // This will block till all shutdown_sender are dropped let _ = shutdown_receiver.recv(); } + +fn num_cuda_devices() -> Option { + if let Ok(cuda_visible_devices) = env::var("CUDA_VISIBLE_DEVICES") { + let n_devices = cuda_visible_devices.split(',').count(); + return Some(n_devices); + } + None +} diff --git a/launcher/tests/bloom_560m.json b/launcher/tests/bloom_560m.json index 17e2571e..96f89f6b 100644 --- a/launcher/tests/bloom_560m.json +++ b/launcher/tests/bloom_560m.json @@ -1,122 +1,142 @@ { + "generated_text": ".get(\"action\");\n if (action == null) {\n throw new RuntimeException", "details": { "finish_reason": "length", "generated_tokens": 20, + "seed": null, "prefill": [ { "id": 10264, - "logprob": null, - "text": "Test" + "text": "Test", + "logprob": null }, { "id": 8821, - "logprob": -11.894989, - "text": " request" + "text": " request", + "logprob": -11.894989 } ], - "seed": null, "tokens": [ { "id": 17, + "text": ".", "logprob": -1.8267672, - "text": "." 
+ "special": false }, { "id": 1587, + "text": "get", "logprob": -2.4674969, - "text": "get" + "special": false }, { "id": 11, + "text": "(", "logprob": -1.906001, - "text": "(" + "special": false }, { "id": 5, + "text": "\"", "logprob": -1.2279545, - "text": "\"" + "special": false }, { "id": 4899, + "text": "action", "logprob": -4.170299, - "text": "action" + "special": false }, { "id": 5, + "text": "\"", "logprob": -0.32478866, - "text": "\"" + "special": false }, { "id": 12, + "text": ")", "logprob": -1.0773665, - "text": ")" + "special": false }, { "id": 30, + "text": ";", "logprob": -0.27640742, - "text": ";" + "special": false }, { "id": 837, + "text": "\n ", "logprob": -1.6970354, - "text": "\n " + "special": false }, { "id": 1320, + "text": " if", "logprob": -1.4495516, - "text": " if" + "special": false }, { "id": 375, + "text": " (", "logprob": -0.23609057, - "text": " (" + "special": false }, { "id": 4899, + "text": "action", "logprob": -1.1916996, - "text": "action" + "special": false }, { "id": 3535, + "text": " ==", "logprob": -0.8918753, - "text": " ==" + "special": false }, { "id": 5109, + "text": " null", "logprob": -0.3933342, - "text": " null" + "special": false }, { "id": 12, + "text": ")", "logprob": -0.43212673, - "text": ")" + "special": false }, { "id": 731, + "text": " {", "logprob": -0.17702064, - "text": " {" + "special": false }, { "id": 1260, + "text": "\n ", "logprob": -0.07027565, - "text": "\n " + "special": false }, { "id": 10519, + "text": " throw", "logprob": -1.3915029, - "text": " throw" + "special": false }, { "id": 2084, + "text": " new", "logprob": -0.04201372, - "text": " new" + "special": false }, { "id": 150858, + "text": " RuntimeException", "logprob": -1.7329919, - "text": " RuntimeException" + "special": false } ] - }, - "generated_text": ".get(\"action\");\n if (action == null) {\n throw new RuntimeException" + } } \ No newline at end of file diff --git a/launcher/tests/integration_tests.rs b/launcher/tests/integration_tests.rs index b70b1628..0d2b6c74 100644 --- a/launcher/tests/integration_tests.rs +++ b/launcher/tests/integration_tests.rs @@ -14,6 +14,7 @@ pub struct Token { id: u32, text: String, logprob: Option, + special: bool, } #[derive(Deserialize)] @@ -136,6 +137,7 @@ fn compare_results(result: GeneratedText, expected: GeneratedText) { { assert_eq!(token.id, expected_token.id); assert_eq!(token.text, expected_token.text); + assert_eq!(token.special, expected_token.special); if let Some(logprob) = token.logprob { let expected_logprob = expected_token.logprob.unwrap(); assert_float_eq!(logprob, expected_logprob, abs <= 0.001); diff --git a/launcher/tests/mt0_base.json b/launcher/tests/mt0_base.json index cee3bc47..22c9499f 100644 --- a/launcher/tests/mt0_base.json +++ b/launcher/tests/mt0_base.json @@ -1,117 +1,137 @@ { + "generated_text": "\"\"\"Test the contents of the contents of the contents. 
\"\"\" test_test", "details": { "finish_reason": "length", "generated_tokens": 20, + "seed": null, "prefill": [ { "id": 0, - "logprob": null, - "text": "" + "text": "", + "logprob": null } ], - "seed": null, "tokens": [ { "id": 259, + "text": " ", "logprob": -1.3656927, - "text": "" + "special": false }, { "id": 215100, + "text": "\"\"\"", "logprob": -2.6551573, - "text": "\"\"\"" + "special": false }, { "id": 46138, + "text": "Test", "logprob": -1.8059857, - "text": "Test" + "special": false }, { "id": 287, + "text": " the", "logprob": -1.2102449, - "text": "the" + "special": false }, { "id": 259, + "text": " ", "logprob": -1.6057279, - "text": "" + "special": false }, { "id": 49076, + "text": "contents", "logprob": -3.6060903, - "text": "contents" + "special": false }, { "id": 304, + "text": " of", "logprob": -0.5270343, - "text": "of" + "special": false }, { "id": 287, + "text": " the", "logprob": -0.62522805, - "text": "the" + "special": false }, { "id": 259, + "text": " ", "logprob": -1.4069618, - "text": "" + "special": false }, { "id": 49076, + "text": "contents", "logprob": -2.621994, - "text": "contents" + "special": false }, { "id": 304, + "text": " of", "logprob": -1.3172221, - "text": "of" + "special": false }, { "id": 287, + "text": " the", "logprob": -0.3501925, - "text": "the" + "special": false }, { "id": 259, + "text": " ", "logprob": -0.7219573, - "text": "" + "special": false }, { "id": 49076, + "text": "contents", "logprob": -1.0494149, - "text": "contents" + "special": false }, { "id": 260, + "text": ".", "logprob": -1.0803378, - "text": "." + "special": false }, { "id": 259, + "text": " ", "logprob": -0.32933083, - "text": "" + "special": false }, { "id": 215100, + "text": "\"\"\"", "logprob": -0.11268901, - "text": "\"\"\"" + "special": false }, { "id": 2978, + "text": " test", "logprob": -1.5846587, - "text": "test" + "special": false }, { "id": 290, + "text": "_", "logprob": -0.49796978, - "text": "_" + "special": false }, { "id": 4125, + "text": "test", "logprob": -2.0026445, - "text": "test" + "special": false } ] - }, - "generated_text": "\"\"\"Test the contents of the contents of the contents. 
\"\"\" test_test" + } } \ No newline at end of file diff --git a/proto/generate.proto b/proto/generate.proto index 0c4f9626..5081ce1c 100644 --- a/proto/generate.proto +++ b/proto/generate.proto @@ -34,12 +34,16 @@ message NextTokenChooserParameters { uint32 top_k = 2; /// restricting to top tokens summing to prob_cut_off <= prob_cut_off float top_p = 3; + /// restricting to top tokens summing to prob_cut_off <= prob_cut_off + float typical_p = 4; /// apply sampling on the logits - bool do_sample = 4; + bool do_sample = 5; /// random seed for sampling - uint64 seed = 5; + uint64 seed = 6; /// repetition penalty - float repetition_penalty = 6; + float repetition_penalty = 7; + /// token watermarking using "A Watermark for Large Language Models" + bool watermark = 8; } message StoppingCriteriaParameters { @@ -54,12 +58,10 @@ message Request { uint64 id = 1; /// The generation context string inputs = 2; - /// The number of tokens inside inputs - uint32 input_length = 3; /// Next Token Chooser Parameters - NextTokenChooserParameters parameters = 4; + NextTokenChooserParameters parameters = 3; /// Stopping Criteria Parameters - StoppingCriteriaParameters stopping_parameters = 5; + StoppingCriteriaParameters stopping_parameters = 4; } message Batch { @@ -108,8 +110,10 @@ message Generation { float token_logprob = 4; /// Text string token_text = 5; + /// Is it a special token + bool token_is_special = 6; /// Complete generated text - GeneratedText generated_text = 6; + GeneratedText generated_text = 7; } message PrefillRequest { diff --git a/router/Cargo.toml b/router/Cargo.toml index f1ace790..292e6219 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "text-generation-router" -version = "0.2.1" +version = "0.4.0" edition = "2021" authors = ["Olivier Dehaene"] description = "Text Generation Webserver" @@ -19,17 +19,21 @@ axum-tracing-opentelemetry = "0.9.0" text-generation-client = { path = "client" } clap = { version = "4.1.4", features = ["derive", "env"] } futures = "0.3.26" +metrics = "0.20.1" +metrics-exporter-prometheus = { version = "0.11.0", features = [] } nohash-hasher = "0.2.0" opentelemetry = { version = "0.18.0", features = ["rt-tokio"] } opentelemetry-otlp = "0.11.0" parking_lot = "0.12.1" rand = "0.8.5" +reqwest = { version = "0.11.14", features = [] } serde = "1.0.152" serde_json = "1.0.93" thiserror = "1.0.38" tokenizers = "0.13.2" tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } tokio-stream = "0.1.11" +tower-http = { version = "0.3.5", features = ["cors"] } tracing = "0.1.37" tracing-opentelemetry = "0.18.0" tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] } diff --git a/router/client/Cargo.toml b/router/client/Cargo.toml index 7c7ed48e..fd43b63b 100644 --- a/router/client/Cargo.toml +++ b/router/client/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "text-generation-client" -version = "0.2.1" +version = "0.4.0" edition = "2021" [dependencies] diff --git a/router/grpc-metadata/Cargo.toml b/router/grpc-metadata/Cargo.toml index 311092e3..f8676d67 100644 --- a/router/grpc-metadata/Cargo.toml +++ b/router/grpc-metadata/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "grpc-metadata" -version = "0.1.0" +version = "0.4.0" edition = "2021" [dependencies] diff --git a/router/src/infer.rs b/router/src/infer.rs index 4e368492..ae151d8a 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -1,9 +1,9 @@ /// Batching and inference logic use crate::validation::{Validation, 
ValidationError};
-use crate::GenerateRequest;
 use crate::{Entry, Queue, Token};
+use crate::{GenerateRequest, PrefillToken};
+use futures::future::try_join_all;
 use nohash_hasher::IntMap;
-use std::future::Future;
 use std::sync::Arc;
 use text_generation_client::{
     Batch, ClientError, GeneratedText, Generation, PrefillTokens, ShardedClient,
@@ -81,6 +81,7 @@ impl Infer {
             .limit_concurrent_requests
             .try_acquire_owned()
             .map_err(|err| {
+                metrics::increment_counter!("tgi_request_failure", "err" => "overloaded");
                 tracing::error!("{err}");
                 err
             })?;
@@ -138,7 +139,7 @@ impl Infer {
                     .into_iter()
                     .zip(tokens.logprobs.into_iter())
                     .zip(tokens.texts.into_iter())
-                    .map(|((id, logprob), text)| Token { id, text, logprob })
+                    .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
                     .collect();
             }
             // Push last token
@@ -172,10 +173,48 @@ impl Infer {
             })
         } else {
             let err = InferError::IncompleteGeneration;
+            metrics::increment_counter!("tgi_request_failure", "err" => "incomplete");
             tracing::error!("{err}");
             Err(err)
         }
     }
+
+    /// Add best_of new requests to the queue and return an InferResponse of the sequence with
+    /// the highest log probability per token
+    #[instrument(skip(self))]
+    pub(crate) async fn generate_best_of(
+        &self,
+        request: GenerateRequest,
+        best_of: usize,
+    ) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
+        // validate best_of parameter separately
+        let best_of = self.validation.validate_best_of(best_of)?;
+
+        // create multiple generate requests
+        let mut infer_responses: Vec<InferResponse> =
+            try_join_all((0..best_of).map(|_| self.generate(request.clone()))).await?;
+
+        // get the sequence with the highest log probability per token
+        let mut max_index = 0;
+        let mut max_logprob: f32 = f32::MIN;
+
+        for (i, response) in infer_responses.iter().enumerate() {
+            // mean logprobs of the generated tokens
+            let sequence_logprob = response
+                .tokens
+                .iter()
+                .map(|token| token.logprob)
+                .sum::<f32>()
+                / response.tokens.len() as f32;
+
+            // set best sequence
+            if sequence_logprob > max_logprob {
+                max_index = i;
+                max_logprob = sequence_logprob;
+            }
+        }
+        let best_response = infer_responses.remove(max_index);
+        Ok((best_response, infer_responses))
+    }
 }

 /// Batching logic
@@ -190,7 +229,11 @@ async fn batching_task(
     shared: Arc<Shared>,
 ) {
     // Minimum batch size after which we try to add more requests
-    let limit_min_batch_size = (max_batch_size / 2) as u32;
+    let limit_min_batch_size = if max_batch_size > 1 {
+        (max_batch_size / 2) as u32
+    } else {
+        0
+    };

     // Infinite loop
     loop {
@@ -201,7 +244,7 @@ async fn batching_task(
         // This batch might be smaller than the maximum batch size if there are not enough requests
         // waiting in the queue
         while let Some((mut entries, batch, span)) = queue.next_batch(None, max_batch_size).await {
-            let mut cached_batch = wrap_future(client.prefill(batch), &mut entries)
+            let mut cached_batch = prefill(&mut client, batch, &mut entries)
                 .instrument(span)
                 .await;
             let mut waiting_tokens = 1;
@@ -212,6 +255,7 @@ async fn batching_task(
                 // Get current batch info
                 let batch_size = batch.size;
                 let mut batches = vec![batch];
+                metrics::gauge!("tgi_batch_current_size", batch_size as f64);

                 // If the current batch is too small, we try to add more requests to it
                 if batch_size <= limit_min_batch_size {
@@ -234,17 +278,17 @@ async fn batching_task(
                             // because a new batch is being computed
                             let entry_waiting_span = info_span!(parent: &entry.span, "waiting", batch_size = new_batch_size);
-                            // Add relationship
+                            // Add relationships
+                            span.follows_from(&entry_waiting_span);
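+                            // (The link is now recorded in both directions: the batch span
+                            // points at each entry's waiting span, and each waiting span
+                            // below still points back at the batch span.)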
                             entry_waiting_span.follows_from(&span);
                             // Update entry
                             entry.temp_span = Some(entry_waiting_span);
                         });

                         // Generate one token for this new batch to have the attention past in cache
-                        let new_cached_batch =
-                            wrap_future(client.prefill(new_batch), &mut new_entries)
-                                .instrument(span)
-                                .await;
+                        let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries)
+                            .instrument(span)
+                            .await;
                         // Reset waiting counter
                         waiting_tokens = 1;
                         // Extend current batch with the new batch
@@ -262,35 +306,66 @@ async fn batching_task(
                     // Create a new span to link the batch back to this entry
                     let entry_batch_span = info_span!(parent: &entry.span, "infer", batch_size = next_batch_size);
-                    // Add relationship
+                    // Add relationships
+                    next_batch_span.follows_from(&entry_batch_span);
                     entry_batch_span.follows_from(&next_batch_span);
                     // Update entry
                     entry.temp_span = Some(entry_batch_span);
                 });

-                cached_batch = wrap_future(client.decode(batches), &mut entries)
+                cached_batch = decode(&mut client, batches, &mut entries)
                     .instrument(next_batch_span)
                     .await;
                 waiting_tokens += 1;
             }
+            metrics::gauge!("tgi_batch_current_size", 0.0);
         }
     }
 }

-/// Wrap a future inside a match statement to handle errors and send the responses to Infer
 #[instrument(skip_all)]
-async fn wrap_future(
-    future: impl Future<Output = Result<(Vec<Generation>, Option<Batch>), ClientError>>,
+async fn prefill(
+    client: &mut ShardedClient,
+    batch: Batch,
     entries: &mut IntMap<u64, Entry>,
 ) -> Option<Batch> {
-    match future.await {
+    let start_time = Instant::now();
+
+    match client.prefill(batch).await {
         Ok((generations, next_batch)) => {
             send_generations(generations, entries);
+            metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed(), "method" => "prefill");
+            metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill");
             next_batch
         }
         // If we have an error, we discard the whole batch
         Err(err) => {
             send_errors(err, entries);
+            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill");
+            None
+        }
+    }
+}
+
+#[instrument(skip_all)]
+async fn decode(
+    client: &mut ShardedClient,
+    batches: Vec<Batch>,
+    entries: &mut IntMap<u64, Entry>,
+) -> Option<Batch> {
+    let start_time = Instant::now();
+
+    match client.decode(batches).await {
+        Ok((generations, next_batch)) => {
+            send_generations(generations, entries);
+            metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed(), "method" => "decode");
+            metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode");
+            next_batch
+        }
+        // If we have an error, we discard the whole batch
+        Err(err) => {
+            send_errors(err, entries);
+            metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode");
             None
         }
     }
 }
@@ -303,6 +378,7 @@ fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
         // Create and enter a span to link this function back to the entry
         let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
         let err = InferError::GenerationError(error.to_string());
+        metrics::increment_counter!("tgi_request_failure", "err" => "generation");
         tracing::error!("{err}");
        // unwrap_or is valid here as we don't care if the receiver is gone. 
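The selection rule in `generate_best_of` above is worth spelling out: candidates are ranked by mean per-token log probability rather than the total, and the arg-max wins. Below is a minimal, self-contained sketch of that rule, with plain `Vec<f32>` logprob lists standing in for `InferResponse`; `select_best` and the small `main` harness are illustrative names, not part of this patch.

    // Standalone illustration of the `best_of` ranking used above: keep the
    // candidate whose tokens have the highest mean log probability.
    fn select_best(mut candidates: Vec<Vec<f32>>) -> Option<Vec<f32>> {
        if candidates.is_empty() {
            return None;
        }
        let mut max_index = 0;
        let mut max_logprob = f32::MIN;
        for (i, logprobs) in candidates.iter().enumerate() {
            // Averaging normalizes for length, so longer sequences are not
            // punished for accumulating more negative logprob mass.
            let mean = logprobs.iter().sum::<f32>() / logprobs.len() as f32;
            // Strict `>` keeps the earliest candidate on ties, as in the diff.
            if mean > max_logprob {
                max_index = i;
                max_logprob = mean;
            }
        }
        Some(candidates.swap_remove(max_index))
    }

    fn main() {
        let best = select_best(vec![
            vec![-0.9, -1.2, -0.5], // mean ~ -0.87
            vec![-0.2, -0.3],       // mean = -0.25, selected
        ]);
        assert_eq!(best, Some(vec![-0.2, -0.3]));
    }

Two properties carry over from the real method: the strict comparison keeps the lowest-index candidate on ties, and the non-winning responses remain available, which is what feeds `best_of_sequences` in the details.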
@@ -340,6 +416,7 @@ fn send_generations(generations: Vec, entries: &mut IntMap, + pub(crate) prefill: Vec, pub(crate) tokens: Vec, pub(crate) generated_text: GeneratedText, pub(crate) queued: Instant, @@ -406,3 +483,14 @@ pub enum InferError { #[error("Incomplete generation")] IncompleteGeneration, } + +impl InferError { + pub(crate) fn error_type(&self) -> &str { + match self { + InferError::GenerationError(_) => "generation", + InferError::Overloaded(_) => "overloaded", + InferError::ValidationError(_) => "validation", + InferError::IncompleteGeneration => "incomplete_generation", + } + } +} diff --git a/router/src/lib.rs b/router/src/lib.rs index 8e3199dd..91b4417c 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -12,6 +12,9 @@ use validation::Validation; #[derive(Clone, Debug, Deserialize, ToSchema)] pub(crate) struct GenerateParameters { + #[serde(default)] + #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 1)] + pub best_of: Option, #[serde(default)] #[schema( exclusive_minimum = 0.0, @@ -40,39 +43,64 @@ pub(crate) struct GenerateParameters { example = 0.95 )] pub top_p: Option, - #[serde(default = "default_do_sample")] + #[serde(default)] + #[schema( + exclusive_minimum = 0.0, + maximum = 1.0, + nullable = true, + default = "null", + example = 0.95 + )] + pub typical_p: Option, + #[serde(default)] #[schema(default = "false", example = true)] pub do_sample: bool, #[serde(default = "default_max_new_tokens")] #[schema(exclusive_minimum = 0, exclusive_maximum = 512, default = "20")] pub max_new_tokens: u32, #[serde(default)] - #[schema(inline, max_items = 4, example = json!(["photographer"]))] + #[schema(nullable = true, default = "null", example = false)] + pub return_full_text: Option, + #[serde(default)] + #[schema(inline, max_items = 4, example = json ! 
(["photographer"]))] pub stop: Vec, #[serde(default)] + #[schema(nullable = true, default = "null", example = "null")] + pub truncate: Option, + #[serde(default)] + #[schema(default = "false", example = true)] + pub watermark: bool, + #[serde(default)] #[schema(default = "true")] pub details: bool, #[serde(default)] + #[schema( + exclusive_minimum = 0, + nullable = true, + default = "null", + example = "null" + )] pub seed: Option, } -fn default_do_sample() -> bool { - false -} - fn default_max_new_tokens() -> u32 { 20 } fn default_parameters() -> GenerateParameters { GenerateParameters { + best_of: None, temperature: None, repetition_penalty: None, top_k: None, top_p: None, - do_sample: default_do_sample(), + typical_p: None, + do_sample: false, max_new_tokens: default_max_new_tokens(), - stop: vec![], + return_full_text: None, + stop: Vec::new(), + truncate: None, + watermark: false, details: false, seed: None, } @@ -86,14 +114,46 @@ pub(crate) struct GenerateRequest { pub parameters: GenerateParameters, } +#[derive(Clone, Debug, Deserialize, ToSchema)] +pub(crate) struct CompatGenerateRequest { + #[schema(example = "My name is Olivier and I")] + pub inputs: String, + #[serde(default = "default_parameters")] + pub parameters: GenerateParameters, + #[serde(default)] + #[allow(dead_code)] + pub stream: bool, +} + +impl From for GenerateRequest { + fn from(req: CompatGenerateRequest) -> Self { + Self { + inputs: req.inputs, + parameters: req.parameters, + } + } +} + +#[derive(Debug, Serialize, ToSchema)] +pub struct PrefillToken { + #[schema(example = 0)] + id: u32, + #[schema(example = "test")] + text: String, + #[schema(nullable = true, example = - 0.34)] + logprob: f32, +} + #[derive(Debug, Serialize, ToSchema)] pub struct Token { #[schema(example = 0)] id: u32, #[schema(example = "test")] text: String, - #[schema(nullable = true, example = -0.34)] + #[schema(nullable = true, example = - 0.34)] logprob: f32, + #[schema(example = "false")] + special: bool, } #[derive(Serialize, ToSchema)] @@ -108,16 +168,32 @@ pub(crate) enum FinishReason { StopSequence, } +#[derive(Serialize, ToSchema)] +pub(crate) struct BestOfSequence { + #[schema(example = "test")] + pub generated_text: String, + #[schema(example = "length")] + pub finish_reason: FinishReason, + #[schema(example = 1)] + pub generated_tokens: u32, + #[schema(nullable = true, example = 42)] + pub seed: Option, + pub prefill: Vec, + pub tokens: Vec, +} + #[derive(Serialize, ToSchema)] pub(crate) struct Details { #[schema(example = "length")] pub finish_reason: FinishReason, #[schema(example = 1)] pub generated_tokens: u32, - #[schema(example = 42)] + #[schema(nullable = true, example = 42)] pub seed: Option, - pub prefill: Option>, - pub tokens: Option>, + pub prefill: Vec, + pub tokens: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub best_of_sequences: Option>, } #[derive(Serialize, ToSchema)] @@ -134,7 +210,7 @@ pub(crate) struct StreamDetails { pub finish_reason: FinishReason, #[schema(example = 1)] pub generated_tokens: u32, - #[schema(example = 42)] + #[schema(nullable = true, example = 42)] pub seed: Option, } @@ -149,6 +225,6 @@ pub(crate) struct StreamResponse { #[derive(Serialize, ToSchema)] pub(crate) struct ErrorResponse { - #[schema(inline)] pub error: String, + pub error_type: String, } diff --git a/router/src/main.rs b/router/src/main.rs index 881d94fb..2ccf66b3 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -1,4 +1,5 @@ /// Text Generation Inference webserver entrypoint +use 
axum::http::HeaderValue; use clap::Parser; use opentelemetry::sdk::propagation::TraceContextPropagator; use opentelemetry::sdk::trace; @@ -7,9 +8,11 @@ use opentelemetry::sdk::Resource; use opentelemetry::{global, KeyValue}; use opentelemetry_otlp::WithExportConfig; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; +use std::path::Path; use text_generation_client::ShardedClient; use text_generation_router::server; use tokenizers::Tokenizer; +use tower_http::cors::AllowOrigin; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; use tracing_subscriber::{EnvFilter, Layer}; @@ -20,8 +23,14 @@ use tracing_subscriber::{EnvFilter, Layer}; struct Args { #[clap(default_value = "128", long, env)] max_concurrent_requests: usize, + #[clap(default_value = "2", long, env)] + max_best_of: usize, + #[clap(default_value = "4", long, env)] + max_stop_sequences: usize, #[clap(default_value = "1000", long, env)] max_input_length: usize, + #[clap(default_value = "1512", long, env)] + max_total_tokens: usize, #[clap(default_value = "32", long, env)] max_batch_size: usize, #[clap(default_value = "20", long, env)] @@ -38,6 +47,8 @@ struct Args { json_output: bool, #[clap(long, env)] otlp_endpoint: Option, + #[clap(long, env)] + cors_allow_origin: Option>, } fn main() -> Result<(), std::io::Error> { @@ -46,7 +57,10 @@ fn main() -> Result<(), std::io::Error> { // Pattern match configuration let Args { max_concurrent_requests, + max_best_of, + max_stop_sequences, max_input_length, + max_total_tokens, max_batch_size, max_waiting_tokens, port, @@ -55,17 +69,37 @@ fn main() -> Result<(), std::io::Error> { validation_workers, json_output, otlp_endpoint, + cors_allow_origin, } = args; if validation_workers == 0 { panic!("validation_workers must be > 0"); } - // Download and instantiate tokenizer + // CORS allowed origins + // map to go inside the option and then map to parse from String to HeaderValue + // Finally, convert to AllowOrigin + let cors_allow_origin: Option = cors_allow_origin.map(|cors_allow_origin| { + AllowOrigin::list( + cors_allow_origin + .iter() + .map(|origin| origin.parse::().unwrap()), + ) + }); + + // Tokenizer instance // This will only be used to validate payloads - // - // We need to download it outside of the Tokio runtime - let tokenizer = Tokenizer::from_pretrained(tokenizer_name, None).unwrap(); + let local_path = Path::new(&tokenizer_name); + let tokenizer = + if local_path.exists() && local_path.is_dir() && local_path.join("tokenizer.json").exists() + { + // Load local tokenizer + Tokenizer::from_file(local_path.join("tokenizer.json")).unwrap() + } else { + // Download and instantiate tokenizer + // We need to download it outside of the Tokio runtime + Tokenizer::from_pretrained(tokenizer_name.clone(), None).unwrap() + }; // Launch Tokio runtime tokio::runtime::Builder::new_multi_thread() @@ -75,6 +109,27 @@ fn main() -> Result<(), std::io::Error> { .block_on(async { init_logging(otlp_endpoint, json_output); + // Get pipeline tag + let model_info = reqwest::get(format!( + "https://huggingface.co/api/models/{tokenizer_name}" + )) + .await + .expect("Could not connect to hf.co") + .text() + .await + .expect("error when retrieving model info from hf.co"); + let model_info: serde_json::Value = + serde_json::from_str(&model_info).expect("unable to parse model info"); + + // if pipeline-tag == text-generation we default to return_full_text = true + let compat_return_full_text = match model_info.get("pipeline_tag") { + None => { + tracing::warn!("no pipeline 
tag found for model {tokenizer_name}"); + false + } + Some(pipeline_tag) => pipeline_tag.as_str() == Some("text-generation"), + }; + // Instantiate sharded client from the master unix socket let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path) .await @@ -91,14 +146,19 @@ fn main() -> Result<(), std::io::Error> { // Run server server::run( + compat_return_full_text, max_concurrent_requests, + max_best_of, + max_stop_sequences, max_input_length, + max_total_tokens, max_batch_size, max_waiting_tokens, sharded_client, tokenizer, validation_workers, addr, + cors_allow_origin, ) .await; Ok(()) diff --git a/router/src/queue.rs b/router/src/queue.rs index b155a1af..df2087e1 100644 --- a/router/src/queue.rs +++ b/router/src/queue.rs @@ -132,6 +132,7 @@ impl State { // Push entry in the queue self.entries.push((self.next_id, entry)); self.next_id += 1; + metrics::increment_gauge!("tgi_queue_size", 1.0); } // Get the next batch @@ -164,7 +165,8 @@ impl State { // Create a new span to link the batch back to this entry let entry_batch_span = info_span!(parent: &entry.span, "infer", batch_size = next_batch_size); - // Add relationship + // Add relationships + next_batch_span.follows_from(&entry_batch_span); entry_batch_span.follows_from(&next_batch_span); // Update entry entry.temp_span = Some(entry_batch_span); @@ -172,7 +174,6 @@ impl State { batch_requests.push(Request { id, inputs: entry.request.inputs.clone(), - input_length: entry.request.input_length, parameters: Some(entry.request.parameters.clone()), stopping_parameters: Some(entry.request.stopping_parameters.clone()), }); @@ -190,6 +191,8 @@ impl State { // Increment batch id self.next_batch_id += 1; + metrics::gauge!("tgi_queue_size", self.entries.len() as f64); + metrics::histogram!("tgi_batch_next_size", batch.size as f64); Some((batch_entries, batch, next_batch_span)) } } @@ -223,14 +226,15 @@ mod tests { Entry { request: ValidGenerateRequest { inputs: "".to_string(), - input_length: 0, parameters: NextTokenChooserParameters { temperature: 0.0, top_k: 0, top_p: 0.0, + typical_p: 0.0, do_sample: false, seed: 0, repetition_penalty: 0.0, + watermark: false, }, stopping_parameters: StoppingCriteriaParameters { max_new_tokens: 0, diff --git a/router/src/server.rs b/router/src/server.rs index 432586bb..3b63ec8a 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1,17 +1,20 @@ /// HTTP Server logic -use crate::infer::{InferError, InferStreamResponse}; +use crate::infer::{InferError, InferResponse, InferStreamResponse}; +use crate::validation::ValidationError; use crate::{ - Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest, GenerateResponse, - Infer, StreamDetails, StreamResponse, Token, Validation, + BestOfSequence, CompatGenerateRequest, Details, ErrorResponse, FinishReason, + GenerateParameters, GenerateRequest, GenerateResponse, Infer, PrefillToken, StreamDetails, + StreamResponse, Token, Validation, }; use axum::extract::Extension; -use axum::http::{HeaderMap, StatusCode}; +use axum::http::{HeaderMap, Method, StatusCode}; use axum::response::sse::{Event, KeepAlive, Sse}; -use axum::response::IntoResponse; +use axum::response::{IntoResponse, Response}; use axum::routing::{get, post}; -use axum::{Json, Router}; +use axum::{http, Json, Router}; use axum_tracing_opentelemetry::opentelemetry_tracing_layer; use futures::Stream; +use metrics_exporter_prometheus::{PrometheusBuilder, PrometheusHandle}; use std::convert::Infallible; use std::net::SocketAddr; use 
text_generation_client::ShardedClient; @@ -19,29 +22,61 @@ use tokenizers::Tokenizer; use tokio::signal; use tokio::time::Instant; use tokio_stream::StreamExt; +use tower_http::cors::{AllowOrigin, CorsLayer}; use tracing::{info_span, instrument, Instrument}; use utoipa::OpenApi; use utoipa_swagger_ui::SwaggerUi; +/// Compatibility route with api-inference and AzureML +#[instrument(skip(infer))] +async fn compat_generate( + default_return_full_text: Extension, + infer: Extension, + req: Json, +) -> Result)> { + let mut req = req.0; + + // default return_full_text given the pipeline_tag + if req.parameters.return_full_text.is_none() { + req.parameters.return_full_text = Some(default_return_full_text.0) + } + + // switch on stream + if req.stream { + Ok(generate_stream(infer, Json(req.into())) + .await + .into_response()) + } else { + let (headers, generation) = generate(infer, Json(req.into())).await?; + // wrap generation inside a Vec to match api-inference + Ok((headers, Json(vec![generation.0])).into_response()) + } +} + /// Health check method #[instrument(skip(infer))] async fn health(infer: Extension) -> Result<(), (StatusCode, Json)> { // TODO: while this is the best health check we can do, it is a bit on the heavy side and might // be a bit too slow for a health check. - // What we should do instead if check if the gRPC channels are still healthy. + // What we should do instead is check if the gRPC channels are still healthy. // Send a small inference request infer .generate(GenerateRequest { inputs: "liveness".to_string(), parameters: GenerateParameters { + best_of: None, temperature: None, repetition_penalty: None, top_k: None, top_p: None, + typical_p: None, do_sample: false, max_new_tokens: 1, + return_full_text: None, stop: Vec::new(), + truncate: None, + watermark: false, details: false, seed: None, }, @@ -57,15 +92,15 @@ async fn health(infer: Extension) -> Result<(), (StatusCode, Json) -> Result<(), (StatusCode, Json, req: Json, -) -> Result)> { +) -> Result<(HeaderMap, Json), (StatusCode, Json)> { let span = tracing::Span::current(); let start_time = Instant::now(); - // Inference + let compute_characters = req.0.inputs.chars().count(); + let mut add_prompt = None; + if req.0.parameters.return_full_text.unwrap_or(false) { + add_prompt = Some(req.0.inputs.clone()); + } + let details = req.0.parameters.details; - let response = infer.generate(req.0).await?; + + // Inference + let (response, best_of_responses) = match req.0.parameters.best_of { + Some(best_of) if best_of > 1 => { + let (response, best_of_responses) = infer.generate_best_of(req.0, best_of).await?; + (response, Some(best_of_responses)) + } + _ => (infer.generate(req.0).await?, None), + }; // Token details let details = match details { - true => Some(Details { - finish_reason: FinishReason::from(response.generated_text.finish_reason), - generated_tokens: response.generated_text.generated_tokens, - prefill: Some(response.prefill), - tokens: Some(response.tokens), - seed: response.generated_text.seed, - }), + true => { + // convert best_of_responses + let best_of_sequences = best_of_responses.map(|responses: Vec| { + responses + .into_iter() + .map(|response: InferResponse| { + // Add prompt if return_full_text + let mut output_text = response.generated_text.text; + if let Some(prompt) = &add_prompt { + output_text = prompt.clone() + &output_text; + } + + BestOfSequence { + generated_text: output_text, + finish_reason: FinishReason::from( + response.generated_text.finish_reason, + ), + generated_tokens: 
response.generated_text.generated_tokens, + prefill: response.prefill, + tokens: response.tokens, + seed: response.generated_text.seed, + } + }) + .collect() + }); + + Some(Details { + finish_reason: FinishReason::from(response.generated_text.finish_reason), + generated_tokens: response.generated_text.generated_tokens, + prefill: response.prefill, + tokens: response.tokens, + seed: response.generated_text.seed, + best_of_sequences, + }) + } false => None, }; @@ -111,6 +187,15 @@ async fn generate( // Headers let mut headers = HeaderMap::new(); + headers.insert("x-compute-type", "gpu+optimized".parse().unwrap()); + headers.insert( + "x-compute-time", + total_time.as_millis().to_string().parse().unwrap(), + ); + headers.insert( + "x-compute-characters", + compute_characters.to_string().parse().unwrap(), + ); headers.insert( "x-total-time", total_time.as_millis().to_string().parse().unwrap(), @@ -141,9 +226,26 @@ async fn generate( span.record("seed", format!("{:?}", response.generated_text.seed)); tracing::info!("Output: {}", response.generated_text.text); + // Metrics + metrics::increment_counter!("tgi_request_success"); + metrics::histogram!("tgi_request_duration", total_time); + metrics::histogram!("tgi_request_validation_duration", validation_time); + metrics::histogram!("tgi_request_queue_duration", queue_time); + metrics::histogram!("tgi_request_inference_duration", inference_time); + metrics::histogram!("tgi_request_mean_time_per_token_duration", time_per_token); + metrics::histogram!( + "tgi_request_generated_tokens", + response.generated_text.generated_tokens as f64 + ); + // Send response + let mut output_text = response.generated_text.text; + if let Some(prompt) = add_prompt { + output_text = prompt + &output_text; + } + let response = GenerateResponse { - generated_text: response.generated_text.text, + generated_text: output_text, details, }; Ok((headers, Json(response))) @@ -156,20 +258,20 @@ async fn generate( path = "/generate_stream", request_body = GenerateRequest, responses( - (status = 200, description = "Generated Text", body = [StreamResponse], - content_type="text/event-stream "), - (status = 424, description = "Generation Error", body = [ErrorResponse], - example = json!({"error": "Request failed during generation"}), - content_type="text/event-stream "), - (status = 429, description = "Model is overloaded", body = [ErrorResponse], - example = json!({"error": "Model is overloaded"}), - content_type="text/event-stream "), - (status = 422, description = "Input validation error", body = [ErrorResponse], - example = json!({"error": "Input validation error"}), - content_type="text/event-stream "), - (status = 500, description = "Incomplete generation", body = [ErrorResponse], - example = json!({"error": "Incomplete generation"}), - content_type="text/event-stream "), + (status = 200, description = "Generated Text", body = StreamResponse, + content_type = "text/event-stream"), + (status = 424, description = "Generation Error", body = ErrorResponse, + example = json ! ({"error": "Request failed during generation"}), + content_type = "text/event-stream"), + (status = 429, description = "Model is overloaded", body = ErrorResponse, + example = json ! ({"error": "Model is overloaded"}), + content_type = "text/event-stream"), + (status = 422, description = "Input validation error", body = ErrorResponse, + example = json ! 
({"error": "Input validation error"}), + content_type = "text/event-stream"), + (status = 500, description = "Incomplete generation", body = ErrorResponse, + example = json ! ({"error": "Incomplete generation"}), + content_type = "text/event-stream"), ) )] #[instrument( @@ -186,118 +288,177 @@ async fn generate( async fn generate_stream( infer: Extension, req: Json, -) -> Sse>> { +) -> ( + HeaderMap, + Sse>>, +) { let span = tracing::Span::current(); let start_time = Instant::now(); + let compute_characters = req.0.inputs.chars().count(); + + let mut headers = HeaderMap::new(); + headers.insert("x-compute-type", "gpu+optimized".parse().unwrap()); + headers.insert( + "x-compute-characters", + compute_characters.to_string().parse().unwrap(), + ); + let stream = async_stream::stream! { // Inference let mut end_reached = false; let mut error = false; + + let mut add_prompt = None; + if req.0.parameters.return_full_text.unwrap_or(false) { + add_prompt = Some(req.0.inputs.clone()); + } let details = req.0.parameters.details; - match infer.generate_stream(req.0).instrument(info_span!(parent: &span, "async_stream")).await { - Ok(mut response_stream) => { - // Server-Sent Event stream - while let Some(response) = response_stream.next().await { - match response { - Ok(response) => { - match response { - // Prefill is ignored - InferStreamResponse::Prefill(_) => {} - // Yield event for every new token - InferStreamResponse::Token(token) => { - // StreamResponse - let stream_token = StreamResponse { + let best_of = req.0.parameters.best_of.unwrap_or(1); + if best_of == 1 { + match infer.generate_stream(req.0).instrument(info_span!(parent: &span, "async_stream")).await { + Ok(mut response_stream) => { + // Server-Sent Event stream + while let Some(response) = response_stream.next().await { + match response { + Ok(response) => { + match response { + // Prefill is ignored + InferStreamResponse::Prefill(_) => {} + // Yield event for every new token + InferStreamResponse::Token(token) => { + // StreamResponse + let stream_token = StreamResponse { + token, + generated_text: None, + details: None, + }; + + yield Ok(Event::default().json_data(stream_token).unwrap()) + } + // Yield event for last token and compute timings + InferStreamResponse::End { token, - generated_text: None, - details: None, - }; + generated_text, + start, + queued, + } => { + // Token details + let details = match details { + true => Some(StreamDetails { + finish_reason: FinishReason::from(generated_text.finish_reason), + generated_tokens: generated_text.generated_tokens, + seed: generated_text.seed, + }), + false => None, + }; - yield Ok(Event::default().json_data(stream_token).unwrap()) - } - // Yield event for last token and compute timings - InferStreamResponse::End { - token, - generated_text, - start, - queued, - } => { - // Token details - let details = match details { - true => Some(StreamDetails { - finish_reason: FinishReason::from(generated_text.finish_reason), - generated_tokens: generated_text.generated_tokens, - seed: generated_text.seed, - }), - false => None, - }; + // Timings + let total_time = start_time.elapsed(); + let validation_time = queued - start_time; + let queue_time = start - queued; + let inference_time = Instant::now() - start; + let time_per_token = inference_time / generated_text.generated_tokens; - // Timings - let total_time = start_time.elapsed(); - let validation_time = queued - start_time; - let queue_time = start - queued; - let inference_time = Instant::now() - start; - let time_per_token = 
inference_time / generated_text.generated_tokens; + // Tracing metadata + span.record("total_time", format!("{total_time:?}")); + span.record("validation_time", format!("{validation_time:?}")); + span.record("queue_time", format!("{queue_time:?}")); + span.record("inference_time", format!("{inference_time:?}")); + span.record("time_per_token", format!("{time_per_token:?}")); + span.record("seed", format!("{:?}", generated_text.seed)); + tracing::info!(parent: &span, "Output: {}", generated_text.text); - // Tracing metadata - span.record("total_time", format!("{:?}", total_time)); - span.record("validation_time", format!("{:?}", validation_time)); - span.record("queue_time", format!("{:?}", queue_time)); - span.record("inference_time", format!("{:?}", inference_time)); - span.record("time_per_token", format!("{:?}", time_per_token)); - span.record("seed", format!("{:?}", generated_text.seed)); - tracing::info!(parent: &span, "Output: {}", generated_text.text); + // Metrics + metrics::increment_counter!("tgi_request_success"); + metrics::histogram!("tgi_request_duration", total_time); + metrics::histogram!("tgi_request_validation_duration", validation_time); + metrics::histogram!("tgi_request_queue_duration", queue_time); + metrics::histogram!("tgi_request_inference_duration", inference_time); + metrics::histogram!("tgi_request_mean_time_per_token_duration", time_per_token); + metrics::histogram!("tgi_request_generated_tokens", generated_text.generated_tokens as f64); - // StreamResponse - end_reached = true; - let stream_token = StreamResponse { - token, - generated_text: Some(generated_text.text), - details - }; + // StreamResponse + end_reached = true; - yield Ok(Event::default().json_data(stream_token).unwrap()) + let mut output_text = generated_text.text; + if let Some(prompt) = add_prompt { + output_text = prompt + &output_text; + } + + let stream_token = StreamResponse { + token, + generated_text: Some(output_text), + details + }; + + yield Ok(Event::default().json_data(stream_token).unwrap()); + break; + } } } - } - // yield error - Err(err) => { - error = true; - yield Ok(Event::from(err)) + // yield error + Err(err) => { + error = true; + yield Ok(Event::from(err)); + break; + } } } + }, + // yield error + Err(err) => { + error = true; + yield Ok(Event::from(err)); } - }, - // yield error - Err(err) => { - error = true; - yield Ok(Event::from(err)) } - } - // Check if generation reached the end - // Skip if we already sent an error - if !end_reached && !error { - let err = InferError::IncompleteGeneration; + // Check if generation reached the end + // Skip if we already sent an error + if !end_reached && !error { + let err = InferError::IncompleteGeneration; + metrics::increment_counter!("tgi_request_failure", "err" => "incomplete"); + tracing::error!("{err}"); + yield Ok(Event::from(err)); + } + } else { + let err = InferError::from(ValidationError::BestOfStream); + metrics::increment_counter!("tgi_request_failure", "err" => "validation"); tracing::error!("{err}"); - yield Ok(Event::from(err)) + yield Ok(Event::from(err)); } }; - Sse::new(stream).keep_alive(KeepAlive::default()) + (headers, Sse::new(stream).keep_alive(KeepAlive::default())) +} + +/// Prometheus metrics scrape endpoint +#[utoipa::path( + get, + tag = "Text Generation Inference", + path = "/metrics", + responses((status = 200, description = "Prometheus Metrics", body = String)) +)] +async fn metrics(prom_handle: Extension<PrometheusHandle>) -> String { + prom_handle.render() } /// Serving method #[allow(clippy::too_many_arguments)] 
pub async fn run( + compat_return_full_text: bool, max_concurrent_requests: usize, + max_best_of: usize, + max_stop_sequences: usize, max_input_length: usize, + max_total_tokens: usize, max_batch_size: usize, max_waiting_tokens: usize, client: ShardedClient, tokenizer: Tokenizer, validation_workers: usize, addr: SocketAddr, + allow_origin: Option<AllowOrigin>, ) { // OpenAPI documentation #[derive(OpenApi)] @@ -305,13 +466,16 @@ pub async fn run( paths( generate, generate_stream, + metrics, ), components( schemas( GenerateRequest, GenerateParameters, + PrefillToken, Token, GenerateResponse, + BestOfSequence, Details, FinishReason, StreamResponse, @@ -333,7 +497,14 @@ pub async fn run( struct ApiDoc; // Create state - let validation = Validation::new(validation_workers, tokenizer, max_input_length); + let validation = Validation::new( + validation_workers, + tokenizer, + max_best_of, + max_stop_sequences, + max_input_length, + max_total_tokens, + ); let infer = Infer::new( client, validation, @@ -342,16 +513,33 @@ pub async fn run( max_concurrent_requests, ); + // Prometheus handler + let builder = PrometheusBuilder::new(); + let prom_handle = builder + .install_recorder() + .expect("failed to install metrics recorder"); + + // CORS layer + let allow_origin = allow_origin.unwrap_or(AllowOrigin::any()); + let cors_layer = CorsLayer::new() + .allow_methods([Method::GET, Method::POST]) + .allow_headers([http::header::CONTENT_TYPE]) + .allow_origin(allow_origin); + // Create router let app = Router::new() .merge(SwaggerUi::new("/docs").url("/api-doc/openapi.json", ApiDoc::openapi())) - .route("/", post(generate)) + .route("/", post(compat_generate)) .route("/generate", post(generate)) .route("/generate_stream", post(generate_stream)) .route("/", get(health)) .route("/health", get(health)) + .route("/metrics", get(metrics)) + .layer(Extension(compat_return_full_text)) .layer(Extension(infer)) - .layer(opentelemetry_tracing_layer()); + .layer(Extension(prom_handle)) + .layer(opentelemetry_tracing_layer()) + .layer(cors_layer); // Run server axum::Server::bind(&addr) @@ -415,6 +603,7 @@ impl From<InferError> for (StatusCode, Json<ErrorResponse>) { status_code, Json(ErrorResponse { error: err.to_string(), + error_type: err.error_type().to_string(), }), ) } @@ -425,6 +614,7 @@ impl From<InferError> for Event { Event::default() .json_data(ErrorResponse { error: err.to_string(), + error_type: err.error_type().to_string(), }) .unwrap() } diff --git a/router/src/validation.rs b/router/src/validation.rs index aa1c1d23..1c350caa 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -1,3 +1,4 @@ +use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput}; /// Payload validation logic use crate::{GenerateParameters, GenerateRequest}; use rand::rngs::ThreadRng; @@ -5,33 +6,44 @@ use rand::Rng; use text_generation_client::{NextTokenChooserParameters, StoppingCriteriaParameters}; use thiserror::Error; use tokenizers::tokenizer::Tokenizer; +use tokenizers::TruncationDirection; use tokio::sync::{mpsc, oneshot}; use tracing::{instrument, Span}; -const MAX_MAX_NEW_TOKENS: u32 = 512; -const MAX_STOP_SEQUENCES: usize = 4; - /// Validation #[derive(Debug, Clone)] pub struct Validation { + /// maximum value for the best_of parameter + #[allow(dead_code)] + max_best_of: usize, /// Channel to communicate with the background validation task - sender: mpsc::Sender<ValidationRequest>, + sender: mpsc::UnboundedSender<ValidationRequest>, } impl Validation { - pub(crate) fn new(workers: usize, tokenizer: Tokenizer, max_input_length: usize) -> Self { + pub(crate) fn new( + 
workers: usize, + tokenizer: Tokenizer, + max_best_of: usize, + max_stop_sequences: usize, + max_input_length: usize, + max_total_tokens: usize, + ) -> Self { // Create channel - let (validation_sender, validation_receiver) = mpsc::channel(128); + let (validation_sender, validation_receiver) = mpsc::unbounded_channel(); // Launch background validation task tokio::spawn(validation_task( workers, tokenizer, + max_stop_sequences, max_input_length, + max_total_tokens, validation_receiver, )); Self { + max_best_of, sender: validation_sender, } } @@ -48,12 +60,25 @@ impl Validation { // Unwrap is safe here self.sender .send((request, sender, Span::current())) - .await .unwrap(); // Await on response channel // Unwrap is safe here receiver.await.unwrap() } + + /// Validate the best_of parameter + #[instrument(skip_all)] + pub(crate) fn validate_best_of(&self, best_of: usize) -> Result<usize, ValidationError> { + if self.max_best_of == 1 && best_of != 1 { + return Err(ValidationError::BestOfDisabled); + } + + if best_of > self.max_best_of { + return Err(ValidationError::BestOf(self.max_best_of, best_of)); + } + + Ok(best_of) + } } /// Validation task @@ -61,8 +86,10 @@ impl Validation { async fn validation_task( workers: usize, tokenizer: Tokenizer, + max_stop_sequences: usize, max_input_length: usize, - mut receiver: mpsc::Receiver<ValidationRequest>, + max_total_tokens: usize, + mut receiver: mpsc::UnboundedReceiver<ValidationRequest>, ) { let mut workers_senders = Vec::with_capacity(workers); @@ -75,7 +102,13 @@ async fn validation_task( // Spawn worker tokio::task::spawn_blocking(move || { - validation_worker(tokenizer_clone, max_input_length, worker_receiver) + validation_worker( + tokenizer_clone, + max_stop_sequences, + max_input_length, + max_total_tokens, + worker_receiver, + ) }); } @@ -95,7 +128,9 @@ async fn validation_task( /// the tokenizer fn validation_worker( tokenizer: Tokenizer, + max_stop_sequences: usize, max_input_length: usize, + max_total_tokens: usize, mut receiver: mpsc::Receiver<ValidationRequest>, ) { // Seed rng @@ -106,7 +141,16 @@ fn validation_worker( parent_span.in_scope(|| { response_tx .send( - validate(request, &tokenizer, max_input_length, &mut rng).map_err(|err| { + validate( + request, + &tokenizer, + max_stop_sequences, + max_input_length, + max_total_tokens, + &mut rng, + ) + .map_err(|err| { + metrics::increment_counter!("tgi_request_failure", "err" => "validation"); tracing::error!("{err}"); err }), @@ -119,21 +163,39 @@ fn validation_worker( fn validate( request: GenerateRequest, tokenizer: &Tokenizer, + max_stop_sequences: usize, max_input_length: usize, + max_total_tokens: usize, rng: &mut ThreadRng, ) -> Result<ValidGenerateRequest, ValidationError> { let GenerateParameters { + best_of, temperature, repetition_penalty, top_k, top_p, + typical_p, do_sample, max_new_tokens, stop: stop_sequences, + truncate, seed, + watermark, .. 
} = request.parameters; + // sampling must be true when best_of > 1 + let best_of = best_of.unwrap_or(1); + let sampling = do_sample + || temperature.is_some() + || top_k.is_some() + || top_p.is_some() + || typical_p.is_some(); + + if best_of > 1 && !sampling { + return Err(BestOfSampling); + } + let temperature = temperature.unwrap_or(1.0); if temperature <= 0.0 { return Err(ValidationError::Temperature); @@ -144,30 +206,42 @@ fn validate( return Err(ValidationError::RepetitionPenalty); } - let top_p = top_p.unwrap_or(1.0); - if top_p <= 0.0 || top_p > 1.0 { - return Err(ValidationError::TopP); - } - - // Different because the proto default value is 0 while it is not a valid value + // Different because the proto default value is not a valid value // for the user - let top_k: u32 = match top_k { - None => Ok(0), - Some(top_k) => { - if top_k <= 0 { + let top_p = top_p + .map(|value| { + if value <= 0.0 || value >= 1.0 { + return Err(ValidationError::TopP); + } + Ok(value) + }) + .unwrap_or(Ok(1.0))?; + + let typical_p = typical_p + .map(|value| { + if value <= 0.0 || value >= 1.0 { + return Err(ValidationError::TypicalP); + } + Ok(value) + }) + .unwrap_or(Ok(1.0))?; + + let top_k: u32 = top_k + .map(|value| { + if value <= 0 { return Err(ValidationError::TopK); } - Ok(top_k as u32) - } - }?; + Ok(value as u32) + }) + .unwrap_or(Ok(0))?; - if max_new_tokens == 0 || max_new_tokens > MAX_MAX_NEW_TOKENS { - return Err(ValidationError::MaxNewTokens(MAX_MAX_NEW_TOKENS)); + if max_new_tokens == 0 { + return Err(ValidationError::MaxNewTokens); } - if stop_sequences.len() > MAX_STOP_SEQUENCES { + if stop_sequences.len() > max_stop_sequences { return Err(ValidationError::StopSequence( - MAX_STOP_SEQUENCES, + max_stop_sequences, stop_sequences.len(), )); } @@ -175,41 +249,82 @@ fn validate( // If seed is None, assign a random one let seed = match seed { None => rng.gen(), - Some(seed) => seed, + Some(seed) => { + if best_of > 1 { + return Err(BestOfSeed); + } + seed + } }; - // Get the number of tokens in the input - match tokenizer.encode(request.inputs.clone(), true) { - Ok(encoding) => { - let input_length = encoding.len(); - - if input_length > max_input_length { - Err(ValidationError::InputLength(input_length, max_input_length)) - } else { - // Return ValidGenerateRequest - let parameters = NextTokenChooserParameters { - temperature, - repetition_penalty, - top_k, - top_p, - do_sample, - seed, - }; - let stopping_parameters = StoppingCriteriaParameters { - max_new_tokens, - stop_sequences, - }; - - Ok(ValidGenerateRequest { - inputs: request.inputs, - input_length: input_length as u32, - parameters, - stopping_parameters, - }) - } - } - Err(err) => Err(ValidationError::Tokenizer(err.to_string())), + // Check if inputs is empty + if request.inputs.is_empty() { + return Err(EmptyInput); } + + // Check if truncate is strictly positive and less than max_input_length + let truncate = truncate + .map(|value| { + if value == 0 || value > max_input_length { + return Err(ValidationError::Truncate(max_input_length, value)); + } + Ok(Some(value)) + }) + .unwrap_or(Ok(None))?; + + // Get the number of tokens in the input + let mut encoding = tokenizer + .encode(request.inputs.clone(), true) + .map_err(|err| ValidationError::Tokenizer(err.to_string()))?; + + let (inputs, input_length) = if let Some(truncate) = truncate { + // truncate encoding and decode new inputs + encoding.truncate(truncate, 0, TruncationDirection::Left); + let inputs = tokenizer + .decode(Vec::from(encoding.get_ids()), false) + 
.map_err(|err| ValidationError::Tokenizer(err.to_string()))?; + (inputs, encoding.len()) + } else { + (request.inputs, encoding.len()) + }; + + if input_length > max_input_length { + return Err(ValidationError::InputLength(max_input_length, input_length)); + } + + let total_tokens = input_length + max_new_tokens as usize; + if total_tokens > max_total_tokens { + return Err(ValidationError::MaxTotalTokens( + max_total_tokens, + input_length, + max_new_tokens, + )); + } + + // Return ValidGenerateRequest + let parameters = NextTokenChooserParameters { + temperature, + repetition_penalty, + top_k, + top_p, + typical_p, + do_sample, + seed, + watermark, + }; + let stopping_parameters = StoppingCriteriaParameters { + max_new_tokens, + stop_sequences, + }; + + metrics::histogram!("tgi_request_input_length", input_length as f64); + metrics::histogram!("tgi_request_max_new_tokens", max_new_tokens as f64); + + Ok(ValidGenerateRequest { + inputs, + parameters, + stopping_parameters, + }) } type ValidationRequest = ( @@ -221,26 +336,43 @@ type ValidationRequest = ( #[derive(Debug)] pub(crate) struct ValidGenerateRequest { pub inputs: String, - pub input_length: u32, pub parameters: NextTokenChooserParameters, pub stopping_parameters: StoppingCriteriaParameters, } #[derive(Error, Debug)] pub enum ValidationError { - #[error("temperature must be strictly positive")] + #[error("`best_of` must be > 0 and <= {0}. Given: {1}")] + BestOf(usize, usize), + #[error("`best_of` != 1 is not allowed for this endpoint")] + BestOfDisabled, + #[error("you must use sampling when `best_of` is > 1")] + BestOfSampling, + #[error("`seed` must not be set when `best_of` > 1")] + BestOfSeed, + #[error("`best_of` != 1 is not supported when streaming tokens")] + BestOfStream, + #[error("`temperature` must be strictly positive")] Temperature, - #[error("repetition_penalty must be strictly positive")] + #[error("`repetition_penalty` must be strictly positive")] RepetitionPenalty, - #[error("top_p must be > 0.0 and <= 1.0")] + #[error("`top_p` must be > 0.0 and < 1.0")] TopP, - #[error("top_k must be strictly positive")] + #[error("`top_k` must be strictly positive")] TopK, - #[error("max_new_tokens must be strictly positive and <= {0}")] - MaxNewTokens(u32), - #[error("inputs must have less than {1} tokens. Given: {0}")] + #[error("`truncate` must be strictly positive and less than {0}. Given: {1}")] + Truncate(usize, usize), + #[error("`typical_p` must be > 0.0 and < 1.0")] + TypicalP, + #[error("`max_new_tokens` must be strictly positive")] + MaxNewTokens, + #[error("`inputs` tokens + `max_new_tokens` must be <= {0}. Given: {1} `inputs` tokens and {2} `max_new_tokens`")] + MaxTotalTokens(usize, usize, u32), + #[error("`inputs` must have less than {0} tokens. Given: {1}")] InputLength(usize, usize), - #[error("stop supports up to {0} stop sequences. Given: {1}")] + #[error("`inputs` cannot be empty")] + EmptyInput, + #[error("`stop` supports up to {0} stop sequences. 
Given: {1}")] StopSequence(usize, usize), #[error("tokenizer error {0}")] Tokenizer(String), diff --git a/server/.gitignore b/server/.gitignore index 5758ba92..aef74bb4 100644 --- a/server/.gitignore +++ b/server/.gitignore @@ -1,7 +1,7 @@ # Byte-compiled / optimized / DLL files __pycache__/ -text_generation/__pycache__/ -text_generation/pb/__pycache__/ +text_generation_server/__pycache__/ +text_generation_server/pb/__pycache__/ *.py[cod] *$py.class diff --git a/server/Makefile b/server/Makefile index 74ce5144..e8b0364e 100644 --- a/server/Makefile +++ b/server/Makefile @@ -1,20 +1,22 @@ +transformers_commit := 2b57aa18da658e7d2f42ef6bd5b56751af582fef + gen-server: # Compile protos pip install grpcio-tools==1.51.1 --no-cache-dir - mkdir text_generation/pb || true - python -m grpc_tools.protoc -I../proto --python_out=text_generation/pb --grpc_python_out=text_generation/pb ../proto/generate.proto - find text_generation/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \; - touch text_generation/pb/__init__.py + mkdir text_generation_server/pb || true + python -m grpc_tools.protoc -I../proto --python_out=text_generation_server/pb --grpc_python_out=text_generation_server/pb ../proto/generate.proto + find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \; + touch text_generation_server/pb/__init__.py install-transformers: # Install specific version of transformers with custom cuda kernels pip uninstall transformers -y || true rm -rf transformers || true - rm -rf transformers-text_generation_inference || true - curl -L -O https://github.com/OlivierDehaene/transformers/archive/refs/heads/text_generation_inference.zip - unzip text_generation_inference.zip - rm text_generation_inference.zip - mv transformers-text_generation_inference transformers + rm -rf transformers-$(transformers_commit) || true + curl -L -O https://github.com/OlivierDehaene/transformers/archive/$(transformers_commit).zip + unzip $(transformers_commit).zip + rm $(transformers_commit).zip + mv transformers-$(transformers_commit) transformers cd transformers && python setup.py install install-torch: @@ -26,4 +28,4 @@ install: gen-server install-torch install-transformers pip install -e . 
--no-cache-dir run-dev: - SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation/cli.py serve bigscience/bloom-560m --sharded \ No newline at end of file + SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded \ No newline at end of file diff --git a/server/poetry.lock b/server/poetry.lock index 0e0655cb..89ad31e9 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -145,30 +145,30 @@ testing = ["protobuf (>=3.6.0)"] [[package]] name = "grpcio" -version = "1.51.1" +version = "1.51.3" description = "HTTP/2-based RPC framework" category = "main" optional = false python-versions = ">=3.7" [package.extras] -protobuf = ["grpcio-tools (>=1.51.1)"] +protobuf = ["grpcio-tools (>=1.51.3)"] [[package]] name = "grpcio-reflection" -version = "1.51.1" +version = "1.51.3" description = "Standard Protobuf Reflection Service for gRPC" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -grpcio = ">=1.51.1" +grpcio = ">=1.51.3" protobuf = ">=4.21.6" [[package]] name = "grpcio-status" -version = "1.51.1" +version = "1.51.3" description = "Status proto mapping for gRPC" category = "main" optional = false @@ -176,22 +176,30 @@ python-versions = ">=3.6" [package.dependencies] googleapis-common-protos = ">=1.5.5" -grpcio = ">=1.51.1" +grpcio = ">=1.51.3" protobuf = ">=4.21.6" [[package]] name = "grpcio-tools" -version = "1.51.1" +version = "1.51.3" description = "Protobuf code generator for gRPC" category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -grpcio = ">=1.51.1" +grpcio = ">=1.51.3" protobuf = ">=4.21.6,<5.0dev" setuptools = "*" +[[package]] +name = "hf-transfer" +version = "0.1.2" +description = "" +category = "main" +optional = false +python-versions = ">=3.7" + [[package]] name = "idna" version = "3.4" @@ -428,7 +436,7 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "protobuf" -version = "4.21.12" +version = "4.22.0" description = "" category = "main" optional = false @@ -511,7 +519,7 @@ torch = ["torch"] [[package]] name = "setuptools" -version = "67.2.0" +version = "67.4.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "main" optional = false @@ -567,7 +575,7 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6. [[package]] name = "typing-extensions" -version = "4.4.0" +version = "4.5.0" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false @@ -610,7 +618,7 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] [[package]] name = "wrapt" -version = "1.14.1" +version = "1.15.0" description = "Module for decorators, wrappers and monkey patching." 
category = "main" optional = false @@ -622,7 +630,7 @@ bnb = ["bitsandbytes"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "f3cab6881b52045770a90ec9be7415a0ee499d9e980892d544f68073700cf321" +content-hash = "521dc9f3c283dc56f7d2e2f96759919ff27ab49ffd3ae7cd26317b209e7fa98d" [metadata.files] accelerate = [ @@ -760,106 +768,127 @@ grpc-interceptor = [ {file = "grpc_interceptor-0.15.0-py3-none-any.whl", hash = "sha256:63e390162e64df96c39c40508eb697def76a7cafac32a7eaf9272093eec1109e"}, ] grpcio = [ - {file = "grpcio-1.51.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:cc2bece1737b44d878cc1510ea04469a8073dbbcdd762175168937ae4742dfb3"}, - {file = "grpcio-1.51.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:e223a9793522680beae44671b9ed8f6d25bbe5ddf8887e66aebad5e0686049ef"}, - {file = "grpcio-1.51.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:24ac1154c4b2ab4a0c5326a76161547e70664cd2c39ba75f00fc8a2170964ea2"}, - {file = "grpcio-1.51.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4ef09f8997c4be5f3504cefa6b5c6cc3cf648274ce3cede84d4342a35d76db6"}, - {file = "grpcio-1.51.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8a0b77e992c64880e6efbe0086fe54dfc0bbd56f72a92d9e48264dcd2a3db98"}, - {file = "grpcio-1.51.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:eacad297ea60c72dd280d3353d93fb1dcca952ec11de6bb3c49d12a572ba31dd"}, - {file = "grpcio-1.51.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:16c71740640ba3a882f50b01bf58154681d44b51f09a5728180a8fdc66c67bd5"}, - {file = "grpcio-1.51.1-cp310-cp310-win32.whl", hash = "sha256:29cb97d41a4ead83b7bcad23bdb25bdd170b1e2cba16db6d3acbb090bc2de43c"}, - {file = "grpcio-1.51.1-cp310-cp310-win_amd64.whl", hash = "sha256:9ff42c5620b4e4530609e11afefa4a62ca91fa0abb045a8957e509ef84e54d30"}, - {file = "grpcio-1.51.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:bc59f7ba87972ab236f8669d8ca7400f02a0eadf273ca00e02af64d588046f02"}, - {file = "grpcio-1.51.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:3c2b3842dcf870912da31a503454a33a697392f60c5e2697c91d133130c2c85d"}, - {file = "grpcio-1.51.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22b011674090594f1f3245960ced7386f6af35485a38901f8afee8ad01541dbd"}, - {file = "grpcio-1.51.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49d680356a975d9c66a678eb2dde192d5dc427a7994fb977363634e781614f7c"}, - {file = "grpcio-1.51.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:094e64236253590d9d4075665c77b329d707b6fca864dd62b144255e199b4f87"}, - {file = "grpcio-1.51.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:257478300735ce3c98d65a930bbda3db172bd4e00968ba743e6a1154ea6edf10"}, - {file = "grpcio-1.51.1-cp311-cp311-win32.whl", hash = "sha256:5a6ebcdef0ef12005d56d38be30f5156d1cb3373b52e96f147f4a24b0ddb3a9d"}, - {file = "grpcio-1.51.1-cp311-cp311-win_amd64.whl", hash = "sha256:3f9b0023c2c92bebd1be72cdfca23004ea748be1813a66d684d49d67d836adde"}, - {file = "grpcio-1.51.1-cp37-cp37m-linux_armv7l.whl", hash = "sha256:cd3baccea2bc5c38aeb14e5b00167bd4e2373a373a5e4d8d850bd193edad150c"}, - {file = "grpcio-1.51.1-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:17ec9b13cec4a286b9e606b48191e560ca2f3bbdf3986f91e480a95d1582e1a7"}, - {file = "grpcio-1.51.1-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:fbdbe9a849854fe484c00823f45b7baab159bdd4a46075302281998cb8719df5"}, - {file = "grpcio-1.51.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:31bb6bc7ff145e2771c9baf612f4b9ebbc9605ccdc5f3ff3d5553de7fc0e0d79"}, - {file = "grpcio-1.51.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e473525c28251558337b5c1ad3fa969511e42304524a4e404065e165b084c9e4"}, - {file = "grpcio-1.51.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6f0b89967ee11f2b654c23b27086d88ad7bf08c0b3c2a280362f28c3698b2896"}, - {file = "grpcio-1.51.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7942b32a291421460d6a07883033e392167d30724aa84987e6956cd15f1a21b9"}, - {file = "grpcio-1.51.1-cp37-cp37m-win32.whl", hash = "sha256:f96ace1540223f26fbe7c4ebbf8a98e3929a6aa0290c8033d12526847b291c0f"}, - {file = "grpcio-1.51.1-cp37-cp37m-win_amd64.whl", hash = "sha256:f1fec3abaf274cdb85bf3878167cfde5ad4a4d97c68421afda95174de85ba813"}, - {file = "grpcio-1.51.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:0e1a9e1b4a23808f1132aa35f968cd8e659f60af3ffd6fb00bcf9a65e7db279f"}, - {file = "grpcio-1.51.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:6df3b63538c362312bc5fa95fb965069c65c3ea91d7ce78ad9c47cab57226f54"}, - {file = "grpcio-1.51.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:172405ca6bdfedd6054c74c62085946e45ad4d9cec9f3c42b4c9a02546c4c7e9"}, - {file = "grpcio-1.51.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:506b9b7a4cede87d7219bfb31014d7b471cfc77157da9e820a737ec1ea4b0663"}, - {file = "grpcio-1.51.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fb93051331acbb75b49a2a0fd9239c6ba9528f6bdc1dd400ad1cb66cf864292"}, - {file = "grpcio-1.51.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5dca372268c6ab6372d37d6b9f9343e7e5b4bc09779f819f9470cd88b2ece3c3"}, - {file = "grpcio-1.51.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:471d39d3370ca923a316d49c8aac66356cea708a11e647e3bdc3d0b5de4f0a40"}, - {file = "grpcio-1.51.1-cp38-cp38-win32.whl", hash = "sha256:75e29a90dc319f0ad4d87ba6d20083615a00d8276b51512e04ad7452b5c23b04"}, - {file = "grpcio-1.51.1-cp38-cp38-win_amd64.whl", hash = "sha256:f1158bccbb919da42544a4d3af5d9296a3358539ffa01018307337365a9a0c64"}, - {file = "grpcio-1.51.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:59dffade859f157bcc55243714d57b286da6ae16469bf1ac0614d281b5f49b67"}, - {file = "grpcio-1.51.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:dad6533411d033b77f5369eafe87af8583178efd4039c41d7515d3336c53b4f1"}, - {file = "grpcio-1.51.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:4c4423ea38a7825b8fed8934d6d9aeebdf646c97e3c608c3b0bcf23616f33877"}, - {file = "grpcio-1.51.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0dc5354e38e5adf2498312f7241b14c7ce3484eefa0082db4297189dcbe272e6"}, - {file = "grpcio-1.51.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97d67983189e2e45550eac194d6234fc38b8c3b5396c153821f2d906ed46e0ce"}, - {file = "grpcio-1.51.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:538d981818e49b6ed1e9c8d5e5adf29f71c4e334e7d459bf47e9b7abb3c30e09"}, - {file = "grpcio-1.51.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9235dcd5144a83f9ca6f431bd0eccc46b90e2c22fe27b7f7d77cabb2fb515595"}, - {file = "grpcio-1.51.1-cp39-cp39-win32.whl", hash = "sha256:aacb54f7789ede5cbf1d007637f792d3e87f1c9841f57dd51abf89337d1b8472"}, - {file = "grpcio-1.51.1-cp39-cp39-win_amd64.whl", hash = "sha256:2b170eaf51518275c9b6b22ccb59450537c5a8555326fd96ff7391b5dd75303c"}, - {file = "grpcio-1.51.1.tar.gz", hash = "sha256:e6dfc2b6567b1c261739b43d9c59d201c1b89e017afd9e684d85aa7a186c9f7a"}, + {file = 
"grpcio-1.51.3-cp310-cp310-linux_armv7l.whl", hash = "sha256:f601aaeae18dab81930fb8d4f916b0da21e89bb4b5f7367ef793f46b4a76b7b0"}, + {file = "grpcio-1.51.3-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:eef0450a4b5ed11feab639bf3eb1b6e23d0efa9b911bf7b06fb60e14f5f8a585"}, + {file = "grpcio-1.51.3-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:82b0ad8ac825d4bb31bff9f638557c045f4a6d824d84b21e893968286f88246b"}, + {file = "grpcio-1.51.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3667c06e37d6cd461afdd51cefe6537702f3d1dc5ff4cac07e88d8b4795dc16f"}, + {file = "grpcio-1.51.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3709048fe0aa23dda09b3e69849a12055790171dab9e399a72ea8f9dfbf9ac80"}, + {file = "grpcio-1.51.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:200d69857f9910f7458b39b9bcf83ee4a180591b40146ba9e49314e3a7419313"}, + {file = "grpcio-1.51.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cd9a5e68e79c5f031500e67793048a90209711e0854a9ddee8a3ce51728de4e5"}, + {file = "grpcio-1.51.3-cp310-cp310-win32.whl", hash = "sha256:6604f614016127ae10969176bbf12eb0e03d2fb3d643f050b3b69e160d144fb4"}, + {file = "grpcio-1.51.3-cp310-cp310-win_amd64.whl", hash = "sha256:e95c7ccd4c5807adef1602005513bf7c7d14e5a41daebcf9d8d30d8bf51b8f81"}, + {file = "grpcio-1.51.3-cp311-cp311-linux_armv7l.whl", hash = "sha256:5e77ee138100f0bb55cbd147840f87ee6241dbd25f09ea7cd8afe7efff323449"}, + {file = "grpcio-1.51.3-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:68a7514b754e38e8de9075f7bb4dee919919515ec68628c43a894027e40ddec4"}, + {file = "grpcio-1.51.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c1b9f8afa62ff265d86a4747a2990ec5a96e4efce5d5888f245a682d66eca47"}, + {file = "grpcio-1.51.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8de30f0b417744288cec65ec8cf84b8a57995cf7f1e84ccad2704d93f05d0aae"}, + {file = "grpcio-1.51.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b69c7adc7ed60da1cb1b502853db61f453fc745f940cbcc25eb97c99965d8f41"}, + {file = "grpcio-1.51.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d81528ffe0e973dc840ec73a4132fd18b8203ad129d7410155d951a0a7e4f5d0"}, + {file = "grpcio-1.51.3-cp311-cp311-win32.whl", hash = "sha256:040eb421613b57c696063abde405916dd830203c184c9000fc8c3b3b3c950325"}, + {file = "grpcio-1.51.3-cp311-cp311-win_amd64.whl", hash = "sha256:2a8e17286c4240137d933b8ca506465472248b4ce0fe46f3404459e708b65b68"}, + {file = "grpcio-1.51.3-cp37-cp37m-linux_armv7l.whl", hash = "sha256:d5cd1389669a847555df54177b911d9ff6f17345b2a6f19388707b7a9f724c88"}, + {file = "grpcio-1.51.3-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:be1bf35ce82cdbcac14e39d5102d8de4079a1c1a6a06b68e41fcd9ef64f9dd28"}, + {file = "grpcio-1.51.3-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:5eed34994c095e2bf7194ffac7381c6068b057ef1e69f8f08db77771350a7566"}, + {file = "grpcio-1.51.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f9a7d88082b2a17ae7bd3c2354d13bab0453899e0851733f6afa6918373f476"}, + {file = "grpcio-1.51.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c8abbc5f837111e7bd619612eedc223c290b0903b952ce0c7b00840ea70f14"}, + {file = "grpcio-1.51.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:165b05af77e6aecb4210ae7663e25acf234ba78a7c1c157fa5f2efeb0d6ec53c"}, + {file = "grpcio-1.51.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = 
"sha256:54e36c2ee304ff15f2bfbdc43d2b56c63331c52d818c364e5b5214e5bc2ad9f6"}, + {file = "grpcio-1.51.3-cp37-cp37m-win32.whl", hash = "sha256:cd0daac21d9ef5e033a5100c1d3aa055bbed28bfcf070b12d8058045c4e821b1"}, + {file = "grpcio-1.51.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2fdd6333ce96435408565a9dbbd446212cd5d62e4d26f6a3c0feb1e3c35f1cc8"}, + {file = "grpcio-1.51.3-cp38-cp38-linux_armv7l.whl", hash = "sha256:54b0c29bdd9a3b1e1b61443ab152f060fc719f1c083127ab08d03fac5efd51be"}, + {file = "grpcio-1.51.3-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:ffaaf7e93fcb437356b5a4b23bf36e8a3d0221399ff77fd057e4bc77776a24be"}, + {file = "grpcio-1.51.3-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:eafbe7501a3268d05f2e450e1ddaffb950d842a8620c13ec328b501d25d2e2c3"}, + {file = "grpcio-1.51.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:881ecb34feabf31c6b3b9bbbddd1a5b57e69f805041e5a2c6c562a28574f71c4"}, + {file = "grpcio-1.51.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e860a3222139b41d430939bbec2ec9c3f6c740938bf7a04471a9a8caaa965a2e"}, + {file = "grpcio-1.51.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:49ede0528e9dac7e8a9fe30b16c73b630ddd9a576bf4b675eb6b0c53ee5ca00f"}, + {file = "grpcio-1.51.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6972b009638b40a448d10e1bc18e2223143b8a7aa20d7def0d78dd4af4126d12"}, + {file = "grpcio-1.51.3-cp38-cp38-win32.whl", hash = "sha256:5694448256e3cdfe5bd358f1574a3f2f51afa20cc834713c4b9788d60b7cc646"}, + {file = "grpcio-1.51.3-cp38-cp38-win_amd64.whl", hash = "sha256:3ea4341efe603b049e8c9a5f13c696ca37fcdf8a23ca35f650428ad3606381d9"}, + {file = "grpcio-1.51.3-cp39-cp39-linux_armv7l.whl", hash = "sha256:6c677581ce129f5fa228b8f418cee10bd28dd449f3a544ea73c8ba590ee49d0b"}, + {file = "grpcio-1.51.3-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:30e09b5e0531685e176f49679b6a3b190762cc225f4565e55a899f5e14b3aa62"}, + {file = "grpcio-1.51.3-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:c831f31336e81243f85b6daff3e5e8a123302ce0ea1f2726ad752fd7a59f3aee"}, + {file = "grpcio-1.51.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2cd2e4cefb724cab1ba2df4b7535a9980531b9ec51b4dbb5f137a1f3a3754ef0"}, + {file = "grpcio-1.51.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7a0d0bf44438869d307f85a54f25a896ad6b4b0ca12370f76892ad732928d87"}, + {file = "grpcio-1.51.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c02abd55409bfb293371554adf6a4401197ec2133dd97727c01180889014ba4d"}, + {file = "grpcio-1.51.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2f8ff75e61e1227ba7a3f16b2eadbcc11d0a54096d52ab75a6b88cfbe56f55d1"}, + {file = "grpcio-1.51.3-cp39-cp39-win32.whl", hash = "sha256:6c99a73a6260bdf844b2e5ddad02dcd530310f80e1fa72c300fa19c1c7496962"}, + {file = "grpcio-1.51.3-cp39-cp39-win_amd64.whl", hash = "sha256:22bdfac4f7f27acdd4da359b5e7e1973dc74bf1ed406729b07d0759fde2f064b"}, + {file = "grpcio-1.51.3.tar.gz", hash = "sha256:be7b2265b7527bb12109a7727581e274170766d5b3c9258d4e466f4872522d7a"}, ] grpcio-reflection = [ - {file = "grpcio-reflection-1.51.1.tar.gz", hash = "sha256:c07a93c0c36ef88fe475744289863b4787005eff4de0cc04213ecad718b01aae"}, - {file = "grpcio_reflection-1.51.1-py3-none-any.whl", hash = "sha256:b70af764a83e42a44f65df1edb232e972ab69e72bc7fbbad481e66c29a9d8cb8"}, + {file = "grpcio-reflection-1.51.3.tar.gz", hash = "sha256:5adca16f0a6cd403efa3b5f8f8a493eea6a37dee9473b178fad0a60efa68bc67"}, + {file = 
"grpcio_reflection-1.51.3-py3-none-any.whl", hash = "sha256:52b037f831908468afc89c60e591d0a2bbce24a393d908c44a6d53091e90fc41"}, ] grpcio-status = [ - {file = "grpcio-status-1.51.1.tar.gz", hash = "sha256:ac2617a3095935ebd785e2228958f24b10a0d527a0c9eb5a0863c784f648a816"}, - {file = "grpcio_status-1.51.1-py3-none-any.whl", hash = "sha256:a52cbdc4b18f325bfc13d319ae7c7ae7a0fee07f3d9a005504d6097896d7a495"}, + {file = "grpcio-status-1.51.3.tar.gz", hash = "sha256:71792c550356ba94e162c70818719ae6d67d960bdd03a9db5ff68faba2927f6c"}, + {file = "grpcio_status-1.51.3-py3-none-any.whl", hash = "sha256:d68d0956c16b6ea466f13c27075f126ef2cd8f0f97527d70056c64b0084357e3"}, ] grpcio-tools = [ - {file = "grpcio-tools-1.51.1.tar.gz", hash = "sha256:8e62d23d3fed9d4f81738f98dd193dbd2e21aed4a8f0dd715e75b5439e649727"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:ecf1494cb695afead36995534f787761ee33fb9e116b23030113a37fe6057a83"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:16b8b915625dc6eb2ea7efdfb06f1fae44a9066c9016453a2ca120c034f33090"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:d5e033c04b416afcddd5231b3ff94a34fb5d26fba2416eb940e69b05f22cfd25"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a218f64e667f3332b74080bdc5440aaf0fa6700ae07a0b54ecf085aaef2aa9f"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7b186183515ad6b8584ffe4bd820b72b00f6e7d121fb1c36294edeea9092313"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ccd37165d7a3e93f460096a2eb62b7a9c1ebe5c424eaee42d8e92740d0c8f6bc"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:531586c5598a99658249f3c5e92826d6d2bb117abd6ffc88527d1e1d9eaef924"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-win32.whl", hash = "sha256:392ad4cd004f7b843cf7d916d9a15b2d6585965bfef235be1c88d8f8649777e5"}, - {file = "grpcio_tools-1.51.1-cp310-cp310-win_amd64.whl", hash = "sha256:14e82c2b3ee7e300611c2c729d411b3b911e4cca5f4ec14787457a2fb72ff9d4"}, - {file = "grpcio_tools-1.51.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:2281180490c475d09b7aa05dabafa5e09de9902176931e7295113f636c2b5360"}, - {file = "grpcio_tools-1.51.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:c4649af7f5d9553975ee66b6bfae20a84be779f13e163fa835e782961895e63c"}, - {file = "grpcio_tools-1.51.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f06bb0753b7cecbff154b523cfb8f45dee2c31b0a4c72bed7da44c57f1cba113"}, - {file = "grpcio_tools-1.51.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a671466158ed74c07ee070fb940ed783acf59ba6e6e53cb4de8fd63819c6c7f"}, - {file = "grpcio_tools-1.51.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:048793747339f327ea091d8f022c6756d89713d8080dffde5ce7380cc348ea8e"}, - {file = "grpcio_tools-1.51.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f6caf36e7752728329a28f93afec7c4ec9015fc1c6e4460bd1eb0f3737e1c55a"}, - {file = "grpcio_tools-1.51.1-cp311-cp311-win32.whl", hash = "sha256:67b304282cad38642587ebae68617e450e1ad4fa1c0c8b19e9e30274dbb32716"}, - {file = "grpcio_tools-1.51.1-cp311-cp311-win_amd64.whl", hash = "sha256:674b340f2f7bb2adbc3f15144bd37ce5ea83239f78b68dbbd0ea3cba00107e2b"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-linux_armv7l.whl", hash = 
"sha256:055819992ddd30c642a7fd6f344a03747be3afa95cb910f8a2e5efaabd41cde5"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:4e3249a2ec435b3b972610c66c8a714c188844500d564c910f57a2771dc61978"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:794f26a09b70f4f101df5cf54c6c12dc1b65747ab1dee5bda02c2991389ade56"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4957f1ffa16598aa5379505fcbaeb47d65693a46b0817f4ee61db76707092aeb"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9906fb6bf6d9c30c23d85153f12d130f44325afe8f9ebe58aa7a6c82ecade9d8"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87bc5f3e3698c65907d397003c64d25c3ea84e3d6aa46dac133bd98bf66835ee"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a66b3a5d18a7615f0f828b72e2d2935751459c89cc4725e56bdfb3d2cd93281f"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-win32.whl", hash = "sha256:566809d9942e78821b279af70f3cf159a328127f9f3d5fee8d83ad8b2d27b2fe"}, - {file = "grpcio_tools-1.51.1-cp37-cp37m-win_amd64.whl", hash = "sha256:aab24a342642329de38139cb26f8492882ca0d8551bb87f6530bcc613945a0d0"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:6b83d7fc2597c6d392c225177d1fbbcff74900f8cc40b33236987fd1ff841330"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:79c06d2577cb4d977922bbf01234de3b20f73d1784d3cbe3179deee1bdb9a60b"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:e9abc03d67793b1bf33dc766caa69a3333f9db029869ba6e8fc6cd9c251c0080"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64d8ad369417759f5fdb8ffb7cbd6374fecc06ab51c9a226dee9bbd7d311c3b5"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de51a0a71845b854f6a5967756c893c96bd03e37f39e5dce87b4f409dac36ee2"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:9dfe6c12b0e2c07f6a4a91a9912ef4e5bd007672533891a44e6f433ffbf7c3b1"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:27113b354f7587684eb55125733e6e5be1f489458abfe12344dabd918d8dcc54"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-win32.whl", hash = "sha256:98777b5031f1b3c58b688815ffa83435c103b2152c26eb144f80f4a4bb34addb"}, - {file = "grpcio_tools-1.51.1-cp38-cp38-win_amd64.whl", hash = "sha256:1c44b57a6770b78a1eafe355878ff1ec59a2fa07455a2cbd522c071eedae04d4"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:49624394805568acd7d767dea5a00d970fca5ad8f395fe0161eeea0de5133eba"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:6d6626a6e4dbe843df96dc8c08dd244d2191a75324f54bfa4ebaa3e76b0b1958"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:b4fb8ed6d29f2d6cf03ef99ffaad635bbc132a59be77013691392fe557e67144"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8cc862a1ad30f94528d66cc6f95fb9e659005e568313e54a23550535b649573"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e72a30be1746ea0749a8486d0ca0120c0b2757fe84fc246a5144b1ef66d7b89"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:331a897306adeec3c67470431ea8d8b4972b689d32966f94506d91f4dac20952"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f336ad9be661d92fa45940e74e8ff3d78e67ebe9b4f7ea8774b2d680c17aeb6c"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-win32.whl", hash = "sha256:40ef70e8c5d0310dedff9af502b520b4c7e215bce94094527fb959150a0c594a"}, - {file = "grpcio_tools-1.51.1-cp39-cp39-win_amd64.whl", hash = "sha256:15b8acf4eaa0ebe37e2f69108de49efd935b7abe9c7e58ba737490b99906aa76"}, + {file = "grpcio-tools-1.51.3.tar.gz", hash = "sha256:4fea28e3dd31871579a57058796a78093c75b74b74e9de2f2b7a7fd9a443d403"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-linux_armv7l.whl", hash = "sha256:779ac1ad2258b8debaa45595bfb3814806ed8880e3ea7f194e551d76a6255969"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:83bf605fe2b3591d3c8a78646f37c72c5832c4dd84b5f92405c17cb10b136be6"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:35f885c5afd8e6a77d320f5a9624b439a93f9be2b87fa7b7948c1ad7b2ba0894"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:253b639fb79a4d28ce494ae40e5695bf1e2cb4a05f205fc433c46b2049ab4d99"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c6b145587d6062e2335f0b3286501dd6853a1ea50bd466a913351b7c48e5f20"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:046c0b1e372d4acf552aa0c8f5e830f019d67b75f25aeb0968d15fbdd3eaabd3"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc90b0287908c46281eb61933acaa1b96a575d0160fc98b5c64b9dec46f60d1"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-win32.whl", hash = "sha256:8e9df40db7a0edd403b539cc142d6114270e35debf723a5b4a7a93d5c30fffc0"}, + {file = "grpcio_tools-1.51.3-cp310-cp310-win_amd64.whl", hash = "sha256:077adaee431c2b040dd77923964577087c32e828908e8fa2e53f8e003ad408c9"}, + {file = "grpcio_tools-1.51.3-cp311-cp311-linux_armv7l.whl", hash = "sha256:b50f9b8a6482a90c1a41e731a879a130f7dea267065d0a06f47c9160ce5d01c3"}, + {file = "grpcio_tools-1.51.3-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:89a68adcb4238aba69f3a364ac02c9a46e55b9e3fd8af1c6f384079abfa9347c"}, + {file = "grpcio_tools-1.51.3-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d177da43e7f6fde6715df4a3015ae13158166bc2845ac7f9cfb526eafb41b8"}, + {file = "grpcio_tools-1.51.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:793f9edef82f600a3324f8a3d8cd8318a8d02f28fb54f8236cbb35ce0928d186"}, + {file = "grpcio_tools-1.51.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f7583735542ced7d30baec6cc21bffeaffcec1523bf807e8f8f0047113b6d30a"}, + {file = "grpcio_tools-1.51.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f2df233a3e7db23d9b516cb5e2bfe029465f40a72978bee0584e44e7860ea73f"}, + {file = "grpcio_tools-1.51.3-cp311-cp311-win32.whl", hash = "sha256:7427939455735fbf2ea88c37f1585c9c8b809eec7b447642f34465eb4d26020b"}, + {file = "grpcio_tools-1.51.3-cp311-cp311-win_amd64.whl", hash = "sha256:ba76d15fd149b575170fa32a1f6a9ff2b38ff9db223229a8ad6f53450a452688"}, + {file = "grpcio_tools-1.51.3-cp37-cp37m-linux_armv7l.whl", hash = "sha256:d2212c682529263b3c9e903092d0ccbb9fc6afba820e4c2fa52c2c27720cdcae"}, + {file = "grpcio_tools-1.51.3-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:405656b3cf9639427e6c30a795570cba4a7c06b88a3145866f7d2c05b7e048b4"}, + {file = 
"grpcio_tools-1.51.3-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:3c445a064b2ef3d3475e26e2add8ddb4ac2933741ecddf71d5b071a3ad078db4"}, + {file = "grpcio_tools-1.51.3-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7b3374f4a6579c58d16a5fab2e6b4e9bb8625a034a7f4cd6024f4d1cc12f2a0"}, + {file = "grpcio_tools-1.51.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e8df08b65f9379c3f103147b29542b0141ca84e77d0eee9114ca5f9b3f0d23"}, + {file = "grpcio_tools-1.51.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2fade12de08923b350475ca16d0d0bd68578c30fce89147aa0f94ef5759bc5a9"}, + {file = "grpcio_tools-1.51.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4ffb6325ed489065dbdca764cf37c3a29376bc657874116c9af788d7a0d2ee4"}, + {file = "grpcio_tools-1.51.3-cp37-cp37m-win32.whl", hash = "sha256:f8d17271fc58ed3503dd571c79917e126deca51f85f093770a9606e806aac9dc"}, + {file = "grpcio_tools-1.51.3-cp37-cp37m-win_amd64.whl", hash = "sha256:ef849687c7f2bd7f3277edc7c7cafc7042823d0fb078e3c01c861eb0c96ed181"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-linux_armv7l.whl", hash = "sha256:7fd18d8d211fbfd337fc12e5bdd57e62368f636addf901d290e68a39f1dfea38"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:233fc56f054424232e2086f444004413e33c699174ce6ee0e279c25227243fec"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:867fa1973fa8b0772077c15425f122f672a18b1c53709a8a2bff9d056db4c20e"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b486a99bdf2722e68a9d59769389e2fb86878b6f293be5111f7678e364a0c359"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8bbf412c357999f88d87f421fd48b4b114fc037fec7bbaed0cb7620c24a5e44"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1166744c40821bb0aa605d2af2287fac367756f858a3d18f4c3d25bc0b92757b"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:781896c488e07b9463196045e6725e52d018cd7d0e1062d4ab1eee2647ca9170"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-win32.whl", hash = "sha256:35c1ee7c766eb586f04ba41fa7711eb847767eb277a1737998374ac57768f1f0"}, + {file = "grpcio_tools-1.51.3-cp38-cp38-win_amd64.whl", hash = "sha256:584b201fb39307dcb1affcf2647656a0e6244423ef1659cc6caa3ff85c5ae5c1"}, + {file = "grpcio_tools-1.51.3-cp39-cp39-linux_armv7l.whl", hash = "sha256:e02231e21029f716a1d23a0b5e664fa243d147da33a3f55088a9529b860aa4ac"}, + {file = "grpcio_tools-1.51.3-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:fbb742e10bd548031b8d80f7c28eb70c7c3a9850f8e99c98cd496f19a05f9fee"}, + {file = "grpcio_tools-1.51.3-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:a836a72c657f751244cdb358c3461a89627e6d02654079d2450cfe361800428c"}, + {file = "grpcio_tools-1.51.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bb554408e0ec5ff5201013f268726d9eef8e5bd1fd4b4e09c46c0b4a9de8b64c"}, + {file = "grpcio_tools-1.51.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:158c5bfe7e157fd9a944bde9f7dfe3b468416666e4fade77cd17caa3edc8bd81"}, + {file = "grpcio_tools-1.51.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:715c792679224171c0584e9f235b921d76f8990deb38b0d1215d0469301d9cd9"}, + {file = "grpcio_tools-1.51.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ece44f42b10e0bceb49235be1e361e1ee69afee7f938c82fb656a601a4a720e3"}, + {file = 
"grpcio_tools-1.51.3-cp39-cp39-win32.whl", hash = "sha256:980e632710ba05e04364c6f276e905d5d367437f1ce2265ce7b96b5c1eac5693"}, + {file = "grpcio_tools-1.51.3-cp39-cp39-win_amd64.whl", hash = "sha256:5f4c47b14e66f80365cd5667ecc2f7fb0eb91e02c4e54362041b758feaa00511"}, +] +hf-transfer = [ + {file = "hf_transfer-0.1.2-cp310-cp310-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:2b9189a4a460646ee135ee771f39c0f695d3d5bf08b7ff1dcfe374227520e994"}, + {file = "hf_transfer-0.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:654fcaba4e7084caa1e97430982ea968935a72916ee0f4afc60e356f89774099"}, + {file = "hf_transfer-0.1.2-cp310-none-win_amd64.whl", hash = "sha256:eb29e7b3707b5cac02e689c89111685ebcdaa3cebba02eb7ac1b0f076357da72"}, + {file = "hf_transfer-0.1.2-cp311-cp311-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:0bfca9bd84e925e978a0f157df488704c17a0b9ad240b2859262faba0c74cd40"}, + {file = "hf_transfer-0.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d00c5473b35227b2f113fd43ff13cbac9539f2e6779fa0680a887b0aac31c389"}, + {file = "hf_transfer-0.1.2-cp311-none-win_amd64.whl", hash = "sha256:1aaf5937aa433b7d09ce5bf60967ec22b7d3982957b00516a8dc2aaa66384372"}, + {file = "hf_transfer-0.1.2-cp37-cp37m-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:b0aa760a55995ad59ea17e395babafdc56c4e664be0c2d2055664199dd913da1"}, + {file = "hf_transfer-0.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:889dd15e8472daf66e266eb056e31a485af3c35f95a483bb43489a0f6e44c359"}, + {file = "hf_transfer-0.1.2-cp37-none-win_amd64.whl", hash = "sha256:30df586e18ec8a8e67e3201b9038210d94cb3c03c1cbd97673b9c78ede227178"}, + {file = "hf_transfer-0.1.2-cp38-cp38-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:cc97eb97f929f96bed896cd3af9bbdf121c15ac6d63524b9fc9312fd2929099a"}, + {file = "hf_transfer-0.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:583c2c80210a60dafed9a81ba50c389878aee6c34b2dd375cd84522658f29ad8"}, + {file = "hf_transfer-0.1.2-cp38-none-win_amd64.whl", hash = "sha256:6dff58f50d1435b0346f31a32f1f9e2301986521c1d0b51e47a3c82b96d02156"}, + {file = "hf_transfer-0.1.2-cp39-cp39-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:d6db1a8f539133f7a893bb32721916fe72b4d2aa3eb7604581ba1f03b8167c90"}, + {file = "hf_transfer-0.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f284e3f775d215c9a8d3d1c6f6b1001b1e7990d73ae5fd9aea6c9bce9ea79285"}, + {file = "hf_transfer-0.1.2-cp39-none-win_amd64.whl", hash = "sha256:8625beabebc582eafc4141a5ecb9f1183b728d4f63767f01fdcf1e2fbafe6d43"}, + {file = "hf_transfer-0.1.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:947dd1b8b22ac10723b2887ed4b5ef929f7d4dd850b0d66c0c6954a9a85afb06"}, + {file = "hf_transfer-0.1.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90a020f41dfae4629186c284888cd5adbebe402e2497a88351416ab93c7df9a8"}, + {file = "hf_transfer-0.1.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5eb89698746a29805bfc60126b9a008e6ba08a82ef9bb122a6544e84f748e8a4"}, + {file = "hf_transfer-0.1.2.tar.gz", hash = "sha256:6bf847f4c19c7d8d9f9bbb8a7ed52e1271bbf0c1bd920357db0c274ccc69f21d"}, ] idna = [ {file = "idna-3.4-py3-none-any.whl", hash = 
"sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, @@ -965,20 +994,19 @@ pluggy = [ {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] protobuf = [ - {file = "protobuf-4.21.12-cp310-abi3-win32.whl", hash = "sha256:b135410244ebe777db80298297a97fbb4c862c881b4403b71bac9d4107d61fd1"}, - {file = "protobuf-4.21.12-cp310-abi3-win_amd64.whl", hash = "sha256:89f9149e4a0169cddfc44c74f230d7743002e3aa0b9472d8c28f0388102fc4c2"}, - {file = "protobuf-4.21.12-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:299ea899484ee6f44604deb71f424234f654606b983cb496ea2a53e3c63ab791"}, - {file = "protobuf-4.21.12-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:d1736130bce8cf131ac7957fa26880ca19227d4ad68b4888b3be0dea1f95df97"}, - {file = "protobuf-4.21.12-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:78a28c9fa223998472886c77042e9b9afb6fe4242bd2a2a5aced88e3f4422aa7"}, - {file = "protobuf-4.21.12-cp37-cp37m-win32.whl", hash = "sha256:3d164928ff0727d97022957c2b849250ca0e64777ee31efd7d6de2e07c494717"}, - {file = "protobuf-4.21.12-cp37-cp37m-win_amd64.whl", hash = "sha256:f45460f9ee70a0ec1b6694c6e4e348ad2019275680bd68a1d9314b8c7e01e574"}, - {file = "protobuf-4.21.12-cp38-cp38-win32.whl", hash = "sha256:6ab80df09e3208f742c98443b6166bcb70d65f52cfeb67357d52032ea1ae9bec"}, - {file = "protobuf-4.21.12-cp38-cp38-win_amd64.whl", hash = "sha256:1f22ac0ca65bb70a876060d96d914dae09ac98d114294f77584b0d2644fa9c30"}, - {file = "protobuf-4.21.12-cp39-cp39-win32.whl", hash = "sha256:27f4d15021da6d2b706ddc3860fac0a5ddaba34ab679dc182b60a8bb4e1121cc"}, - {file = "protobuf-4.21.12-cp39-cp39-win_amd64.whl", hash = "sha256:237216c3326d46808a9f7c26fd1bd4b20015fb6867dc5d263a493ef9a539293b"}, - {file = "protobuf-4.21.12-py2.py3-none-any.whl", hash = "sha256:a53fd3f03e578553623272dc46ac2f189de23862e68565e83dde203d41b76fc5"}, - {file = "protobuf-4.21.12-py3-none-any.whl", hash = "sha256:b98d0148f84e3a3c569e19f52103ca1feacdac0d2df8d6533cf983d1fda28462"}, - {file = "protobuf-4.21.12.tar.gz", hash = "sha256:7cd532c4566d0e6feafecc1059d04c7915aec8e182d1cf7adee8b24ef1e2e6ab"}, + {file = "protobuf-4.22.0-cp310-abi3-win32.whl", hash = "sha256:b2fea9dc8e3c0f32c38124790ef16cba2ee0628fe2022a52e435e1117bfef9b1"}, + {file = "protobuf-4.22.0-cp310-abi3-win_amd64.whl", hash = "sha256:a33a273d21852f911b8bda47f39f4383fe7c061eb1814db2c76c9875c89c2491"}, + {file = "protobuf-4.22.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:e894e9ae603e963f0842498c4cd5d39c6a60f0d7e4c103df50ee939564298658"}, + {file = "protobuf-4.22.0-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:7c535d126e7dcc714105ab20b418c4fedbd28f8b8afc42b7350b1e317bbbcc71"}, + {file = "protobuf-4.22.0-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:86c3d20428b007537ba6792b475c0853bba7f66b1f60e610d913b77d94b486e4"}, + {file = "protobuf-4.22.0-cp37-cp37m-win32.whl", hash = "sha256:1669cb7524221a8e2d9008d0842453dbefdd0fcdd64d67672f657244867635fb"}, + {file = "protobuf-4.22.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ab4d043865dd04e6b09386981fe8f80b39a1e46139fb4a3c206229d6b9f36ff6"}, + {file = "protobuf-4.22.0-cp38-cp38-win32.whl", hash = "sha256:29288813aacaa302afa2381db1d6e0482165737b0afdf2811df5fa99185c457b"}, + {file = "protobuf-4.22.0-cp38-cp38-win_amd64.whl", hash = "sha256:e474b63bab0a2ea32a7b26a4d8eec59e33e709321e5e16fb66e766b61b82a95e"}, + {file = "protobuf-4.22.0-cp39-cp39-win32.whl", hash = "sha256:47d31bdf58222dd296976aa1646c68c6ee80b96d22e0a3c336c9174e253fd35e"}, + {file = 
"protobuf-4.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:c27f371f0159feb70e6ea52ed7e768b3f3a4c5676c1900a7e51a24740381650e"}, + {file = "protobuf-4.22.0-py3-none-any.whl", hash = "sha256:c3325803095fb4c2a48649c321d2fbde59f8fbfcb9bfc7a86df27d112831c571"}, + {file = "protobuf-4.22.0.tar.gz", hash = "sha256:652d8dfece122a24d98eebfef30e31e455d300efa41999d1182e015984ac5930"}, ] psutil = [ {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, @@ -1089,8 +1117,8 @@ safetensors = [ {file = "safetensors-0.2.8.tar.gz", hash = "sha256:2720b20a6a38c799dca79bd76caeeac2f7df585a9d4f7d59fa7e28eff9ccb27f"}, ] setuptools = [ - {file = "setuptools-67.2.0-py3-none-any.whl", hash = "sha256:16ccf598aab3b506593c17378473978908a2734d7336755a8769b480906bec1c"}, - {file = "setuptools-67.2.0.tar.gz", hash = "sha256:b440ee5f7e607bb8c9de15259dba2583dd41a38879a7abc1d43a71c59524da48"}, + {file = "setuptools-67.4.0-py3-none-any.whl", hash = "sha256:f106dee1b506dee5102cc3f3e9e68137bbad6d47b616be7991714b0c62204251"}, + {file = "setuptools-67.4.0.tar.gz", hash = "sha256:e5fd0a713141a4a105412233c63dc4e17ba0090c8e8334594ac790ec97792330"}, ] tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, @@ -1124,8 +1152,8 @@ typer = [ {file = "typer-0.6.1.tar.gz", hash = "sha256:2d5720a5e63f73eaf31edaa15f6ab87f35f0690f8ca233017d7d23d743a91d73"}, ] typing-extensions = [ - {file = "typing_extensions-4.4.0-py3-none-any.whl", hash = "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"}, - {file = "typing_extensions-4.4.0.tar.gz", hash = "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa"}, + {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, + {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, ] urllib3 = [ {file = "urllib3-1.26.14-py2.py3-none-any.whl", hash = "sha256:75edcdc2f7d85b137124a6c3c9fc3933cdeaa12ecb9a6a959f22797a0feca7e1"}, @@ -1140,68 +1168,79 @@ win32-setctime = [ {file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"}, ] wrapt = [ - {file = "wrapt-1.14.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3"}, - {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef"}, - {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28"}, - {file = "wrapt-1.14.1-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59"}, - {file = "wrapt-1.14.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87"}, - {file = "wrapt-1.14.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1"}, - {file = "wrapt-1.14.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b"}, - {file = "wrapt-1.14.1-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462"}, - {file = 
"wrapt-1.14.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1"}, - {file = "wrapt-1.14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320"}, - {file = "wrapt-1.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2"}, - {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4"}, - {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069"}, - {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310"}, - {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f"}, - {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656"}, - {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, - {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, - {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, - {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, - {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, - {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d"}, - {file = "wrapt-1.14.1-cp35-cp35m-win32.whl", hash = "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7"}, - {file = "wrapt-1.14.1-cp35-cp35m-win_amd64.whl", hash = "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00"}, - {file = "wrapt-1.14.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4"}, - {file = "wrapt-1.14.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1"}, - {file = "wrapt-1.14.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1"}, - {file = "wrapt-1.14.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff"}, - {file = "wrapt-1.14.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d"}, - {file = "wrapt-1.14.1-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1"}, - {file = 
"wrapt-1.14.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569"}, - {file = "wrapt-1.14.1-cp36-cp36m-win32.whl", hash = "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed"}, - {file = "wrapt-1.14.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471"}, - {file = "wrapt-1.14.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248"}, - {file = "wrapt-1.14.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68"}, - {file = "wrapt-1.14.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d"}, - {file = "wrapt-1.14.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77"}, - {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7"}, - {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015"}, - {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a"}, - {file = "wrapt-1.14.1-cp37-cp37m-win32.whl", hash = "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853"}, - {file = "wrapt-1.14.1-cp37-cp37m-win_amd64.whl", hash = "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c"}, - {file = "wrapt-1.14.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456"}, - {file = "wrapt-1.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f"}, - {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc"}, - {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1"}, - {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af"}, - {file = "wrapt-1.14.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b"}, - {file = "wrapt-1.14.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0"}, - {file = "wrapt-1.14.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57"}, - {file = "wrapt-1.14.1-cp38-cp38-win32.whl", hash = "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5"}, - {file = "wrapt-1.14.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d"}, - {file = "wrapt-1.14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383"}, - {file = 
"wrapt-1.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7"}, - {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86"}, - {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735"}, - {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b"}, - {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3"}, - {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3"}, - {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe"}, - {file = "wrapt-1.14.1-cp39-cp39-win32.whl", hash = "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5"}, - {file = "wrapt-1.14.1-cp39-cp39-win_amd64.whl", hash = "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb"}, - {file = "wrapt-1.14.1.tar.gz", hash = "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d"}, + {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a"}, + {file = "wrapt-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923"}, + {file = "wrapt-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee"}, + {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727"}, + {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7"}, + {file = 
"wrapt-1.15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975"}, + {file = "wrapt-1.15.0-cp310-cp310-win32.whl", hash = "sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1"}, + {file = "wrapt-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e"}, + {file = "wrapt-1.15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7"}, + {file = "wrapt-1.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c"}, + {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3"}, + {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92"}, + {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98"}, + {file = "wrapt-1.15.0-cp311-cp311-win32.whl", hash = "sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416"}, + {file = "wrapt-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248"}, + {file = "wrapt-1.15.0-cp35-cp35m-win32.whl", hash = "sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559"}, + {file = "wrapt-1.15.0-cp35-cp35m-win_amd64.whl", hash = "sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639"}, + {file = "wrapt-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba"}, + {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752"}, + {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364"}, + {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2"}, + {file = "wrapt-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1"}, + {file = "wrapt-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420"}, + {file = "wrapt-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653"}, + {file = "wrapt-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0"}, + {file = "wrapt-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e"}, + {file = "wrapt-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145"}, + {file = "wrapt-1.15.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7"}, + {file = "wrapt-1.15.0-cp38-cp38-win32.whl", hash = "sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b"}, + {file = "wrapt-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1"}, + {file = "wrapt-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86"}, + {file = "wrapt-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9"}, + {file = "wrapt-1.15.0-cp39-cp39-win32.whl", hash = "sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff"}, + {file = "wrapt-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6"}, + {file = "wrapt-1.15.0-py3-none-any.whl", hash = "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640"}, + {file = "wrapt-1.15.0.tar.gz", hash = "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a"}, ] diff --git a/server/pyproject.toml b/server/pyproject.toml index d3a8c112..63a3df6c 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -1,11 +1,11 @@ [tool.poetry] -name = "text-generation" -version = "0.2.1" +name = "text-generation-server" +version = "0.4.0" description = "Text Generation Inference Python gRPC Server" authors = ["Olivier Dehaene "] [tool.poetry.scripts] -text-generation-server = 'text_generation.cli:app' +text-generation-server = 'text_generation_server.cli:app' [tool.poetry.dependencies] python = "^3.9" @@ -22,6 +22,7 @@ loguru = "^0.6.0" opentelemetry-api = "^1.15.0" opentelemetry-exporter-otlp = "^1.15.0" opentelemetry-instrumentation-grpc = "^0.36b0" +hf-transfer = "^0.1.2" [tool.poetry.extras] bnb = ["bitsandbytes"] diff --git a/server/tests/conftest.py b/server/tests/conftest.py index 9fae8ee1..16d2c408 100644 --- a/server/tests/conftest.py +++ b/server/tests/conftest.py @@ -1,6 +1,6 @@ import pytest -from text_generation.pb import generate_pb2 +from text_generation_server.pb import generate_pb2 @pytest.fixture @@ -10,6 +10,7 @@ def default_pb_parameters(): repetition_penalty=1.0, 
top_k=0, top_p=1.0, + typical_p=1.0, do_sample=False, ) diff --git a/server/tests/models/test_bloom.py b/server/tests/models/test_bloom.py index b06d57f5..2b8ef5f8 100644 --- a/server/tests/models/test_bloom.py +++ b/server/tests/models/test_bloom.py @@ -4,9 +4,9 @@ import torch from copy import copy from transformers import AutoTokenizer -from text_generation.pb import generate_pb2 -from text_generation.models.causal_lm import CausalLMBatch -from text_generation.models.bloom import BloomCausalLMBatch, BLOOM +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.causal_lm import CausalLMBatch +from text_generation_server.models.bloom import BloomCausalLMBatch, BLOOM @pytest.fixture(scope="session") @@ -24,7 +24,6 @@ def default_pb_request(default_pb_parameters, default_pb_stop_parameters): return generate_pb2.Request( id=0, inputs="Test", - input_length=1, parameters=default_pb_parameters, stopping_parameters=default_pb_stop_parameters, ) @@ -65,8 +64,8 @@ def test_batch_from_pb(default_pb_batch, default_bloom_batch): assert batch.input_ids[0][-1] == 10264 assert torch.all(batch.input_ids[0][:-1] == 3) - assert batch.attention_mask[0][-1] == 1 - assert torch.all(batch.attention_mask[0][:-1] == 0) + assert batch.attention_mask[0][0] == 1 + assert torch.all(batch.attention_mask[0][1:] == 0) assert batch.past_key_values is None @@ -77,7 +76,7 @@ def test_batch_from_pb(default_pb_batch, default_bloom_batch): assert batch.size == default_pb_batch.size assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == batch.size - assert batch.max_sequence_length == batch.input_lengths[0] + assert batch.max_input_length == batch.input_lengths[0] def test_batch_concatenate_no_prefill(default_bloom_batch): @@ -98,22 +97,19 @@ def test_causal_lm_generate_token(default_bloom, default_bloom_batch): assert not next_batch.keys_head_dim_last assert len(next_batch.all_input_ids) == next_batch.size - assert ( - len(next_batch.all_input_ids[0]) - == len(next_batch.attention_mask[0]) - == sequence_length + 1 - ) + assert len(next_batch.all_input_ids[0]) == sequence_length + 1 + assert len(next_batch.attention_mask[0]) == 11 assert torch.all(next_batch.all_input_ids[0][-2:] == 10264) assert torch.all(next_batch.all_input_ids[0][:-2] == 3) - assert torch.all(next_batch.attention_mask[0][-2:] == 1) - assert torch.all(next_batch.attention_mask[0][:-2] == 0) + assert torch.all(next_batch.attention_mask[0][:2] == 1) + assert torch.all(next_batch.attention_mask[0][2:] == 0) assert next_batch.input_ids.shape == (next_batch.size, 1) assert next_batch.input_ids[0, 0] == 10264 assert next_batch.input_lengths == [2] - assert next_batch.max_sequence_length == next_batch.input_lengths[0] + assert next_batch.max_input_length == next_batch.input_lengths[0] assert next_batch.past_key_values is not None assert all( @@ -213,15 +209,19 @@ def test_batch_concatenate( assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0]) assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1]) - assert torch.all(next_batch.attention_mask[0] == 1) - assert torch.all(next_batch.attention_mask[1:, -2:] == 1) - assert torch.all(next_batch.attention_mask[1:, :-2] == 0) + assert torch.all( + next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1 + ) + assert torch.all( + next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1 + ) + assert torch.all(next_batch.attention_mask[1:, 3:] == 0) assert next_batch.batch_id == 0 assert 
torch.all(next_batch.input_ids == 10264) assert next_batch.input_lengths == [3, 2, 2] - assert next_batch.max_sequence_length == 3 + assert next_batch.max_input_length == 3 assert next_batch.requests[0] == next_batch_0.requests[0] assert next_batch.requests[1:] == next_batch_1.requests diff --git a/server/tests/models/test_causal_lm.py b/server/tests/models/test_causal_lm.py index 6a822815..76617b62 100644 --- a/server/tests/models/test_causal_lm.py +++ b/server/tests/models/test_causal_lm.py @@ -4,8 +4,8 @@ import torch from copy import copy from transformers import AutoTokenizer -from text_generation.pb import generate_pb2 -from text_generation.models.causal_lm import CausalLM, CausalLMBatch +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.causal_lm import CausalLM, CausalLMBatch @pytest.fixture(scope="session") @@ -25,7 +25,6 @@ def default_pb_request(default_pb_parameters, default_pb_stop_parameters): return generate_pb2.Request( id=0, inputs="Test", - input_length=1, parameters=default_pb_parameters, stopping_parameters=default_pb_stop_parameters, ) @@ -62,8 +61,8 @@ def test_batch_from_pb(default_pb_batch, default_causal_lm_batch): assert batch.input_ids[0][-1] == 14402 assert torch.all(batch.input_ids[0][:-1] == 50256) - assert batch.attention_mask[0][-1] == 1 - assert torch.all(batch.attention_mask[0][:-1] == 0) + assert batch.attention_mask[0, 0] == 1 + assert torch.all(batch.attention_mask[0, 1:] == 0) assert batch.past_key_values is None @@ -74,7 +73,7 @@ def test_batch_from_pb(default_pb_batch, default_causal_lm_batch): assert batch.size == default_pb_batch.size assert len(batch.next_token_choosers) == len(batch.stopping_criterias) == batch.size - assert batch.max_sequence_length == batch.input_lengths[0] + assert batch.max_input_length == batch.input_lengths[0] def test_batch_concatenate_no_prefill(default_causal_lm_batch): @@ -94,23 +93,20 @@ def test_causal_lm_generate_token(default_causal_lm, default_causal_lm_batch): assert isinstance(next_batch, CausalLMBatch) assert len(next_batch.all_input_ids) == next_batch.size - assert ( - len(next_batch.all_input_ids[0]) - == len(next_batch.attention_mask[0]) - == sequence_length + 1 - ) + assert len(next_batch.all_input_ids[0]) == sequence_length + 1 + assert len(next_batch.attention_mask[0]) == 11 assert next_batch.all_input_ids[0][-1] == 13 assert next_batch.all_input_ids[0][-2] == 14402 assert torch.all(next_batch.all_input_ids[0][:-2] == 50256) - assert torch.all(next_batch.attention_mask[0][-2:] == 1) - assert torch.all(next_batch.attention_mask[0][:-2] == 0) + assert torch.all(next_batch.attention_mask[0][0:2] == 1) + assert torch.all(next_batch.attention_mask[0][2:] == 0) assert next_batch.input_ids.shape == (next_batch.size, 1) assert next_batch.input_ids[0, 0] == 13 assert next_batch.input_lengths == [2] - assert next_batch.max_sequence_length == next_batch.input_lengths[0] + assert next_batch.max_input_length == next_batch.input_lengths[0] assert next_batch.past_key_values is not None assert all( @@ -210,16 +206,20 @@ def test_batch_concatenate( assert torch.equal(next_batch.all_input_ids[1], next_batch_1.all_input_ids[0]) assert torch.equal(next_batch.all_input_ids[2], next_batch_1.all_input_ids[1]) - assert torch.all(next_batch.attention_mask[0] == 1) - assert torch.all(next_batch.attention_mask[1:, -2:] == 1) - assert torch.all(next_batch.attention_mask[1:, :-2] == 0) + assert torch.all( + next_batch.attention_mask[0, : -next_batch.padding_right_offset] == 1 + ) + assert 
torch.all( + next_batch.attention_mask[1:, 1 : -next_batch.padding_right_offset] == 1 + ) + assert torch.all(next_batch.attention_mask[1:, 3:] == 0) assert next_batch.batch_id == 0 assert next_batch.input_ids[0, 0] == 12355 assert torch.all(next_batch.input_ids[1:] == 13) assert next_batch.input_lengths == [3, 2, 2] - assert next_batch.max_sequence_length == 3 + assert next_batch.max_input_length == 3 assert next_batch.requests[0] == next_batch_0.requests[0] assert next_batch.requests[1:] == next_batch_1.requests diff --git a/server/tests/models/test_santacoder.py b/server/tests/models/test_santacoder.py index 1596e413..753ff5fc 100644 --- a/server/tests/models/test_santacoder.py +++ b/server/tests/models/test_santacoder.py @@ -1,8 +1,8 @@ import pytest -from text_generation.pb import generate_pb2 -from text_generation.models.causal_lm import CausalLMBatch -from text_generation.models.santacoder import SantaCoder +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.causal_lm import CausalLMBatch +from text_generation_server.models.santacoder import SantaCoder @pytest.fixture(scope="session") @@ -15,7 +15,6 @@ def default_pb_request(default_pb_parameters, default_pb_stop_parameters): return generate_pb2.Request( id=0, inputs="def", - input_length=1, parameters=default_pb_parameters, stopping_parameters=default_pb_stop_parameters, ) @@ -31,7 +30,6 @@ def default_fim_pb_request(default_pb_parameters, default_pb_stop_parameters): return generate_pb2.Request( id=0, inputs="defworld", - input_length=5, parameters=default_pb_parameters, stopping_parameters=default_pb_stop_parameters, ) diff --git a/server/tests/models/test_seq2seq_lm.py b/server/tests/models/test_seq2seq_lm.py index 22c6ac9c..2d86c44b 100644 --- a/server/tests/models/test_seq2seq_lm.py +++ b/server/tests/models/test_seq2seq_lm.py @@ -5,8 +5,8 @@ from copy import copy from transformers import AutoTokenizer -from text_generation.pb import generate_pb2 -from text_generation.models.seq2seq_lm import Seq2SeqLM, Seq2SeqLMBatch +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.seq2seq_lm import Seq2SeqLM, Seq2SeqLMBatch @pytest.fixture(scope="session") @@ -28,7 +28,6 @@ def default_pb_request(default_pb_parameters, default_pb_stop_parameters): return generate_pb2.Request( id=0, inputs="Test", - input_length=2, parameters=default_pb_parameters, stopping_parameters=default_pb_stop_parameters, ) @@ -106,7 +105,7 @@ def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch) assert len(generations) == len(next_batch) assert isinstance(next_batch, Seq2SeqLMBatch) - assert torch.equal(next_batch.input_ids, default_seq2seq_lm_batch.input_ids) + assert next_batch.input_ids is None assert torch.equal( next_batch.attention_mask, default_seq2seq_lm_batch.attention_mask ) @@ -148,7 +147,7 @@ def test_seq2seq_lm_generate_token(default_seq2seq_lm, default_seq2seq_lm_batch) assert all([generation.generated_text is None for generation in generations]) assert all([len(generation.prefill_tokens) == 1 for generation in generations]) assert all([generation.token_id.item() == 259 for generation in generations]) - assert all([generation.token_text == "" for generation in generations]) + assert all([generation.token_text == " " for generation in generations]) assert generations[0].request_id == 0 @@ -220,11 +219,6 @@ def test_batch_concatenate( assert next_batch.batch_id == 0 - assert torch.all(next_batch.input_ids[:, 0] == 4268) - assert 
torch.all(next_batch.input_ids[:, 1] == 1) - - assert torch.all(next_batch.attention_mask == 1) - assert torch.equal( next_batch.decoder_input_ids[0], next_batch_0.decoder_input_ids[0] ) @@ -233,9 +227,10 @@ def test_batch_concatenate( next_batch.decoder_input_ids[1:, -2:], next_batch_1.decoder_input_ids ) - assert torch.all(next_batch.decoder_attention_mask[0] == 1) + assert torch.all(next_batch.decoder_attention_mask[0, :3] == 1) + assert torch.all(next_batch.decoder_attention_mask[0, 3:] == 0) assert torch.all(next_batch.decoder_attention_mask[1:, 0] == 0) - assert torch.all(next_batch.decoder_attention_mask[1:, -2:] == 1) + assert torch.all(next_batch.decoder_attention_mask[1:, 1:3] == 1) assert torch.equal( next_batch.encoder_last_hidden_state[0], diff --git a/server/tests/utils/test_convert.py b/server/tests/utils/test_convert.py new file mode 100644 index 00000000..7dfe6a1e --- /dev/null +++ b/server/tests/utils/test_convert.py @@ -0,0 +1,21 @@ +from text_generation_server.utils.hub import ( + download_weights, + weight_hub_files, + weight_files, +) + +from text_generation_server.utils.convert import convert_files + + +def test_convert_files(): + model_id = "bigscience/bloom-560m" + pt_filenames = weight_hub_files(model_id, extension=".bin") + local_pt_files = download_weights(pt_filenames, model_id) + local_st_files = [ + p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors" for p in local_pt_files + ] + convert_files(local_pt_files, local_st_files) + + found_st_files = weight_files(model_id) + + assert all([p in found_st_files for p in local_st_files]) diff --git a/server/tests/utils/test_hub.py b/server/tests/utils/test_hub.py new file mode 100644 index 00000000..fac9a64d --- /dev/null +++ b/server/tests/utils/test_hub.py @@ -0,0 +1,40 @@ +import pytest + +from text_generation_server.utils.hub import ( + weight_hub_files, + download_weights, + weight_files, + EntryNotFoundError, + LocalEntryNotFoundError, + RevisionNotFoundError, +) + + +def test_weight_hub_files(): + filenames = weight_hub_files("bigscience/bloom-560m") + assert filenames == ["model.safetensors"] + + +def test_weight_hub_files_llm(): + filenames = weight_hub_files("bigscience/bloom") + assert filenames == [f"model_{i:05d}-of-00072.safetensors" for i in range(1, 73)] + + +def test_weight_hub_files_empty(): + with pytest.raises(EntryNotFoundError): + weight_hub_files("bigscience/bloom", extension=".errors") + + +def test_download_weights(): + model_id = "bigscience/bloom-560m" + filenames = weight_hub_files(model_id) + files = download_weights(filenames, model_id) + local_files = weight_files("bigscience/bloom-560m") + assert files == local_files + + +def test_weight_files_error(): + with pytest.raises(RevisionNotFoundError): + weight_files("bigscience/bloom-560m", revision="error") + with pytest.raises(LocalEntryNotFoundError): + weight_files("bert-base-uncased") diff --git a/server/tests/test_utils.py b/server/tests/utils/test_tokens.py similarity index 52% rename from server/tests/test_utils.py rename to server/tests/utils/test_tokens.py index ffe9be65..3883ad97 100644 --- a/server/tests/test_utils.py +++ b/server/tests/utils/test_tokens.py @@ -1,14 +1,6 @@ -import pytest - -from huggingface_hub.utils import RevisionNotFoundError - -from text_generation.utils import ( - weight_hub_files, - download_weights, - weight_files, +from text_generation_server.utils.tokens import ( StopSequenceCriteria, StoppingCriteria, - LocalEntryNotFoundError, FinishReason, ) @@ -41,31 +33,3 @@ def test_stopping_criteria_max(): 
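Taken together, the new `server/tests/utils/test_hub.py` above pins down the reshaped hub API: `download_weights` now takes the filename list first and the model id second, and a query matching no files raises `EntryNotFoundError` instead of returning an empty list. A hedged end-to-end sketch using only the calls exercised by these tests:

from text_generation_server.utils.hub import (
    download_weights,
    weight_files,
    weight_hub_files,
)

model_id = "bigscience/bloom-560m"

# 1. List the .safetensors filenames published on the Hub for this model.
filenames = weight_hub_files(model_id)

# 2. Download them; note the new argument order (filenames, model_id).
downloaded = download_weights(filenames, model_id)

# 3. weight_files resolves the same weights from the local cache.
assert downloaded == weight_files(model_id)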
assert criteria(1, "") == (False, None) assert criteria(1, "") == (False, None) assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH) - - -def test_weight_hub_files(): - filenames = weight_hub_files("bigscience/bloom-560m") - assert filenames == ["model.safetensors"] - - -def test_weight_hub_files_llm(): - filenames = weight_hub_files("bigscience/bloom") - assert filenames == [f"model_{i:05d}-of-00072.safetensors" for i in range(1, 73)] - - -def test_weight_hub_files_empty(): - filenames = weight_hub_files("bigscience/bloom", extension=".errors") - assert filenames == [] - - -def test_download_weights(): - files = download_weights("bigscience/bloom-560m") - local_files = weight_files("bigscience/bloom-560m") - assert files == local_files - - -def test_weight_files_error(): - with pytest.raises(RevisionNotFoundError): - weight_files("bigscience/bloom-560m", revision="error") - with pytest.raises(LocalEntryNotFoundError): - weight_files("bert-base-uncased") diff --git a/server/text_generation/cli.py b/server/text_generation/cli.py deleted file mode 100644 index e9c8ea92..00000000 --- a/server/text_generation/cli.py +++ /dev/null @@ -1,68 +0,0 @@ -import os -import sys -import typer - -from pathlib import Path -from loguru import logger -from typing import Optional - -from text_generation import server, utils -from text_generation.tracing import setup_tracing - -app = typer.Typer() - - -@app.command() -def serve( - model_id: str, - revision: Optional[str] = None, - sharded: bool = False, - quantize: bool = False, - uds_path: Path = "/tmp/text-generation", - logger_level: str = "INFO", - json_output: bool = False, - otlp_endpoint: Optional[str] = None, -): - if sharded: - assert ( - os.getenv("RANK", None) is not None - ), "RANK must be set when sharded is True" - assert ( - os.getenv("WORLD_SIZE", None) is not None - ), "WORLD_SIZE must be set when sharded is True" - assert ( - os.getenv("MASTER_ADDR", None) is not None - ), "MASTER_ADDR must be set when sharded is True" - assert ( - os.getenv("MASTER_PORT", None) is not None - ), "MASTER_PORT must be set when sharded is True" - - # Remove default handler - logger.remove() - logger.add( - sys.stdout, - format="{message}", - filter="text_generation", - level=logger_level, - serialize=json_output, - backtrace=True, - diagnose=False, - ) - # Setup OpenTelemetry distributed tracing - if otlp_endpoint is not None: - setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint) - - server.serve(model_id, revision, sharded, quantize, uds_path) - - -@app.command() -def download_weights( - model_id: str, - revision: Optional[str] = None, - extension: str = ".safetensors", -): - utils.download_weights(model_id, revision, extension) - - -if __name__ == "__main__": - app() diff --git a/server/text_generation/models/model.py b/server/text_generation/models/model.py deleted file mode 100644 index ef6a5682..00000000 --- a/server/text_generation/models/model.py +++ /dev/null @@ -1,24 +0,0 @@ -import torch - -from abc import ABC, abstractmethod -from typing import List, Tuple, Optional, TypeVar, Type -from transformers import PreTrainedTokenizerBase - -from text_generation.models.types import Batch, GeneratedText - -B = TypeVar("B", bound=Batch) - - -class Model(ABC): - def __init__(self, tokenizer: PreTrainedTokenizerBase, device: torch.device): - self.tokenizer = tokenizer - self.device = device - - @property - @abstractmethod - def batch_type(self) -> Type[B]: - raise NotImplementedError - - @abstractmethod - def generate_token(self, 
batch: B) -> Tuple[List[GeneratedText], Optional[B]]: - raise NotImplementedError diff --git a/server/text_generation/utils.py b/server/text_generation/utils.py deleted file mode 100644 index 3b3f08c7..00000000 --- a/server/text_generation/utils.py +++ /dev/null @@ -1,283 +0,0 @@ -import concurrent -import os -import re -import torch -import torch.distributed - -from datetime import timedelta - -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from pathlib import Path -from huggingface_hub import HfApi, hf_hub_download, _CACHED_NO_EXIST -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE -from huggingface_hub.utils import LocalEntryNotFoundError -from tqdm import tqdm -from typing import List, Optional, Tuple -from transformers import PreTrainedTokenizerBase -from transformers.generation.logits_process import ( - LogitsProcessorList, - RepetitionPenaltyLogitsProcessor, - TemperatureLogitsWarper, - TopPLogitsWarper, - TopKLogitsWarper, -) - -from text_generation.pb import generate_pb2 -from text_generation.pb.generate_pb2 import FinishReason - -WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None) - - -class Sampling: - def __init__(self, seed: int, device: str = "cpu"): - self.generator = torch.Generator(device) - self.generator.manual_seed(seed) - self.seed = seed - - def __call__(self, logits): - probs = torch.nn.functional.softmax(logits) - next_tokens = torch.multinomial(probs, num_samples=1, generator=self.generator) - return next_tokens - - -class Greedy: - def __call__(self, logits): - return logits.argmax() - - -class NextTokenChooser: - def __init__( - self, - temperature=1.0, - repetition_penalty=1.0, - top_k=None, - top_p=None, - do_sample=False, - seed=0, - device="cpu", - ): - warpers = LogitsProcessorList() - # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files - # all samplers can be found in `generation_utils_samplers.py` - sampling = do_sample - if temperature is not None and temperature != 1.0: - temperature = float(temperature) - warpers.append(TemperatureLogitsWarper(temperature)) - sampling = True - if top_k is not None and top_k != 0: - warpers.append(TopKLogitsWarper(top_k=top_k)) - sampling = True - if top_p is not None and top_p < 1.0: - warpers.append(TopPLogitsWarper(top_p=top_p)) - sampling = True - if repetition_penalty is not None and repetition_penalty != 1.0: - warpers.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) - - self.warpers = warpers - self.choice = Sampling(seed, device) if sampling else Greedy() - - def __call__(self, input_ids, scores): - # Warp logits - scores = self.warpers(input_ids, scores) - - # Compute logprobs - logprobs = torch.log_softmax(scores, -1) - - # Choose tokens - next_id = self.choice(scores[-1]) - - return next_id.view(1, 1), logprobs - - @classmethod - def from_pb( - cls, pb: generate_pb2.NextTokenChooserParameters, device: torch.device - ) -> "NextTokenChooser": - return NextTokenChooser( - temperature=pb.temperature, - repetition_penalty=pb.repetition_penalty, - top_k=pb.top_k, - top_p=pb.top_p, - do_sample=pb.do_sample, - seed=pb.seed, - device=device, - ) - - -class StopSequenceCriteria: - def __init__(self, stop_sequence: str): - self.regex = re.compile(f".*{stop_sequence}$") - - def __call__(self, output: str) -> bool: - if self.regex.findall(output): - return True - return False - - -class StoppingCriteria: - def __init__( - self, - eos_token_id: int, - stop_sequence_criterias: 
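For readers skimming the deleted `utils.py` above: the module is being split up, not removed; the stopping criteria, for instance, resurface in `text_generation_server/utils/tokens.py`. The `NextTokenChooser` pattern it carries over, building a `LogitsProcessorList` and only sampling when some warper actually constrains the distribution, looks roughly like this minimal sketch (parameter values invented):

import torch
from transformers.generation.logits_process import (
    LogitsProcessorList,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
)

temperature, top_k, top_p = 0.8, 50, 0.95
warpers = LogitsProcessorList()
sampling = False

# Each warper that actually changes the distribution flips us
# from greedy decoding to sampling, mirroring the class above.
if temperature != 1.0:
    warpers.append(TemperatureLogitsWarper(temperature))
    sampling = True
if top_k != 0:
    warpers.append(TopKLogitsWarper(top_k=top_k))
    sampling = True
if top_p < 1.0:
    warpers.append(TopPLogitsWarper(top_p=top_p))
    sampling = True

input_ids = torch.tensor([[0, 1, 2]])
scores = torch.randn(1, 32000)       # fake vocabulary logits
scores = warpers(input_ids, scores)  # apply in registration order
next_id = (
    torch.multinomial(torch.softmax(scores, -1), num_samples=1)
    if sampling
    else scores.argmax(-1)
)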
List[StopSequenceCriteria], - max_new_tokens=20, - ): - self.eos_token_id = eos_token_id - self.stop_sequence_criterias = stop_sequence_criterias - self.max_new_tokens = max_new_tokens - self.current_tokens = 0 - self.current_output = "" - - def __call__(self, last_token: int, last_output: str) -> Tuple[bool, Optional[str]]: - self.current_tokens += 1 - if self.current_tokens >= self.max_new_tokens: - return True, FinishReason.FINISH_REASON_LENGTH - - if last_token == self.eos_token_id: - return True, FinishReason.FINISH_REASON_EOS_TOKEN - - self.current_output += last_output - for stop_sequence_criteria in self.stop_sequence_criterias: - if stop_sequence_criteria(self.current_output): - return True, FinishReason.FINISH_REASON_STOP_SEQUENCE - - return False, None - - @classmethod - def from_pb( - cls, - pb: generate_pb2.StoppingCriteriaParameters, - tokenizer: PreTrainedTokenizerBase, - ) -> "StoppingCriteria": - stop_sequence_criterias = [ - StopSequenceCriteria(sequence) for sequence in pb.stop_sequences - ] - return StoppingCriteria( - tokenizer.eos_token_id, stop_sequence_criterias, pb.max_new_tokens - ) - - -def initialize_torch_distributed(): - rank = int(os.getenv("RANK", "0")) - world_size = int(os.getenv("WORLD_SIZE", "1")) - - if torch.cuda.is_available(): - from torch.distributed import ProcessGroupNCCL - - # Set the device id. - assert world_size <= torch.cuda.device_count(), "Each process is one gpu" - device = rank % torch.cuda.device_count() - torch.cuda.set_device(device) - backend = "nccl" - options = ProcessGroupNCCL.Options() - options.is_high_priority_stream = True - options._timeout = timedelta(seconds=60) - else: - backend = "gloo" - options = None - - # Call the init process. - torch.distributed.init_process_group( - backend=backend, - world_size=world_size, - rank=rank, - timeout=timedelta(seconds=60), - pg_options=options, - ) - - return torch.distributed.group.WORLD, rank, world_size - - -def weight_hub_files(model_id, revision=None, extension=".safetensors"): - """Get the safetensors filenames on the hub""" - api = HfApi() - info = api.model_info(model_id, revision=revision) - filenames = [s.rfilename for s in info.siblings if s.rfilename.endswith(extension)] - return filenames - - -def try_to_load_from_cache(model_id, revision, filename): - """Try to load a file from the Hugging Face cache""" - if revision is None: - revision = "main" - - object_id = model_id.replace("/", "--") - repo_cache = Path(HUGGINGFACE_HUB_CACHE) / f"models--{object_id}" - - if not repo_cache.is_dir(): - # No cache for this model - return None - - refs_dir = repo_cache / "refs" - snapshots_dir = repo_cache / "snapshots" - no_exist_dir = repo_cache / ".no_exist" - - # Resolve refs (for instance to convert main to the associated commit sha) - if refs_dir.is_dir(): - revision_file = refs_dir / revision - if revision_file.exists(): - with revision_file.open() as f: - revision = f.read() - - # Check if file is cached as "no_exist" - if (no_exist_dir / revision / filename).is_file(): - return _CACHED_NO_EXIST - - # Check if revision folder exists - if not snapshots_dir.exists(): - return None - cached_shas = os.listdir(snapshots_dir) - if revision not in cached_shas: - # No cache for this revision and we won't try to return a random revision - return None - - # Check if file exists in cache - cached_file = snapshots_dir / revision / filename - return str(cached_file) if cached_file.is_file() else None - - -def weight_files(model_id, revision=None, extension=".safetensors"): - """Get the local 
safetensors filenames""" - if WEIGHTS_CACHE_OVERRIDE is not None: - return list(Path(WEIGHTS_CACHE_OVERRIDE).glob(f"*{extension}")) - - filenames = weight_hub_files(model_id, revision, extension) - files = [] - for filename in filenames: - cache_file = try_to_load_from_cache( - model_id, revision=revision, filename=filename - ) - if cache_file is None: - raise LocalEntryNotFoundError( - f"File {filename} of model {model_id} not found in " - f"{os.getenv('HUGGINGFACE_HUB_CACHE', 'the local cache')}. " - f"Please run `text-generation-server download-weights {model_id}` first." - ) - files.append(cache_file) - - return files - - -def download_weights(model_id, revision=None, extension=".safetensors"): - """Download the safetensors files from the hub""" - if WEIGHTS_CACHE_OVERRIDE is not None: - return list(Path(WEIGHTS_CACHE_OVERRIDE).glob(f"*{extension}")) - - filenames = weight_hub_files(model_id, revision, extension) - - download_function = partial( - hf_hub_download, - repo_id=model_id, - local_files_only=False, - ) - - executor = ThreadPoolExecutor(max_workers=5) - futures = [ - executor.submit(download_function, filename=filename, revision=revision) - for filename in filenames - ] - files = [ - future.result() - for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)) - ] - - return files diff --git a/server/text_generation/__init__.py b/server/text_generation_server/__init__.py similarity index 100% rename from server/text_generation/__init__.py rename to server/text_generation_server/__init__.py diff --git a/server/text_generation/cache.py b/server/text_generation_server/cache.py similarity index 90% rename from server/text_generation/cache.py rename to server/text_generation_server/cache.py index 5a3a8d31..72dc4857 100644 --- a/server/text_generation/cache.py +++ b/server/text_generation_server/cache.py @@ -1,6 +1,6 @@ from typing import Dict, Optional, TypeVar -from text_generation.models.types import Batch +from text_generation_server.models.types import Batch B = TypeVar("B", bound=Batch) diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py new file mode 100644 index 00000000..6308ef6b --- /dev/null +++ b/server/text_generation_server/cli.py @@ -0,0 +1,115 @@ +import os +import sys +import typer + +from pathlib import Path +from loguru import logger +from typing import Optional + +from text_generation_server import server, utils +from text_generation_server.tracing import setup_tracing + +app = typer.Typer() + + +@app.command() +def serve( + model_id: str, + revision: Optional[str] = None, + sharded: bool = False, + quantize: bool = False, + uds_path: Path = "/tmp/text-generation", + logger_level: str = "INFO", + json_output: bool = False, + otlp_endpoint: Optional[str] = None, +): + if sharded: + assert ( + os.getenv("RANK", None) is not None + ), "RANK must be set when sharded is True" + assert ( + os.getenv("WORLD_SIZE", None) is not None + ), "WORLD_SIZE must be set when sharded is True" + assert ( + os.getenv("MASTER_ADDR", None) is not None + ), "MASTER_ADDR must be set when sharded is True" + assert ( + os.getenv("MASTER_PORT", None) is not None + ), "MASTER_PORT must be set when sharded is True" + + # Remove default handler + logger.remove() + logger.add( + sys.stdout, + format="{message}", + filter="text_generation_server", + level=logger_level, + serialize=json_output, + backtrace=True, + diagnose=False, + ) + # Setup OpenTelemetry distributed tracing + if otlp_endpoint is not None: + 
setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint) + + server.serve(model_id, revision, sharded, quantize, uds_path) + + +@app.command() +def download_weights( + model_id: str, + revision: Optional[str] = None, + extension: str = ".safetensors", + logger_level: str = "INFO", + json_output: bool = False, +): + # Remove default handler + logger.remove() + logger.add( + sys.stdout, + format="{message}", + filter="text_generation_server", + level=logger_level, + serialize=json_output, + backtrace=True, + diagnose=False, + ) + + # Test if files were already downloaded + try: + utils.weight_files(model_id, revision, extension) + logger.info( + "Files are already present in the local cache. Skipping download." + ) + return + # Local files not found + except utils.LocalEntryNotFoundError: + pass + + # Download weights directly + try: + filenames = utils.weight_hub_files(model_id, revision, extension) + utils.download_weights(filenames, model_id, revision) + except utils.EntryNotFoundError as e: + if extension != ".safetensors": + raise e + + logger.warning( + f"No safetensors weights found for model {model_id} at revision {revision}. " + f"Converting PyTorch weights instead." + ) + + # Try to see if there are pytorch weights + pt_filenames = utils.weight_hub_files(model_id, revision, ".bin") + # Download pytorch weights + local_pt_files = utils.download_weights(pt_filenames, model_id, revision) + local_st_files = [ + p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors" + for p in local_pt_files + ] + # Convert pytorch weights to safetensors + utils.convert_files(local_pt_files, local_st_files) + + +if __name__ == "__main__": + app() diff --git a/server/text_generation/interceptor.py b/server/text_generation_server/interceptor.py similarity index 100% rename from server/text_generation/interceptor.py rename to server/text_generation_server/interceptor.py diff --git a/server/text_generation/models/__init__.py b/server/text_generation_server/models/__init__.py similarity index 63% rename from server/text_generation/models/__init__.py rename to server/text_generation_server/models/__init__.py index 7445b427..3e2f5c66 100644 --- a/server/text_generation/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -3,14 +3,14 @@ import torch from transformers import AutoConfig from typing import Optional -from text_generation.models.model import Model -from text_generation.models.causal_lm import CausalLM -from text_generation.models.bloom import BLOOM, BLOOMSharded -from text_generation.models.seq2seq_lm import Seq2SeqLM -from text_generation.models.galactica import Galactica, GalacticaSharded -from text_generation.models.santacoder import SantaCoder -from text_generation.models.gpt_neox import GPTNeox, GPTNeoxSharded -from text_generation.models.t5 import T5Sharded +from text_generation_server.models.model import Model +from text_generation_server.models.causal_lm import CausalLM +from text_generation_server.models.bloom import BLOOM, BLOOMSharded +from text_generation_server.models.seq2seq_lm import Seq2SeqLM +from text_generation_server.models.galactica import Galactica, GalacticaSharded +from text_generation_server.models.santacoder import SantaCoder +from text_generation_server.models.gpt_neox import GPTNeoxSharded +from text_generation_server.models.t5 import T5Sharded __all__ = [ "Model", @@ -19,7 +19,6 @@ __all__ = [ "CausalLM", "Galactica", "GalacticaSharded", - "GPTNeox", "GPTNeoxSharded", "Seq2SeqLM", "SantaCoder", @@ -41,6 +40,15 @@ 
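Condensed, the new `download-weights` command above is a three-step fallback: reuse the local cache, else download safetensors from the Hub, else download PyTorch `.bin` weights and convert them. A sketch of that control flow as a hypothetical `ensure_weights` helper (the real logic lives inline in the CLI command, and additionally re-raises when a non-default extension was requested):

from text_generation_server import utils

def ensure_weights(model_id: str, revision=None, extension: str = ".safetensors"):
    # 1. Fast path: converted weights are already in the local cache.
    try:
        return utils.weight_files(model_id, revision, extension)
    except utils.LocalEntryNotFoundError:
        pass
    # 2. Preferred path: safetensors weights exist on the Hub.
    try:
        filenames = utils.weight_hub_files(model_id, revision, extension)
        return utils.download_weights(filenames, model_id, revision)
    except utils.EntryNotFoundError:
        # 3. Fallback: fetch .bin weights and convert them locally.
        pt_filenames = utils.weight_hub_files(model_id, revision, ".bin")
        local_pt_files = utils.download_weights(pt_filenames, model_id, revision)
        local_st_files = [
            # lstrip strips *characters*, not a prefix; it happens to work
            # for names like "pytorch_model-00001-of-00072" (kept as in the diff).
            p.parent / f"{p.stem.lstrip('pytorch_')}.safetensors"
            for p in local_pt_files
        ]
        utils.convert_files(local_pt_files, local_st_files)
        return local_st_files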
torch.set_grad_enabled(False) def get_model( model_id: str, revision: Optional[str], sharded: bool, quantize: bool ) -> Model: + if "facebook/galactica" in model_id: + if sharded: + return GalacticaSharded(model_id, revision, quantize=quantize) + else: + return Galactica(model_id, revision, quantize=quantize) + + if "santacoder" in model_id: + return SantaCoder(model_id, revision, quantize) + config = AutoConfig.from_pretrained(model_id, revision=revision) if config.model_type == "bloom": @@ -48,27 +56,22 @@ def get_model( return BLOOMSharded(model_id, revision, quantize=quantize) else: return BLOOM(model_id, revision, quantize=quantize) - elif config.model_type == "gpt_neox": + + if config.model_type == "gpt_neox": if sharded: return GPTNeoxSharded(model_id, revision, quantize=quantize) else: - return GPTNeox(model_id, revision, quantize=quantize) - elif config.model_type == "t5": + return CausalLM(model_id, revision, quantize=quantize) + + if config.model_type == "t5": if sharded: return T5Sharded(model_id, revision, quantize=quantize) else: return Seq2SeqLM(model_id, revision, quantize=quantize) - elif model_id.startswith("facebook/galactica"): - if sharded: - return GalacticaSharded(model_id, revision, quantize=quantize) - else: - return Galactica(model_id, revision, quantize=quantize) - elif "santacoder" in model_id: - return SantaCoder(model_id, revision, quantize) - else: - if sharded: - raise ValueError("sharded is not supported for AutoModel") - try: - return CausalLM(model_id, revision, quantize=quantize) - except Exception: - return Seq2SeqLM(model_id, revision, quantize=quantize) + + if sharded: + raise ValueError("sharded is not supported for AutoModel") + try: + return CausalLM(model_id, revision, quantize=quantize) + except Exception: + return Seq2SeqLM(model_id, revision, quantize=quantize) diff --git a/server/text_generation/models/bloom.py b/server/text_generation_server/models/bloom.py similarity index 93% rename from server/text_generation/models/bloom.py rename to server/text_generation_server/models/bloom.py index 992d7b5b..0d83abe2 100644 --- a/server/text_generation/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -17,13 +17,12 @@ from transformers.models.bloom.parallel_layers import ( TensorParallelRowLinear, ) -from text_generation.models import CausalLM -from text_generation.models.causal_lm import CausalLMBatch -from text_generation.pb import generate_pb2 -from text_generation.utils import ( +from text_generation_server.models import CausalLM +from text_generation_server.models.causal_lm import CausalLMBatch +from text_generation_server.pb import generate_pb2 +from text_generation_server.utils import ( initialize_torch_distributed, weight_files, - download_weights, ) HAS_BITS_AND_BYTES = True @@ -59,9 +58,6 @@ class BLOOMSharded(BLOOM): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - if not model_id.startswith("bigscience/bloom"): - raise ValueError(f"Model {model_id} is not supported") - self.process_group, self.rank, self.world_size = initialize_torch_distributed() self.master = self.rank == 0 if torch.cuda.is_available(): @@ -80,14 +76,8 @@ class BLOOMSharded(BLOOM): ) config.pad_token_id = 3 - # Only download weights for small models - if self.master and model_id == "bigscience/bloom-560m": - download_weights(model_id, revision=revision, extension=".safetensors") - torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, 
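The reworked `get_model` above checks the pure string matches (`facebook/galactica`, `santacoder`) before touching `AutoConfig`, since they need no config at all, and flattens the old elif-chain into early returns. One behavioral change worth noting: unsharded `gpt_neox` models now load through plain `CausalLM`, and the dedicated `GPTNeox` class is gone. The resulting dispatch order, summarized:

# Dispatch order after this change (sharded variant when sharded=True):
#   1. "facebook/galactica" in model_id  -> Galactica / GalacticaSharded
#   2. "santacoder" in model_id          -> SantaCoder
#   3. config.model_type == "bloom"      -> BLOOM / BLOOMSharded
#   4. config.model_type == "gpt_neox"   -> CausalLM / GPTNeoxSharded
#   5. config.model_type == "t5"         -> Seq2SeqLM / T5Sharded
#   6. anything else                     -> CausalLM, then Seq2SeqLM
#      (sharded=True raises ValueError for the fallback case)
from text_generation_server.models import get_model

model = get_model("bigscience/bloom-560m", revision=None, sharded=False, quantize=False)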
extension=".safetensors") - if not filenames: - raise ValueError("No safetensors weights found") with init_empty_weights(): model = AutoModelForCausalLM.from_config(config) diff --git a/server/text_generation/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py similarity index 80% rename from server/text_generation/models/causal_lm.py rename to server/text_generation_server/models/causal_lm.py index 1ac073b3..88ea6c75 100644 --- a/server/text_generation/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -5,10 +5,15 @@ from opentelemetry import trace from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase from typing import Optional, Tuple, List, Type -from text_generation.models import Model -from text_generation.models.types import Batch, PrefillTokens, Generation, GeneratedText -from text_generation.pb import generate_pb2 -from text_generation.utils import NextTokenChooser, StoppingCriteria, Sampling +from text_generation_server.models import Model +from text_generation_server.models.types import ( + Batch, + PrefillTokens, + Generation, + GeneratedText, +) +from text_generation_server.pb import generate_pb2 +from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling tracer = trace.get_tracer(__name__) @@ -36,7 +41,8 @@ class CausalLMBatch(Batch): # Metadata used for padding size: int - max_sequence_length: int + max_input_length: int + padding_right_offset: int # Past metadata keys_head_dim_last: bool = True @@ -61,22 +67,36 @@ class CausalLMBatch(Batch): input_lengths = [] # Parse batch + padding_right_offset = 0 for r in pb.requests: inputs.append(r.inputs) - input_lengths.append(r.input_length) next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device)) - stopping_criterias.append( - StoppingCriteria.from_pb(r.stopping_parameters, tokenizer) + stopping_criteria = StoppingCriteria.from_pb( + r.stopping_parameters, tokenizer + ) + stopping_criterias.append(stopping_criteria) + padding_right_offset = max( + padding_right_offset, stopping_criteria.max_new_tokens ) - pad_to_multiple_of = 8 if device.type == "cuda" else None tokenized_inputs = tokenizer( inputs, return_tensors="pt", padding=True, - pad_to_multiple_of=pad_to_multiple_of, return_token_type_ids=False, ).to(device) + + input_lengths = tokenized_inputs["attention_mask"].sum(1) + max_input_length = input_lengths.max() + + input_ids = tokenized_inputs["input_ids"] + # Allocate maximum attention_mask + attention_mask = input_ids.new_zeros( + (pb.size, max_input_length + padding_right_offset) + ) + # Copy tokenizer attention_mask into fully allocated attention_mask + attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"] + position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1 position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1) all_input_ids = tokenized_inputs["input_ids"].unsqueeze(-1) @@ -84,24 +104,30 @@ class CausalLMBatch(Batch): return cls( batch_id=pb.id, requests=pb.requests, - input_ids=tokenized_inputs["input_ids"], - attention_mask=tokenized_inputs["attention_mask"], + input_ids=input_ids, + attention_mask=attention_mask, position_ids=position_ids, past_key_values=None, all_input_ids=all_input_ids, - input_lengths=input_lengths, + input_lengths=input_lengths.tolist(), next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, size=pb.size, - max_sequence_length=max(input_lengths), + max_input_length=max_input_length.item(), + 
padding_right_offset=padding_right_offset, ) @classmethod @tracer.start_as_current_span("concatenate") def concatenate(cls, batches: List["CausalLMBatch"]) -> "CausalLMBatch": # Used for padding - total_batch_size = sum(batch.size for batch in batches) - max_sequence_length = max(batch.max_sequence_length for batch in batches) + total_batch_size = 0 + max_input_length = 0 + padding_right_offset = 0 + for batch in batches: + total_batch_size += batch.size + max_input_length = max(max_input_length, batch.max_input_length) + padding_right_offset = max(padding_right_offset, batch.padding_right_offset) # Batch attributes requests = [] @@ -144,13 +170,24 @@ class CausalLMBatch(Batch): # Create padded tensor if attention_mask is None: attention_mask = batch.attention_mask.new_zeros( - (total_batch_size, max_sequence_length), + (total_batch_size, max_input_length + padding_right_offset), ) # We need to slice the attention mask to remove padding from previous steps + # and to remove unused allocated space + left_offset = max_input_length - batch.max_input_length + batch_left_offset = ( + batch.attention_mask.shape[1] + - batch.max_input_length + - batch.padding_right_offset + ) attention_mask[ - start_index:end_index, -batch.max_sequence_length : - ] = batch.attention_mask[:, -batch.max_sequence_length :] + start_index:end_index, + left_offset:-padding_right_offset, + ] = batch.attention_mask[ + :, + batch_left_offset : -batch.padding_right_offset, + ] # Create empty tensor # position_ids is always of shape [batch_size, 1] @@ -172,7 +209,7 @@ class CausalLMBatch(Batch): padded_past_values_shape = ( total_batch_size, num_heads, - max_sequence_length - 1, + max_input_length - 1, head_dim, ) @@ -184,7 +221,7 @@ class CausalLMBatch(Batch): total_batch_size, num_heads, head_dim, - max_sequence_length - 1, + max_input_length - 1, ) # This will run only once per layer @@ -198,20 +235,20 @@ class CausalLMBatch(Batch): past_key_values[j][0][ start_index:end_index, :, - -(batch.max_sequence_length - 1) :, + -(batch.max_input_length - 1) :, :, - ] = past_keys[:, :, -(batch.max_sequence_length - 1) :, :] + ] = past_keys[:, :, -(batch.max_input_length - 1) :, :] else: past_key_values[j][0][ start_index:end_index, :, :, - -(batch.max_sequence_length - 1) :, - ] = past_keys[:, :, :, -(batch.max_sequence_length - 1) :] + -(batch.max_input_length - 1) :, + ] = past_keys[:, :, :, -(batch.max_input_length - 1) :] past_key_values[j][1][ - start_index:end_index, :, -(batch.max_sequence_length - 1) :, : - ] = past_values[:, :, -(batch.max_sequence_length - 1) :, :] + start_index:end_index, :, -(batch.max_input_length - 1) :, : + ] = past_values[:, :, -(batch.max_input_length - 1) :, :] start_index += batch.size @@ -227,7 +264,8 @@ class CausalLMBatch(Batch): next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, size=total_batch_size, - max_sequence_length=max_sequence_length, + max_input_length=max_input_length, + padding_right_offset=padding_right_offset, keys_head_dim_last=batches[0].keys_head_dim_last, ) @@ -294,9 +332,12 @@ class CausalLM(Model): def generate_token( self, batch: CausalLMBatch ) -> Tuple[List[Generation], Optional[CausalLMBatch]]: + # slice the attention mask to the correct shape + attention_mask = batch.attention_mask[:, : -batch.padding_right_offset] + logits, past = self.forward( batch.input_ids, - batch.attention_mask, + attention_mask, batch.position_ids, batch.past_key_values, ) @@ -311,7 +352,7 @@ class CausalLM(Model): # Metadata next_batch_size = 0 - 
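# Worked example (toy numbers, variable names mine) of the offset arithmetic in
# `concatenate` above. A merged mask has max_input_length + padding_right_offset
# columns; each source batch contributes a window of batch.max_input_length live
# columns, so both destination and source are trimmed before the copy.
max_input_length = 5            # max over all batches being merged
padding_right_offset = 3        # max over all batches
batch_max_input_length = 3      # one incoming batch
batch_padding_right_offset = 2
batch_mask_width = 7            # columns that batch allocated originally

left_offset = max_input_length - batch_max_input_length                     # 2
batch_left_offset = (
    batch_mask_width - batch_max_input_length - batch_padding_right_offset  # 2
)
# Destination slice [2:-3] and source slice [2:-2] are both 3 columns wide:
dest_width = (max_input_length + padding_right_offset - padding_right_offset) - left_offset
src_width = (batch_mask_width - batch_padding_right_offset) - batch_left_offset
assert dest_width == src_width == batch_max_input_length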
next_batch_max_sequence_length = 0 + next_batch_max_input_length = 0 # Results generations: List[Generation] = [] @@ -347,10 +388,8 @@ class CausalLM(Model): # Generated token next_token_logprob = logprobs[-1, next_token_id] next_token_id_squeezed = next_token_id.squeeze() - next_token_text = self.tokenizer.decode( + next_token_text = self.decode_token( next_token_id_squeezed, - clean_up_tokenization_spaces=False, - skip_special_tokens=False, ) # Evaluate stopping criteria @@ -381,8 +420,8 @@ class CausalLM(Model): next_batch_all_input_ids.append(all_input_ids) next_batch_size += 1 next_batch_input_lengths.append(new_input_length) - next_batch_max_sequence_length = max( - next_batch_max_sequence_length, new_input_length + next_batch_max_input_length = max( + next_batch_max_input_length, new_input_length ) # Prefill @@ -409,6 +448,7 @@ class CausalLM(Model): next_token_id_squeezed, next_token_logprob, next_token_text, + next_token_id_squeezed.item() in self.all_special_ids, generated_text, ) @@ -448,14 +488,8 @@ class CausalLM(Model): next_batch_next_token_choosers = batch.next_token_choosers next_batch_stopping_criterias = batch.stopping_criterias - # Update attention_mask with padding as we added a new token to input_ids - next_batch_attention_mask = torch.cat( - [ - next_batch_attention_mask, - next_batch_attention_mask.new_ones(next_batch_size, 1), - ], - dim=1, - ) + # Update attention_mask as we added a new token to input_ids + next_batch_attention_mask[:, -batch.padding_right_offset] = 1 # Update position_ids next_batch_position_ids = next_batch_position_ids[:, -1:] + 1 @@ -472,7 +506,8 @@ class CausalLM(Model): next_token_choosers=next_batch_next_token_choosers, stopping_criterias=next_batch_stopping_criterias, size=next_batch_size, - max_sequence_length=next_batch_max_sequence_length, + max_input_length=next_batch_max_input_length, + padding_right_offset=batch.padding_right_offset - 1, keys_head_dim_last=batch.keys_head_dim_last, ) return generations, next_batch diff --git a/server/text_generation/models/galactica.py b/server/text_generation_server/models/galactica.py similarity index 87% rename from server/text_generation/models/galactica.py rename to server/text_generation_server/models/galactica.py index f1dc8a30..a90a299e 100644 --- a/server/text_generation/models/galactica.py +++ b/server/text_generation_server/models/galactica.py @@ -2,7 +2,7 @@ import re import torch import torch.distributed -from typing import List, Optional, Type +from typing import List, Optional, Type, Tuple from accelerate import init_empty_weights from safetensors import safe_open @@ -18,15 +18,14 @@ from transformers.models.opt.parallel_layers import ( TensorParallelRowLinear, ) -from text_generation.models import CausalLM -from text_generation.pb import generate_pb2 -from text_generation.models.causal_lm import CausalLMBatch -from text_generation.utils import ( +from text_generation_server.models import CausalLM +from text_generation_server.pb import generate_pb2 +from text_generation_server.models.causal_lm import CausalLMBatch +from text_generation_server.utils import ( NextTokenChooser, StoppingCriteria, initialize_torch_distributed, weight_files, - download_weights, ) HAS_BITS_AND_BYTES = True @@ -97,24 +96,37 @@ class GalacticaCausalLMBatch(CausalLMBatch): input_lengths = [] # Parse batch + max_sequence_length = 0 + padding_right_offset = 0 for r in pb.requests: # Add escape_custom_split_sequence to the CausalLMBatch logic inputs.append(escape_custom_split_sequence(r.inputs)) 
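# Illustration (toy shapes, assuming the left padding that the mask copy above
# implies) of the mask lifecycle this change introduces: allocate once with room
# for every future token, slice before each forward pass, then flip a single
# zero in place instead of torch.cat-ing a new column on every step.
import torch

max_input_length = 3       # longest prompt in the batch
padding_right_offset = 2   # max(max_new_tokens) over the batch

attention_mask = torch.zeros(1, max_input_length + padding_right_offset, dtype=torch.long)
attention_mask[:, 1:max_input_length] = 1     # a length-2 prompt, left-padded to 3
print(attention_mask)                         # tensor([[0, 1, 1, 0, 0]])

step_mask = attention_mask[:, :-padding_right_offset]  # what the forward pass sees
print(step_mask)                              # tensor([[0, 1, 1]])

attention_mask[:, -padding_right_offset] = 1  # mark the token generated this step
padding_right_offset -= 1                     # one reserved slot consumed
print(attention_mask)                         # tensor([[0, 1, 1, 1, 0]])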
input_lengths.append(r.input_length) next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device)) - stopping_criterias.append( - StoppingCriteria.from_pb(r.stopping_parameters, tokenizer + stopping_criteria = StoppingCriteria.from_pb( + r.stopping_parameters, tokenizer + ) + stopping_criterias.append(stopping_criteria) + max_sequence_length = max(max_sequence_length, r.input_length) + padding_right_offset = max( + padding_right_offset, stopping_criteria.max_new_tokens ) # Tokenize batch - pad_to_multiple_of = 8 if device.type == "cuda" else None tokenized_inputs = tokenizer( inputs, return_tensors="pt", padding=True, - pad_to_multiple_of=pad_to_multiple_of, return_token_type_ids=False, ).to(device) + input_ids = tokenized_inputs["input_ids"] + # Allocate maximum attention_mask + attention_mask = input_ids.new_zeros( + (pb.size, max_sequence_length + padding_right_offset) + ) + # Copy tokenizer attention_mask into fully allocated attention_mask + attention_mask[:, :max_sequence_length] = tokenized_inputs["attention_mask"] + position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1 position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1) all_input_ids = tokenized_inputs["input_ids"].unsqueeze(-1) @@ -122,8 +134,8 @@ class GalacticaCausalLMBatch(CausalLMBatch): return cls( batch_id=pb.id, requests=pb.requests, - input_ids=tokenized_inputs["input_ids"], - attention_mask=tokenized_inputs["attention_mask"], + input_ids=input_ids, + attention_mask=attention_mask, position_ids=position_ids, past_key_values=None, all_input_ids=all_input_ids, @@ -131,7 +143,8 @@ class GalacticaCausalLMBatch(CausalLMBatch): next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, size=pb.size, - max_sequence_length=max(input_lengths), + max_sequence_length=max_sequence_length, + padding_right_offset=padding_right_offset, ) @@ -146,14 +159,25 @@ class Galactica(CausalLM): generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False )
+    def forward(
+        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
+    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+        """Overwrite forward to ignore position_ids"""
+
+        # Model Forward
+        outputs = self.model.forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            use_cache=True,
+        )
+        return outputs.logits, outputs.past_key_values
+
 class GalacticaSharded(Galactica): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): - if not model_id.startswith("facebook/galactica"): - raise ValueError(f"Model {model_id} is not supported") - self.process_group, self.rank, self.world_size = initialize_torch_distributed() self.master = self.rank == 0 if torch.cuda.is_available(): @@ -172,14 +196,8 @@ class GalacticaSharded(Galactica): ) tokenizer.pad_token_id = config.pad_token_id - # Only download weights for small models - if self.master and model_id == "facebook/galactica-125m": - download_weights(model_id, revision=revision, extension=".safetensors") - torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") - if not filenames: - raise ValueError("No safetensors weights found") with init_empty_weights(): model = AutoModelForCausalLM.from_config(config) @@ -329,7 +347,6 @@ class GalacticaSharded(Galactica): outputs = self.model.forward( input_ids=input_ids, attention_mask=attention_mask, - position_ids, past_key_values=past_key_values, use_cache=True, ) diff --git a/server/text_generation/models/gpt_neox.py b/server/text_generation_server/models/gpt_neox.py similarity index 90% rename from server/text_generation/models/gpt_neox.py rename to server/text_generation_server/models/gpt_neox.py index 2d467f4c..8fabefe3 100644 --- a/server/text_generation/models/gpt_neox.py +++ b/server/text_generation_server/models/gpt_neox.py @@ -1,7 +1,7 @@ import torch import torch.distributed -from typing import List, Optional, Tuple +from typing import List, Optional from accelerate import init_empty_weights from safetensors import safe_open @@ -16,11 +16,10 @@ from transformers.models.gpt_neox.parallel_layers import ( TensorParallelRowLinear, ) -from text_generation.models import CausalLM -from text_generation.utils import ( +from text_generation_server.models import CausalLM +from text_generation_server.utils import ( initialize_torch_distributed, weight_files, - download_weights, ) HAS_BITS_AND_BYTES = True @@ -31,23 +30,7 @@ except Exception as e: HAS_BITS_AND_BYTES = False -class GPTNeox(CausalLM): - def forward( - self, input_ids, attention_mask, position_ids, past_key_values: Optional = None - ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]: - """Overwrite forward to ignore position_ids""" - - # Model Forward - outputs = self.model.forward( - input_ids=input_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - use_cache=True, - ) - return outputs.logits, outputs.past_key_values - - -class GPTNeoxSharded(GPTNeox): +class GPTNeoxSharded(CausalLM): def __init__( self, model_id: str, revision: Optional[str] = None, quantize: bool = False ): @@ -69,14 +52,8 @@ class GPTNeoxSharded(GPTNeox): model_id, revision=revision, tp_parallel=True ) - # Only master download weights - if self.master: - download_weights(model_id, revision=revision, extension=".safetensors") - torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") - if not filenames: - raise ValueError("No safetensors weights found") with init_empty_weights(): model = AutoModelForCausalLM.from_config(config) @@ -231,6 +208,7 @@ class GPTNeoxSharded(GPTNeox): outputs = self.model.forward( input_ids=input_ids, attention_mask=attention_mask, + position_ids=position_ids, past_key_values=past_key_values, use_cache=True, ) diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py new file mode 100644 index 00000000..e0ce6686 --- /dev/null +++ b/server/text_generation_server/models/model.py @@ -0,0 +1,43 @@
+import torch
+
+from abc import ABC, abstractmethod
+from typing import List, Tuple, Optional, TypeVar, Type
+from transformers import PreTrainedTokenizerBase
+
+from text_generation_server.models.types import Batch, GeneratedText
+
+B = TypeVar("B", bound=Batch)
+
+
+class Model(ABC):
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, device: torch.device):
+        self.tokenizer = tokenizer
+        self.all_special_ids = set(tokenizer.all_special_ids)
+        self.device = device
+
+        # see `decode_token` method
+        self.tokenizer.add_special_tokens(
+            {"additional_special_tokens": ["<decode-token>"]}
+        )
+        self.special_decode_token_id = self.tokenizer.convert_tokens_to_ids(
+            "<decode-token>"
+        )
+        self.special_decode_token_length = len("<decode-token>")
+
+    @property
+    @abstractmethod
+    def batch_type(self) -> Type[B]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def generate_token(self, batch: B) -> Tuple[List[GeneratedText], Optional[B]]:
+        raise NotImplementedError
+
+    def decode_token(self, token_id: int) -> str:
+        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
+        # append token to special decode token and decode both
+        result = self.tokenizer.decode(
+            [self.special_decode_token_id, token_id], skip_special_tokens=False
+        )
+        # slice to remove special decode token
+        return result[self.special_decode_token_length :] diff --git a/server/text_generation/models/santacoder.py b/server/text_generation_server/models/santacoder.py similarity index 95% rename from server/text_generation/models/santacoder.py rename to server/text_generation_server/models/santacoder.py index fb496197..fe15cde0 100644 --- a/server/text_generation/models/santacoder.py +++ b/server/text_generation_server/models/santacoder.py @@ -1,10 +1,10 @@ import torch import torch.distributed -from typing import Optional, List, Tuple +from typing import Optional, List from transformers import AutoTokenizer, AutoModelForCausalLM -from text_generation.models import CausalLM +from text_generation_server.models import CausalLM FIM_PREFIX = "<fim-prefix>" FIM_MIDDLE = "<fim-middle>" diff --git a/server/text_generation/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py similarity index 82% rename from server/text_generation/models/seq2seq_lm.py rename to server/text_generation_server/models/seq2seq_lm.py index 2f28c4ce..0fe5c03f 100644 --- a/server/text_generation/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -5,10 +5,15 @@ from opentelemetry import trace from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PreTrainedTokenizerBase from typing import Optional, Tuple, List, Type -from text_generation.models import Model -from text_generation.models.types import GeneratedText, Batch, Generation, PrefillTokens -from text_generation.pb import generate_pb2 -from text_generation.utils import NextTokenChooser, StoppingCriteria, Sampling +from text_generation_server.models import Model +from text_generation_server.models.types import ( + GeneratedText, + Batch, + Generation, + PrefillTokens, +) +from text_generation_server.pb import generate_pb2 +from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling tracer = trace.get_tracer(__name__) @@ -42,9 +47,10 @@ class Seq2SeqLMBatch(Batch): size: int max_input_length: int max_decoder_input_length: int + padding_right_offset: int def to_pb(self) -> generate_pb2.Batch: - """Convert a Seq2SeqLMBatch to a text_generation.v1.Batch protobuf""" + """Convert a Seq2SeqLMBatch to a text_generation_server.v1.Batch protobuf""" return generate_pb2.Batch( id=self.batch_id, requests=self.requests, @@ -58,36 +64,41 @@ tokenizer: PreTrainedTokenizerBase, device: torch.device, ) -> "Seq2SeqLMBatch": - """Convert a text_generation.v1.Batch protobuf to a Seq2SeqLMBatch""" + """Convert a text_generation_server.v1.Batch protobuf to a Seq2SeqLMBatch""" inputs = [] next_token_choosers = [] stopping_criterias = [] - input_lengths = [] decoder_input_ids = [] decoder_input_lengths = [] # Parse batch + padding_right_offset = 0 for r in pb.requests: inputs.append(r.inputs) - input_lengths.append(r.input_length) # Decoder sequence only contains the bos_token decoder_input_ids.append(tokenizer.bos_token_id) decoder_input_lengths.append(1) next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device)) - stopping_criterias.append( - StoppingCriteria.from_pb(r.stopping_parameters, tokenizer +
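# Why `decode_token` decodes behind a sentinel (sketch; any Hub tokenizer works,
# gpt2 is only an example): decoding a single id in isolation can drop the
# leading-space information that only materialises when the id follows other
# text, which corrupts token-by-token streaming output.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"additional_special_tokens": ["<decode-token>"]})
sentinel_id = tokenizer.convert_tokens_to_ids("<decode-token>")
sentinel_len = len("<decode-token>")

token_id = tokenizer(" world")["input_ids"][0]
alone = tokenizer.decode([token_id])
behind = tokenizer.decode([sentinel_id, token_id], skip_special_tokens=False)[sentinel_len:]
print(repr(alone), repr(behind))  # identical for GPT-2; SentencePiece models diverge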
stopping_criteria = StoppingCriteria.from_pb( + r.stopping_parameters, tokenizer + ) + stopping_criterias.append(stopping_criteria) + padding_right_offset = max( + padding_right_offset, stopping_criteria.max_new_tokens ) # Tokenize batch - pad_to_multiple_of = 8 if device.type == "cuda" else None tokenized_inputs = tokenizer( inputs, return_tensors="pt", padding=True, - pad_to_multiple_of=pad_to_multiple_of, return_token_type_ids=False, ).to(device) + + input_lengths = tokenized_inputs["attention_mask"].sum(1) + max_input_length = input_lengths.max() + # Convert decoder_input_ids to torch tensor of size [batch_size, 1] decoder_input_ids = torch.tensor(decoder_input_ids, device=device).unsqueeze(-1) @@ -100,13 +111,14 @@ class Seq2SeqLMBatch(Batch): decoder_attention_mask=None, encoder_last_hidden_state=None, past_key_values=None, - input_lengths=input_lengths, + input_lengths=input_lengths.tolist(), decoder_input_lengths=decoder_input_lengths, next_token_choosers=next_token_choosers, stopping_criterias=stopping_criterias, size=len(pb.requests), - max_input_length=max(input_lengths), + max_input_length=max_input_length.item(), max_decoder_input_length=1, + padding_right_offset=padding_right_offset, ) @classmethod @@ -115,11 +127,17 @@ class Seq2SeqLMBatch(Batch): """Concatenate multiple batches together by padding internal torch tensors""" # Used for padding - total_batch_size = sum(batch.size for batch in batches) - max_input_length = max(batch.max_input_length for batch in batches) - max_decoder_input_length = max( - batch.max_decoder_input_length for batch in batches - ) + total_batch_size = 0 + max_input_length = 0 + max_decoder_input_length = 0 + padding_right_offset = 0 + for batch in batches: + total_batch_size += batch.size + max_input_length = max(max_input_length, batch.max_input_length) + max_decoder_input_length = max( + max_decoder_input_length, batch.max_decoder_input_length + ) + padding_right_offset = max(padding_right_offset, batch.padding_right_offset) # Batch attributes requests = [] @@ -129,7 +147,6 @@ class Seq2SeqLMBatch(Batch): stopping_criterias = [] # Batch tensors - input_ids = None attention_mask = None decoder_input_ids = None decoder_attention_mask = None @@ -155,16 +172,6 @@ class Seq2SeqLMBatch(Batch): if batch.encoder_last_hidden_state is None: raise ValueError("Batch encoder_last_hidden_state cannot be None") - # Create padded tensor - if input_ids is None: - input_ids = batch.input_ids.new_zeros( - (total_batch_size, max_input_length), - ) - # Copy to correct indices - input_ids[ - start_index:end_index, -batch.max_input_length : - ] = batch.input_ids[:, -batch.max_input_length :] - # Create padded tensor if attention_mask is None: attention_mask = batch.attention_mask.new_zeros( @@ -189,19 +196,30 @@ class Seq2SeqLMBatch(Batch): if decoder_attention_mask is None: # As decoder_attention_mask might not exist, we use `batch.attention_mask` for device here decoder_attention_mask = batch.attention_mask.new_zeros( - (total_batch_size, max_decoder_input_length), + (total_batch_size, max_decoder_input_length + padding_right_offset), ) # If the decoder mask does not exist yet, all generations started at the same time and we never concatenated # this batch. All generations are of length `batch.max_decoder_input_length`. 
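# Small sketch (values invented) of the metadata this `from_pb` now derives:
# encoder lengths come from the tokenizer's attention mask rather than the
# request's input_length field, and the decoder side starts from one bos token.
import torch

tokenizer_attention_mask = torch.tensor([[1, 1, 1], [0, 1, 1]])  # two prompts
input_lengths = tokenizer_attention_mask.sum(1)                  # tensor([3, 2])
max_input_length = input_lengths.max().item()                    # 3

padding_right_offset = max([20, 8])  # max_new_tokens of each request's criteria

bos_token_id = 0  # taken from the tokenizer in the real code
decoder_input_ids = torch.tensor([bos_token_id, bos_token_id]).unsqueeze(-1)  # [2, 1]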
+ left_offset = max_decoder_input_length - batch.max_decoder_input_length if batch.decoder_attention_mask is None: decoder_attention_mask[ - start_index:end_index, -batch.max_decoder_input_length : + start_index:end_index, + left_offset:-padding_right_offset, ] = 1 # If it exists, we need to index else: + batch_left_offset = ( + batch.decoder_attention_mask.shape[1] + - batch.max_decoder_input_length + - batch.padding_right_offset + ) decoder_attention_mask[ - start_index:end_index, -batch.max_decoder_input_length : - ] = batch.decoder_attention_mask[:, -batch.max_decoder_input_length :] + start_index:end_index, + left_offset:-padding_right_offset, + ] = batch.decoder_attention_mask[ + :, + batch_left_offset : -batch.padding_right_offset, + ] # Create padded tensor if encoder_last_hidden_state is None: @@ -273,7 +291,7 @@ class Seq2SeqLMBatch(Batch): return cls( batch_id=batches[0].batch_id, requests=requests, - input_ids=input_ids, + input_ids=None, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, @@ -286,6 +304,7 @@ class Seq2SeqLMBatch(Batch): size=total_batch_size, max_input_length=max_input_length, max_decoder_input_length=max_decoder_input_length, + padding_right_offset=padding_right_offset, ) def __len__(self): @@ -326,7 +345,9 @@ class Seq2SeqLM(Model): return Seq2SeqLMBatch def decode(self, decoder_ids: List[int]) -> str: - return self.tokenizer.decode(decoder_ids, skip_special_tokens=True) + return self.tokenizer.decode( + decoder_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) def forward( self, @@ -342,14 +363,6 @@ class Seq2SeqLM(Model): List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]], ]: # Model Forward - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1) - - # Wrap `encoder_last_hidden_state` because for some reason, Transformers does a `encoder_last_hidden_state[0]` - # internally... - if encoder_last_hidden_state is not None: - encoder_last_hidden_state = [encoder_last_hidden_state] - outputs = self.model.forward( input_ids=input_ids, attention_mask=attention_mask, @@ -369,12 +382,34 @@ class Seq2SeqLM(Model): def generate_token( self, batch: Seq2SeqLMBatch ) -> Tuple[List[Generation], Optional[Seq2SeqLMBatch]]: + if batch.decoder_attention_mask is not None: + # slice to the correct shape + decoder_attention_mask = batch.decoder_attention_mask[ + :, : -batch.padding_right_offset + ] + else: + decoder_attention_mask = None + + # check if first forward or not + if batch.past_key_values is not None: + # Only take the last token + decoder_input_ids = batch.decoder_input_ids[:, -1].unsqueeze(-1) + else: + decoder_input_ids = batch.decoder_input_ids + + # Wrap `encoder_last_hidden_state` because for some reason, Transformers does a `encoder_last_hidden_state[0]` + # internally... 
+ if batch.encoder_last_hidden_state is not None: + encoder_last_hidden_state = [batch.encoder_last_hidden_state] + else: + encoder_last_hidden_state = batch.encoder_last_hidden_state + logits, encoder_last_hidden_state, past = self.forward( batch.input_ids, batch.attention_mask, - batch.decoder_input_ids, - batch.decoder_attention_mask, - batch.encoder_last_hidden_state, + decoder_input_ids, + decoder_attention_mask, + encoder_last_hidden_state, batch.past_key_values, ) @@ -402,7 +437,6 @@ class Seq2SeqLM(Model): logits, batch.next_token_choosers, batch.stopping_criterias, - batch.input_ids, batch.decoder_input_ids, ) @@ -414,7 +448,6 @@ class Seq2SeqLM(Model): logits, next_token_chooser, stopping_criteria, - input_tokens, decoder_input_ids, ) in enumerate(iterator): # Select next token @@ -429,10 +462,8 @@ class Seq2SeqLM(Model): # Generated token next_token_logprob = logprobs[-1, next_token_id] next_token_id_squeezed = next_token_id.squeeze() - next_token_text = self.tokenizer.decode( + next_token_text = self.decode_token( next_token_id_squeezed, - clean_up_tokenization_spaces=False, - skip_special_tokens=False, ) # Evaluate stopping criteria @@ -469,14 +500,10 @@ class Seq2SeqLM(Model): # Prefill if stopping_criteria.current_tokens == 1: - prefill_token_ids = decoder_input_ids[-new_decoder_input_length:-1] - prefill_texts = self.tokenizer.batch_decode( - prefill_token_ids, - clean_up_tokenization_spaces=False, - skip_special_tokens=False, - ) prefill_tokens = PrefillTokens( - prefill_token_ids, [float("nan")], prefill_texts + [self.tokenizer.bos_token_id], + [float("nan")], + [self.tokenizer.bos_token], ) else: prefill_tokens = None @@ -487,6 +514,7 @@ class Seq2SeqLM(Model): next_token_id_squeezed, next_token_logprob, next_token_text, + next_token_id_squeezed.item() in self.all_special_ids, generated_text, ) @@ -500,10 +528,8 @@ class Seq2SeqLM(Model): # If we finished at least one generation, we need to evict the indices of the generations that finished # from the values of the next batch if len(next_batch_keep_indices) != len(batch): - # Apply indices to attention mask, past key values and other items that need to be cached - next_batch_input_ids = batch.input_ids[next_batch_keep_indices] + # Apply indices to decoder_attention mask, past key values and other items that need to be cached next_batch_attention_mask = batch.attention_mask[next_batch_keep_indices] - if batch.decoder_attention_mask is not None: next_batch_decoder_attention_mask = batch.decoder_attention_mask[ next_batch_keep_indices @@ -526,7 +552,6 @@ class Seq2SeqLM(Model): batch.stopping_criterias[i] for i in next_batch_keep_indices ] else: - next_batch_input_ids = batch.input_ids next_batch_attention_mask = batch.attention_mask next_batch_decoder_attention_mask = batch.decoder_attention_mask next_batch_encoder_last_hidden_state = encoder_last_hidden_state @@ -536,20 +561,14 @@ class Seq2SeqLM(Model): next_batch_next_token_choosers = batch.next_token_choosers next_batch_stopping_criterias = batch.stopping_criterias - # Update decoder_attention_mask with padding as we added a new token to input_ids + # Update decoder_attention_mask as we added a new token to input_ids if next_batch_decoder_attention_mask is not None: - next_batch_decoder_attention_mask = torch.cat( - [ - next_batch_decoder_attention_mask, - next_batch_decoder_attention_mask.new_ones(next_batch_size, 1), - ], - dim=1, - ) + next_batch_decoder_attention_mask[:, -batch.padding_right_offset] = 1 next_batch = Seq2SeqLMBatch( batch_id=batch.batch_id, 
requests=next_batch_requests, - input_ids=next_batch_input_ids, + input_ids=None, attention_mask=next_batch_attention_mask, decoder_input_ids=next_batch_decoder_input_ids, decoder_attention_mask=next_batch_decoder_attention_mask, @@ -562,5 +581,6 @@ class Seq2SeqLM(Model): size=next_batch_size, max_input_length=next_batch_max_input_length, max_decoder_input_length=next_batch_max_decoder_input_length, + padding_right_offset=batch.padding_right_offset - 1, ) return generations, next_batch diff --git a/server/text_generation/models/t5.py b/server/text_generation_server/models/t5.py similarity index 92% rename from server/text_generation/models/t5.py rename to server/text_generation_server/models/t5.py index d7241c81..cb4f7f22 100644 --- a/server/text_generation/models/t5.py +++ b/server/text_generation_server/models/t5.py @@ -16,11 +16,10 @@ from transformers.models.t5.parallel_layers import ( TensorParallelRowLinear, ) -from text_generation.models import Seq2SeqLM -from text_generation.utils import ( +from text_generation_server.models import Seq2SeqLM +from text_generation_server.utils import ( initialize_torch_distributed, weight_files, - download_weights, ) HAS_BITS_AND_BYTES = True @@ -53,14 +52,8 @@ class T5Sharded(Seq2SeqLM): ) tokenizer.bos_token_id = config.decoder_start_token_id - # Only master download weights - if self.master: - download_weights(model_id, revision=revision, extension=".safetensors") - torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") - if not filenames: - raise ValueError("No safetensors weights found") with init_empty_weights(): model = AutoModelForSeq2SeqLM.from_config(config) @@ -228,14 +221,6 @@ class T5Sharded(Seq2SeqLM): List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]], ]: # Model Forward - if past_key_values is not None: - decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1) - - # Wrap `encoder_last_hidden_state` because for some reason, Transformers does a `encoder_last_hidden_state[0]` - # internally... 
- if encoder_last_hidden_state is not None: - encoder_last_hidden_state = [encoder_last_hidden_state] - outputs = self.model.forward( input_ids=input_ids, attention_mask=attention_mask, diff --git a/server/text_generation/models/types.py b/server/text_generation_server/models/types.py similarity index 91% rename from server/text_generation/models/types.py rename to server/text_generation_server/models/types.py index d1117b80..93c3b9db 100644 --- a/server/text_generation/models/types.py +++ b/server/text_generation_server/models/types.py @@ -6,8 +6,8 @@ from typing import List, Optional from transformers import PreTrainedTokenizerBase -from text_generation.pb import generate_pb2 -from text_generation.pb.generate_pb2 import FinishReason +from text_generation_server.pb import generate_pb2 +from text_generation_server.pb.generate_pb2 import FinishReason class Batch(ABC): @@ -73,6 +73,7 @@ class Generation: token_id: int token_logprob: float token_text: str + token_is_special: bool generated_text: Optional[GeneratedText] def to_pb(self) -> generate_pb2.Generation: @@ -84,6 +85,7 @@ class Generation: token_id=self.token_id, token_logprob=self.token_logprob, token_text=self.token_text, + token_is_special=self.token_is_special, generated_text=self.generated_text.to_pb() if self.generated_text is not None else None, diff --git a/server/text_generation/pb/.gitignore b/server/text_generation_server/pb/.gitignore similarity index 100% rename from server/text_generation/pb/.gitignore rename to server/text_generation_server/pb/.gitignore diff --git a/server/text_generation/server.py b/server/text_generation_server/server.py similarity index 92% rename from server/text_generation/server.py rename to server/text_generation_server/server.py index f3129cb4..0b75c3c7 100644 --- a/server/text_generation/server.py +++ b/server/text_generation_server/server.py @@ -9,11 +9,11 @@ from grpc_reflection.v1alpha import reflection from pathlib import Path from typing import List, Optional -from text_generation.cache import Cache -from text_generation.interceptor import ExceptionInterceptor -from text_generation.models import Model, get_model -from text_generation.pb import generate_pb2_grpc, generate_pb2 -from text_generation.tracing import UDSOpenTelemetryAioServerInterceptor +from text_generation_server.cache import Cache +from text_generation_server.interceptor import ExceptionInterceptor +from text_generation_server.models import Model, get_model +from text_generation_server.pb import generate_pb2_grpc, generate_pb2 +from text_generation_server.tracing import UDSOpenTelemetryAioServerInterceptor class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): diff --git a/server/text_generation/tracing.py b/server/text_generation_server/tracing.py similarity index 100% rename from server/text_generation/tracing.py rename to server/text_generation_server/tracing.py diff --git a/server/text_generation_server/utils/__init__.py b/server/text_generation_server/utils/__init__.py new file mode 100644 index 00000000..50d64518 --- /dev/null +++ b/server/text_generation_server/utils/__init__.py @@ -0,0 +1,36 @@ +from text_generation_server.utils.convert import convert_file, convert_files +from text_generation_server.utils.dist import initialize_torch_distributed +from text_generation_server.utils.hub import ( + weight_files, + weight_hub_files, + download_weights, + EntryNotFoundError, + LocalEntryNotFoundError, + RevisionNotFoundError, +) +from text_generation_server.utils.tokens import ( + Greedy, + 
NextTokenChooser, + Sampling, + StoppingCriteria, + StopSequenceCriteria, + FinishReason, +) + +__all__ = [ + "convert_file", + "convert_files", + "initialize_torch_distributed", + "weight_files", + "weight_hub_files", + "download_weights", + "EntryNotFoundError", + "LocalEntryNotFoundError", + "RevisionNotFoundError", + "Greedy", + "NextTokenChooser", + "Sampling", + "StoppingCriteria", + "StopSequenceCriteria", + "FinishReason", +] diff --git a/server/text_generation_server/utils/convert.py b/server/text_generation_server/utils/convert.py new file mode 100644 index 00000000..437e2308 --- /dev/null +++ b/server/text_generation_server/utils/convert.py @@ -0,0 +1,94 @@
+import concurrent
+import time
+import torch
+
+from concurrent.futures import ThreadPoolExecutor
+from collections import defaultdict
+from datetime import timedelta
+from loguru import logger
+from pathlib import Path
+from safetensors.torch import load_file, save_file
+from typing import Dict, List
+
+
+def check_file_size(source_file: Path, target_file: Path):
+    """
+    Check that two files are close in size
+    """
+    source_file_size = source_file.stat().st_size
+    target_file_size = target_file.stat().st_size
+
+    if (source_file_size - target_file_size) / source_file_size > 0.01:
+        raise RuntimeError(
+            f"""The file size difference is more than 1%:
+         - {source_file}: {source_file_size}
+         - {target_file}: {target_file_size}
+         """
+        )
+
+
+def remove_shared_pointers(tensors: Dict[str, torch.Tensor]):
+    """
+    For a Dict of tensors, check if two or more tensors point to the same underlying memory and
+    keep only the first one
+    """
+    ptrs = defaultdict(list)
+    for k, v in tensors.items():
+        ptrs[v.data_ptr()].append(k)
+
+    # Iterate over all found memory addresses
+    for ptr, names in ptrs.items():
+        if len(names) > 1:
+            # Multiple tensors point to the same memory
+            # Only keep the first tensor
+            for name in names[1:]:
+                tensors.pop(name)
+
+
+def convert_file(pt_file: Path, st_file: Path):
+    """
+    Convert a pytorch file to a safetensors file
+    """
+    logger.info(f"Convert {pt_file} to {st_file}.")
+
+    pt_state = torch.load(pt_file, map_location="cpu")
+    if "state_dict" in pt_state:
+        pt_state = pt_state["state_dict"]
+
+    remove_shared_pointers(pt_state)
+
+    # Tensors need to be contiguous
+    pt_state = {k: v.contiguous() for k, v in pt_state.items()}
+
+    st_file.parent.mkdir(parents=True, exist_ok=True)
+    save_file(pt_state, str(st_file), metadata={"format": "pt"})
+
+    # Check that both files are close in size
+    check_file_size(pt_file, st_file)
+
+    # Load safetensors state
+    st_state = load_file(str(st_file))
+    for k in st_state:
+        pt_tensor = pt_state[k]
+        st_tensor = st_state[k]
+        if not torch.equal(pt_tensor, st_tensor):
+            raise RuntimeError(f"The output tensors do not match for key {k}")
+
+
+def convert_files(pt_files: List[Path], st_files: List[Path]):
+    assert len(pt_files) == len(st_files)
+
+    executor = ThreadPoolExecutor(max_workers=5)
+    futures = [
+        executor.submit(convert_file, pt_file=pt_file, st_file=st_file)
+        for pt_file, st_file in zip(pt_files, st_files)
+    ]
+
+    # We do this instead of using tqdm because we want to parse the logs with the launcher
+    start_time = time.time()
+    for i, future in enumerate(concurrent.futures.as_completed(futures)):
+        elapsed = timedelta(seconds=int(time.time() - start_time))
+        remaining = len(futures) - (i + 1)
+        eta = (elapsed / (i + 1)) * remaining if remaining > 0 else 0
+
+        logger.info(f"Convert: [{i + 1}/{len(futures)}] -- ETA: {eta}")
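# A self-contained demonstration (toy tensors) of why `remove_shared_pointers`
# must run before `save_file`: tied weights (e.g. input and output embeddings)
# share one storage, which safetensors refuses to serialize twice.
import torch
from collections import defaultdict

weight = torch.randn(4, 4)
state = {"embed.weight": weight, "lm_head.weight": weight}  # same data_ptr

ptrs = defaultdict(list)
for name, tensor in state.items():
    ptrs[tensor.data_ptr()].append(name)
for names in ptrs.values():
    for name in names[1:]:  # keep the first alias, drop the rest
        state.pop(name)

print(sorted(state))  # ['embed.weight']; transformers re-ties the weights on load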
diff --git a/server/text_generation_server/utils/dist.py b/server/text_generation_server/utils/dist.py new file mode 100644 index 00000000..9785493e --- /dev/null +++ b/server/text_generation_server/utils/dist.py @@ -0,0 +1,35 @@
+import os
+import torch
+
+from datetime import timedelta
+
+
+def initialize_torch_distributed():
+    rank = int(os.getenv("RANK", "0"))
+    world_size = int(os.getenv("WORLD_SIZE", "1"))
+
+    if torch.cuda.is_available():
+        from torch.distributed import ProcessGroupNCCL
+
+        # Set the device id.
+        assert world_size <= torch.cuda.device_count(), "Each process is one gpu"
+        device = rank % torch.cuda.device_count()
+        torch.cuda.set_device(device)
+        backend = "nccl"
+        options = ProcessGroupNCCL.Options()
+        options.is_high_priority_stream = True
+        options._timeout = timedelta(seconds=60)
+    else:
+        backend = "gloo"
+        options = None
+
+    # Call the init process.
+    torch.distributed.init_process_group(
+        backend=backend,
+        world_size=world_size,
+        rank=rank,
+        timeout=timedelta(seconds=60),
+        pg_options=options,
+    )
+
+    return torch.distributed.group.WORLD, rank, world_size diff --git a/server/text_generation_server/utils/hub.py b/server/text_generation_server/utils/hub.py new file mode 100644 index 00000000..d338fb29 --- /dev/null +++ b/server/text_generation_server/utils/hub.py @@ -0,0 +1,165 @@
+import time
+import os
+
+from datetime import timedelta
+from loguru import logger
+from pathlib import Path
+from typing import Optional, List
+
+from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from huggingface_hub.utils import (
+    LocalEntryNotFoundError,
+    EntryNotFoundError,
+    RevisionNotFoundError,  # Import here to ease try/except in other parts of the lib
+)
+
+WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None)
+
+
+def weight_hub_files(
+    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
+) -> List[str]:
+    """Get the weights filenames on the hub"""
+    api = HfApi()
+    info = api.model_info(model_id, revision=revision)
+    filenames = [s.rfilename for s in info.siblings if s.rfilename.endswith(extension)]
+
+    if not filenames:
+        raise EntryNotFoundError(
+            f"No {extension} weights found for model {model_id} and revision {revision}.",
+            None,
+        )
+
+    return filenames
+
+
+def try_to_load_from_cache(
+    model_id: str, revision: Optional[str], filename: str
+) -> Optional[Path]:
+    """Try to load a file from the Hugging Face cache"""
+    if revision is None:
+        revision = "main"
+
+    object_id = model_id.replace("/", "--")
+    repo_cache = Path(HUGGINGFACE_HUB_CACHE) / f"models--{object_id}"
+
+    if not repo_cache.is_dir():
+        # No cache for this model
+        return None
+
+    refs_dir = repo_cache / "refs"
+    snapshots_dir = repo_cache / "snapshots"
+    no_exist_dir = repo_cache / ".no_exist"
+
+    # Resolve refs (for instance to convert main to the associated commit sha)
+    if refs_dir.is_dir():
+        revision_file = refs_dir / revision
+        if revision_file.exists():
+            with revision_file.open() as f:
+                revision = f.read()
+
+    # Check if file is cached as "no_exist"
+    if (no_exist_dir / revision / filename).is_file():
+        return None
+
+    # Check if revision folder exists
+    if not snapshots_dir.exists():
+        return None
+    cached_shas = os.listdir(snapshots_dir)
+    if revision not in cached_shas:
+        # No cache for this revision and we won't try to return a random revision
+        return None
+
+    # Check if file exists in cache
+    cached_file = snapshots_dir / revision / filename
+    return cached_file if cached_file.is_file() else None
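# For orientation (paths illustrative): the Hub cache layout that
# `try_to_load_from_cache` walks, entirely without network access:
#
#   <HUGGINGFACE_HUB_CACHE>/models--bigscience--bloom-560m/
#       refs/main                      <- text file holding the commit sha
#       snapshots/<sha>/model.safetensors
#       .no_exist/<sha>/<filename>     <- negative cache: file known to be absent
#
# A ref is resolved to a sha first, the negative cache short-circuits to None,
# and only then is snapshots/<sha>/<filename> checked.
cached = try_to_load_from_cache(
    "bigscience/bloom-560m", revision="main", filename="model.safetensors"
)
print(cached)  # Path inside snapshots/<sha>/ on a hit, None on a miss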
+
+
+def weight_files(
+    model_id: str, revision: Optional[str] = None, extension: str = ".safetensors"
+) -> List[Path]:
+    """Get the local files"""
+    # Local model
+    if Path(model_id).exists() and Path(model_id).is_dir():
+        return list(Path(model_id).glob(f"*{extension}"))
+
+    try:
+        filenames = weight_hub_files(model_id, revision, extension)
+    except EntryNotFoundError as e:
+        if extension != ".safetensors":
+            raise e
+        # Try to see if there are pytorch weights
+        pt_filenames = weight_hub_files(model_id, revision, extension=".bin")
+        # Change pytorch extension to safetensors extension
+        # It is possible that we have safetensors weights locally even though they are not on the
+        # hub if we converted weights locally without pushing them
+        filenames = [
+            f"{Path(f).stem.lstrip('pytorch_')}.safetensors" for f in pt_filenames
+        ]
+
+    if WEIGHTS_CACHE_OVERRIDE is not None:
+        files = []
+        for filename in filenames:
+            p = Path(WEIGHTS_CACHE_OVERRIDE) / filename
+            if not p.exists():
+                raise LocalEntryNotFoundError(
+                    f"File {p} not found in {WEIGHTS_CACHE_OVERRIDE}."
+                )
+            files.append(p)
+        return files
+
+    files = []
+    for filename in filenames:
+        cache_file = try_to_load_from_cache(
+            model_id, revision=revision, filename=filename
+        )
+        if cache_file is None:
+            raise LocalEntryNotFoundError(
+                f"File {filename} of model {model_id} not found in "
+                f"{os.getenv('HUGGINGFACE_HUB_CACHE', 'the local cache')}. "
+                f"Please run `text-generation-server download-weights {model_id}` first."
+            )
+        files.append(cache_file)
+
+    return files
+
+
+def download_weights(
+    filenames: List[str], model_id: str, revision: Optional[str] = None
+) -> List[Path]:
+    """Download the safetensors files from the hub"""
+
+    def download_file(filename):
+        local_file = try_to_load_from_cache(model_id, revision, filename)
+        if local_file is not None:
+            logger.info(f"File {filename} already present in cache.")
+            return Path(local_file)
+
+        logger.info(f"Download file: {filename}")
+        start_time = time.time()
+        local_file = hf_hub_download(
+            filename=filename,
+            repo_id=model_id,
+            revision=revision,
+            local_files_only=False,
+        )
+        logger.info(
+            f"Downloaded {local_file} in {timedelta(seconds=int(time.time() - start_time))}."
+ ) + return Path(local_file) + + # We do this instead of using tqdm because we want to parse the logs with the launcher + start_time = time.time() + files = [] + for i, filename in enumerate(filenames): + file = download_file(filename) + + elapsed = timedelta(seconds=int(time.time() - start_time)) + remaining = len(filenames) - (i + 1) + eta = (elapsed / (i + 1)) * remaining if remaining > 0 else 0 + + logger.info(f"Download: [{i + 1}/{len(filenames)}] -- ETA: {eta}") + files.append(file) + + return files diff --git a/server/text_generation_server/utils/tokens.py b/server/text_generation_server/utils/tokens.py new file mode 100644 index 00000000..c7594644 --- /dev/null +++ b/server/text_generation_server/utils/tokens.py @@ -0,0 +1,160 @@ +import re +import torch + +from transformers import ( + LogitsProcessorList, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, + TypicalLogitsWarper, + RepetitionPenaltyLogitsProcessor, + PreTrainedTokenizerBase, +) +from typing import List, Tuple, Optional + +from text_generation_server.pb import generate_pb2 +from text_generation_server.pb.generate_pb2 import FinishReason +from text_generation_server.utils.watermark import WatermarkLogitsProcessor + + +class Sampling: + def __init__(self, seed: int, device: str = "cpu"): + self.generator = torch.Generator(device) + self.generator.manual_seed(seed) + self.seed = seed + + def __call__(self, logits): + probs = torch.nn.functional.softmax(logits) + next_tokens = torch.multinomial(probs, num_samples=1, generator=self.generator) + return next_tokens + + +class Greedy: + def __call__(self, logits): + return logits.argmax() + + +class NextTokenChooser: + def __init__( + self, + watermark=False, + temperature=1.0, + repetition_penalty=1.0, + top_k=None, + top_p=None, + typical_p=None, + do_sample=False, + seed=0, + device="cpu", + ): + warpers = LogitsProcessorList() + # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files + # all samplers can be found in `generation_utils_samplers.py` + sampling = do_sample + + if watermark: + warpers.append(WatermarkLogitsProcessor(device=device)) + if repetition_penalty is not None and repetition_penalty != 1.0: + warpers.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) + if temperature is not None and temperature != 1.0: + temperature = float(temperature) + warpers.append(TemperatureLogitsWarper(temperature)) + sampling = True + if top_k is not None and top_k != 0: + warpers.append(TopKLogitsWarper(top_k=top_k)) + sampling = True + if top_p is not None and top_p < 1.0: + warpers.append(TopPLogitsWarper(top_p=top_p)) + sampling = True + if typical_p is not None and typical_p < 1.0: + warpers.append(TypicalLogitsWarper(mass=typical_p)) + sampling = True + + self.warpers = warpers + self.choice = Sampling(seed, device) if sampling else Greedy() + + def __call__(self, input_ids, scores): + # Warp logits + if scores.shape[0] > 1: + # only warp the last token logits + scores[-1:, :] = self.warpers(input_ids, scores[-1:, :]) + else: + scores = self.warpers(input_ids, scores) + + # Compute logprobs + logprobs = torch.log_softmax(scores, -1) + + # Choose tokens + next_id = self.choice(scores[-1]) + + return next_id.view(1, 1), logprobs + + @classmethod + def from_pb( + cls, + pb: generate_pb2.NextTokenChooserParameters, + device: torch.device, + ) -> "NextTokenChooser": + return NextTokenChooser( + watermark=pb.watermark, + temperature=pb.temperature, + 
repetition_penalty=pb.repetition_penalty, + top_k=pb.top_k, + top_p=pb.top_p, + typical_p=pb.typical_p, + do_sample=pb.do_sample, + seed=pb.seed, + device=device, + ) + + +class StopSequenceCriteria: + def __init__(self, stop_sequence: str): + self.regex = re.compile(f".*{stop_sequence}$") + + def __call__(self, output: str) -> bool: + if self.regex.findall(output): + return True + return False + + +class StoppingCriteria: + def __init__( + self, + eos_token_id: int, + stop_sequence_criterias: List[StopSequenceCriteria], + max_new_tokens=20, + ): + self.eos_token_id = eos_token_id + self.stop_sequence_criterias = stop_sequence_criterias + self.max_new_tokens = max_new_tokens + self.current_tokens = 0 + self.current_output = "" + + def __call__(self, last_token: int, last_output: str) -> Tuple[bool, Optional[str]]: + self.current_tokens += 1 + if self.current_tokens >= self.max_new_tokens: + return True, FinishReason.FINISH_REASON_LENGTH + + if last_token == self.eos_token_id: + return True, FinishReason.FINISH_REASON_EOS_TOKEN + + self.current_output += last_output + for stop_sequence_criteria in self.stop_sequence_criterias: + if stop_sequence_criteria(self.current_output): + return True, FinishReason.FINISH_REASON_STOP_SEQUENCE + + return False, None + + @classmethod + def from_pb( + cls, + pb: generate_pb2.StoppingCriteriaParameters, + tokenizer: PreTrainedTokenizerBase, + ) -> "StoppingCriteria": + stop_sequence_criterias = [ + StopSequenceCriteria(sequence) for sequence in pb.stop_sequences + ] + return StoppingCriteria( + tokenizer.eos_token_id, stop_sequence_criterias, pb.max_new_tokens + ) diff --git a/server/text_generation_server/utils/watermark.py b/server/text_generation_server/utils/watermark.py new file mode 100644 index 00000000..8e90a59c --- /dev/null +++ b/server/text_generation_server/utils/watermark.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright 2023 Authors of "A Watermark for Large Language Models" +# available at https://arxiv.org/abs/2301.10226 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
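# Usage sketch (inputs invented): StoppingCriteria is stateful -- it counts
# generated tokens and accumulates decoded text -- so `from_pb` builds a fresh
# instance per request rather than sharing one across the batch.
criteria = StoppingCriteria(
    eos_token_id=0,
    stop_sequence_criterias=[StopSequenceCriteria("\n\nUser:")],
    max_new_tokens=3,
)
assert criteria(last_token=42, last_output="Hello") == (False, None)
assert criteria(last_token=0, last_output="!") == (
    True,
    FinishReason.FINISH_REASON_EOS_TOKEN,
)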
+import os
+
+import torch
+from transformers import LogitsProcessor
+
+GAMMA = float(os.getenv("WATERMARK_GAMMA", 0.5))
+DELTA = float(os.getenv("WATERMARK_DELTA", 2.0))
+
+
+class WatermarkLogitsProcessor(LogitsProcessor):
+    def __init__(
+        self,
+        gamma: float = GAMMA,
+        delta: float = DELTA,
+        hash_key: int = 15485863,  # just a large prime number to create a rng seed with sufficient bit width
+        device: str = "cpu",
+    ):
+        # watermarking parameters
+        self.gamma = gamma
+        self.delta = delta
+        self.rng = torch.Generator(device=device)
+        self.hash_key = hash_key
+
+    def _seed_rng(self, input_ids: torch.LongTensor) -> None:
+        assert (
+            input_ids.shape[-1] >= 1
+        ), "requires at least a 1 token prefix sequence to seed rng"
+        prev_token = input_ids[-1].item()
+        self.rng.manual_seed(self.hash_key * prev_token)
+
+    def _get_greenlist_ids(
+        self, input_ids: torch.LongTensor, max_value: int
+    ) -> list[int]:
+        # seed the rng using the previous tokens/prefix
+        self._seed_rng(input_ids)
+
+        greenlist_size = int(max_value * self.gamma)
+        vocab_permutation = torch.randperm(
+            max_value, device=input_ids.device, generator=self.rng
+        )
+        greenlist_ids = vocab_permutation[:greenlist_size]
+        return greenlist_ids
+
+    @staticmethod
+    def _calc_greenlist_mask(
+        scores: torch.FloatTensor, greenlist_token_ids
+    ) -> torch.BoolTensor:
+        green_tokens_mask = torch.zeros_like(scores)
+        green_tokens_mask[-1, greenlist_token_ids] = 1
+        final_mask = green_tokens_mask.bool()
+        return final_mask
+
+    @staticmethod
+    def _bias_greenlist_logits(
+        scores: torch.Tensor, greenlist_mask: torch.Tensor, greenlist_bias: float
+    ) -> torch.Tensor:
+        scores[greenlist_mask] = scores[greenlist_mask] + greenlist_bias
+        return scores
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        assert len(input_ids) == 1
+        greenlist_ids = self._get_greenlist_ids(input_ids[0], scores.shape[-1])
+        green_tokens_mask = self._calc_greenlist_mask(
+            scores=scores, greenlist_token_ids=greenlist_ids
+        )
+
+        scores = self._bias_greenlist_logits(
+            scores=scores, greenlist_mask=green_tokens_mask, greenlist_bias=self.delta
+        )
+        return scores diff --git a/supported_models.json b/supported_models.json new file mode 100644 index 00000000..86d3bdfe --- /dev/null +++ b/supported_models.json @@ -0,0 +1,9 @@
+[
+    "bigscience/bloom",
+    "bigscience/bloomz",
+    "EleutherAI/gpt-neox-20b",
+    "google/flan-ul2",
+    "google/flan-t5-xxl",
+    "OpenAssistant/oasst-sft-1-pythia-12b",
+    "olivierdehaene/optimized-santacoder"
+]
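# End-to-end sketch of the watermark bias above, with toy sizes: the previous
# token seeds a RNG, the RNG permutes the vocabulary, the first gamma-fraction
# becomes the greenlist, and those logits are raised by delta so sampling (or a
# greedy argmax, as here) statistically favours detectable tokens.
import torch

vocab, gamma, delta, hash_key = 8, 0.5, 2.0, 15485863
prev_token = 5

rng = torch.Generator(device="cpu")
rng.manual_seed(hash_key * prev_token)
greenlist = torch.randperm(vocab, generator=rng)[: int(vocab * gamma)]

scores = torch.zeros(1, vocab)
scores[-1, greenlist] += delta
assert scores.argmax().item() in greenlist.tolist()  # flat logits -> greenlisted pick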