diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index a0f1d6f1..4d0b19a3 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -17,5 +17,4 @@ jobs:
package: text-generation-inference
additional_args: --not_python_module
secrets:
- token: ${{ secrets.HUGGINGFACE_PUSH }}
- hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
\ No newline at end of file
+ hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
diff --git a/Cargo.lock b/Cargo.lock
index 8fa7b726..b1f7279a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -743,18 +743,6 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28a80e3145d8ad11ba0995949bbcf48b9df2be62772b3d351ef017dff6ecb853"
-[[package]]
-name = "flume"
-version = "0.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181"
-dependencies = [
- "futures-core",
- "futures-sink",
- "nanorand",
- "spin 0.9.8",
-]
-
[[package]]
name = "fnv"
version = "1.0.7"
@@ -900,10 +888,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427"
dependencies = [
"cfg-if",
- "js-sys",
"libc",
"wasi",
- "wasm-bindgen",
]
[[package]]
@@ -1508,15 +1494,6 @@ dependencies = [
"tracing",
]
-[[package]]
-name = "nanorand"
-version = "0.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3"
-dependencies = [
- "getrandom",
-]
-
[[package]]
name = "native-tls"
version = "0.2.11"
@@ -2313,7 +2290,7 @@ dependencies = [
"cc",
"libc",
"once_cell",
- "spin 0.5.2",
+ "spin",
"untrusted",
"web-sys",
"winapi",
@@ -2678,15 +2655,6 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
-[[package]]
-name = "spin"
-version = "0.9.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
-dependencies = [
- "lock_api",
-]
-
[[package]]
name = "spm_precompiled"
version = "0.1.4"
@@ -2808,7 +2776,7 @@ dependencies = [
[[package]]
name = "text-generation-benchmark"
-version = "1.1.0"
+version = "1.1.1"
dependencies = [
"average",
"clap",
@@ -2829,7 +2797,7 @@ dependencies = [
[[package]]
name = "text-generation-client"
-version = "1.1.0"
+version = "1.1.1"
dependencies = [
"futures",
"grpc-metadata",
@@ -2845,7 +2813,7 @@ dependencies = [
[[package]]
name = "text-generation-launcher"
-version = "1.1.0"
+version = "1.1.1"
dependencies = [
"clap",
"ctrlc",
@@ -2861,13 +2829,12 @@ dependencies = [
[[package]]
name = "text-generation-router"
-version = "1.1.0"
+version = "1.1.1"
dependencies = [
"async-stream",
"axum",
"axum-tracing-opentelemetry",
"clap",
- "flume",
"futures",
"hf-hub 0.3.1",
"init-tracing-opentelemetry",
@@ -2885,6 +2852,7 @@ dependencies = [
"thiserror",
"tokenizers",
"tokio",
+ "tokio-stream",
"tower-http",
"tracing",
"tracing-opentelemetry",
diff --git a/docs/source/basic_tutorials/preparing_model.md b/docs/source/basic_tutorials/preparing_model.md
index 97c9bbe0..56124a3b 100644
--- a/docs/source/basic_tutorials/preparing_model.md
+++ b/docs/source/basic_tutorials/preparing_model.md
@@ -4,7 +4,7 @@ Text Generation Inference improves the model in several aspects.
## Quantization
-TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, simply set `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq) when using AWQ quantization, you need to point to one of the models [here](https://huggingface.co/models?search=awq). To get more information about quantization, please refer to [quantization guide](./conceptual/quantization.md)
+TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsandbytes), [GPT-Q](https://arxiv.org/abs/2210.17323) and [AWQ](https://arxiv.org/abs/2306.00978) quantization. To speed up inference with quantization, set the `quantize` flag to `bitsandbytes`, `gptq` or `awq` depending on the quantization technique you wish to use. When using GPT-Q quantization, you need to point to one of the models [here](https://huggingface.co/models?search=gptq); when using AWQ quantization, you need to point to one of the models [here](https://huggingface.co/models?search=awq). For more information about quantization, please refer to the [quantization guide](./../conceptual/quantization.md).
## RoPE Scaling
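As a concrete illustration of the doc paragraph above, a launch command for a GPT-Q model would look roughly like the following. The image tag and the model `TheBloke/Llama-2-7B-Chat-GPTQ` are assumptions picked for illustration; any checkpoint from the linked GPTQ search results should work the same way.

```shell
# Serve a GPTQ-quantized model with TGI. Swap `gptq` for `awq` (pointing
# --model-id at an AWQ checkpoint) or for `bitsandbytes` (which quantizes
# ordinary fp16 weights on the fly) to use another technique.
docker run --gpus all --shm-size 1g -p 8080:80 -v $PWD/data:/data \
    ghcr.io/huggingface/text-generation-inference:1.1.1 \
    --model-id TheBloke/Llama-2-7B-Chat-GPTQ \
    --quantize gptq
```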
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 87b5a8d3..55af635a 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -20,7 +20,6 @@ axum = { version = "0.6.20", features = ["json"] }
axum-tracing-opentelemetry = "0.14.1"
text-generation-client = { path = "client" }
clap = { version = "4.4.5", features = ["derive", "env"] }
-flume = "0.11.0"
futures = "0.3.28"
metrics = "0.21.1"
metrics-exporter-prometheus = { version = "0.12.1", features = [] }
@@ -34,6 +33,7 @@ serde_json = "1.0.107"
thiserror = "1.0.48"
tokenizers = { version = "0.14.0", features = ["http"] }
tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tokio-stream = "0.1.14"
tower-http = { version = "0.4.4", features = ["cors"] }
tracing = "0.1.37"
tracing-opentelemetry = "0.21.0"
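Taken together, the `Cargo.lock` and `router/Cargo.toml` hunks drop the `flume` channel crate in favor of `tokio-stream`, which is also why `nanorand`, `spin 0.9.8`, and getrandom's `js-sys`/`wasm-bindgen` hooks disappear from the lockfile above. A minimal sketch of that kind of migration, assuming tokio with the `macros` feature enabled; the channel payload and all names below are illustrative, since the router's actual code is not part of this diff:

```rust
use tokio::sync::mpsc;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tokio_stream::StreamExt;

#[tokio::main]
async fn main() {
    // Before: flume::unbounded() produced the channel pair. After: tokio's
    // own unbounded mpsc channel, with tokio-stream wrapping the receiver.
    let (tx, rx) = mpsc::unbounded_channel::<u32>();

    // UnboundedReceiverStream turns the receiver into a `Stream`, so code
    // that consumed the flume receiver via stream combinators keeps its shape.
    let mut stream = UnboundedReceiverStream::new(rx);

    tokio::spawn(async move {
        for i in 0..3 {
            // Sending on an unbounded channel never awaits; it only errors
            // once the receiver has been dropped.
            tx.send(i).unwrap();
        }
        // `tx` is dropped here, which closes the channel and ends the stream.
    });

    while let Some(i) = stream.next().await {
        println!("received {i}");
    }
}
```

The practical effect is one fewer channel implementation (and no RNG-backed dependency subtree) while keeping the same `Stream`-based consumption pattern.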
diff --git a/router/client/src/client.rs b/router/client/src/client.rs
index d427d3a4..341e70fd 100644
--- a/router/client/src/client.rs
+++ b/router/client/src/client.rs
@@ -103,17 +103,18 @@ impl Client {
&mut self,
max_input_length: u32,
max_prefill_tokens: u32,
+ max_total_tokens: u32,
) -> Result
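The final hunk is truncated here, but it visibly threads a new `max_total_tokens` argument through a client method in `router/client/src/client.rs`. A hedged sketch of the shape of such a signature change; the method name `warmup`, the return type, and the literal values are assumptions for illustration, not confirmed by the visible lines:

```rust
struct Client;

impl Client {
    async fn warmup(
        &mut self,
        max_input_length: u32,
        max_prefill_tokens: u32,
        max_total_tokens: u32, // the parameter this diff threads through
    ) -> Result<Option<u32>, String> {
        // A real implementation would run a synthetic prefill/decode pass
        // sized by all three limits and report the supported batch budget.
        let budget = max_total_tokens.max(max_prefill_tokens).max(max_input_length);
        Ok(Some(budget))
    }
}

#[tokio::main]
async fn main() {
    let mut client = Client;
    // Call-sites must now pass the decode-inclusive total-token budget,
    // not just the prompt-side limits.
    let supported = client.warmup(1024, 4096, 16000).await.unwrap();
    println!("supported batch total tokens: {supported:?}");
}
```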