diff --git a/Cargo.lock b/Cargo.lock index 7565d7da..39fb381a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -350,9 +350,9 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" [[package]] name = "cc" -version = "1.0.94" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" +checksum = "d32a725bc159af97c3e629873bb9f88fb8cf8a4867175f76dc987815ea07c83b" [[package]] name = "cfg-if" @@ -2340,9 +2340,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.32" +version = "0.38.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" +checksum = "e3cc72858054fcff6d7dea32df2aeaee6a7c24227366d7ea429aada2f26b16ad" dependencies = [ "bitflags 2.5.0", "errno", @@ -2365,9 +2365,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.3" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99008d7ad0bbbea527ec27bddbc0e432c5b87d8175178cee68d2eec9c4a1813c" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", "ring 0.17.8", @@ -2586,9 +2586,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" dependencies = [ "libc", ] @@ -2798,7 +2798,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "1.3.3" +version = "1.3.4" dependencies = [ "average", "clap", @@ -2819,7 +2819,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "1.3.3" +version = "1.3.4" dependencies = [ "futures", "grpc-metadata", @@ -2836,7 +2836,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "1.3.3" +version = "1.3.4" dependencies = [ "clap", "ctrlc", @@ -2852,7 +2852,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "1.3.3" +version = "1.3.4" dependencies = [ "async-stream", "axum", @@ -2887,18 +2887,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "f0126ad08bff79f29fc3ae6a55cc72352056dfff61e3ff8bb7129476d44b23aa" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66" dependencies = [ "proc-macro2", "quote", @@ -3409,7 +3409,7 @@ dependencies = [ "log", "native-tls", "once_cell", - "rustls 0.22.3", + "rustls 0.22.4", "rustls-pki-types", "rustls-webpki", "serde", diff --git a/Cargo.toml b/Cargo.toml index a09a8ca7..80e6e145 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ members = [ resolver = "2" [workspace.package] -version = "1.3.3" +version = "1.3.4" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/docs/openapi.json b/docs/openapi.json index 62751928..df2d427f 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "1.3.3" + "version": "1.3.4" }, "paths": { "/": { diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index 64723ae6..f6929587 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-integration-tests" -version = "1.3.3" +version = "1.3.4" description = "Text Generation Inference integration tests" authors = ["Nicolas Patry "] diff --git a/server/pyproject.toml b/server/pyproject.toml index a027ba2b..d6806848 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-server" -version = "1.3.3" +version = "1.3.4" description = "Text Generation Inference Python gRPC Server" authors = ["Olivier Dehaene "] diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py index 6648b55a..d4fa2559 100644 --- a/server/text_generation_server/utils/layers.py +++ b/server/text_generation_server/utils/layers.py @@ -39,7 +39,7 @@ if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1: V2 = False log_once( logger.warning, - "Disabling exllama v2 and using v1 instead because there are issues when sharding" + "Disabling exllama v2 and using v1 instead because there are issues when sharding", ) if os.getenv("DISABLE_EXLLAMA") == "True": diff --git a/server/text_generation_server/utils/log.py b/server/text_generation_server/utils/log.py index d831fa76..b1456f1e 100644 --- a/server/text_generation_server/utils/log.py +++ b/server/text_generation_server/utils/log.py @@ -2,5 +2,5 @@ from functools import lru_cache @lru_cache(10) -def log_once(log, msg:str): +def log_once(log, msg: str): log(msg) diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index ee1899ab..c4e82a6d 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -215,7 +215,9 @@ class Weights: bits, groupsize, desc_act = self._get_gptq_params() from text_generation_server.utils.layers import HAS_EXLLAMA - use_exllama = bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act + use_exllama = ( + bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act + ) weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) else: w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes] @@ -281,14 +283,11 @@ class Weights: if CAN_EXLLAMA: log_once( logger.warning, - "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True" + "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True", ) use_exllama = False else: - log_once( - logger.info, - f"Using exllama kernels v{HAS_EXLLAMA}" - ) + log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}") g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)