This commit is contained in:
OlivierDehaene 2023-12-22 15:46:04 +01:00 committed by Karol Damaszke
parent 8cc4306f72
commit 62646c2a54
8 changed files with 28 additions and 29 deletions

34
Cargo.lock generated
View File

@ -350,9 +350,9 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.94" version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17f6e324229dc011159fcc089755d1e2e216a90d43a7dea6853ca740b84f35e7" checksum = "d32a725bc159af97c3e629873bb9f88fb8cf8a4867175f76dc987815ea07c83b"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
@ -2340,9 +2340,9 @@ dependencies = [
[[package]] [[package]]
name = "rustix" name = "rustix"
version = "0.38.32" version = "0.38.33"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" checksum = "e3cc72858054fcff6d7dea32df2aeaee6a7c24227366d7ea429aada2f26b16ad"
dependencies = [ dependencies = [
"bitflags 2.5.0", "bitflags 2.5.0",
"errno", "errno",
@ -2365,9 +2365,9 @@ dependencies = [
[[package]] [[package]]
name = "rustls" name = "rustls"
version = "0.22.3" version = "0.22.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99008d7ad0bbbea527ec27bddbc0e432c5b87d8175178cee68d2eec9c4a1813c" checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
dependencies = [ dependencies = [
"log", "log",
"ring 0.17.8", "ring 0.17.8",
@ -2586,9 +2586,9 @@ dependencies = [
[[package]] [[package]]
name = "signal-hook-registry" name = "signal-hook-registry"
version = "1.4.1" version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1"
dependencies = [ dependencies = [
"libc", "libc",
] ]
@ -2798,7 +2798,7 @@ dependencies = [
[[package]] [[package]]
name = "text-generation-benchmark" name = "text-generation-benchmark"
version = "1.3.3" version = "1.3.4"
dependencies = [ dependencies = [
"average", "average",
"clap", "clap",
@ -2819,7 +2819,7 @@ dependencies = [
[[package]] [[package]]
name = "text-generation-client" name = "text-generation-client"
version = "1.3.3" version = "1.3.4"
dependencies = [ dependencies = [
"futures", "futures",
"grpc-metadata", "grpc-metadata",
@ -2836,7 +2836,7 @@ dependencies = [
[[package]] [[package]]
name = "text-generation-launcher" name = "text-generation-launcher"
version = "1.3.3" version = "1.3.4"
dependencies = [ dependencies = [
"clap", "clap",
"ctrlc", "ctrlc",
@ -2852,7 +2852,7 @@ dependencies = [
[[package]] [[package]]
name = "text-generation-router" name = "text-generation-router"
version = "1.3.3" version = "1.3.4"
dependencies = [ dependencies = [
"async-stream", "async-stream",
"axum", "axum",
@ -2887,18 +2887,18 @@ dependencies = [
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.58" version = "1.0.59"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" checksum = "f0126ad08bff79f29fc3ae6a55cc72352056dfff61e3ff8bb7129476d44b23aa"
dependencies = [ dependencies = [
"thiserror-impl", "thiserror-impl",
] ]
[[package]] [[package]]
name = "thiserror-impl" name = "thiserror-impl"
version = "1.0.58" version = "1.0.59"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" checksum = "d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -3409,7 +3409,7 @@ dependencies = [
"log", "log",
"native-tls", "native-tls",
"once_cell", "once_cell",
"rustls 0.22.3", "rustls 0.22.4",
"rustls-pki-types", "rustls-pki-types",
"rustls-webpki", "rustls-webpki",
"serde", "serde",

View File

@ -9,7 +9,7 @@ members = [
resolver = "2" resolver = "2"
[workspace.package] [workspace.package]
version = "1.3.3" version = "1.3.4"
edition = "2021" edition = "2021"
authors = ["Olivier Dehaene"] authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference" homepage = "https://github.com/huggingface/text-generation-inference"

View File

@ -10,7 +10,7 @@
"name": "Apache 2.0", "name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0" "url": "https://www.apache.org/licenses/LICENSE-2.0"
}, },
"version": "1.3.3" "version": "1.3.4"
}, },
"paths": { "paths": {
"/": { "/": {

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "text-generation-integration-tests" name = "text-generation-integration-tests"
version = "1.3.3" version = "1.3.4"
description = "Text Generation Inference integration tests" description = "Text Generation Inference integration tests"
authors = ["Nicolas Patry <nicolas@huggingface.co>"] authors = ["Nicolas Patry <nicolas@huggingface.co>"]

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "text-generation-server" name = "text-generation-server"
version = "1.3.3" version = "1.3.4"
description = "Text Generation Inference Python gRPC Server" description = "Text Generation Inference Python gRPC Server"
authors = ["Olivier Dehaene <olivier@huggingface.co>"] authors = ["Olivier Dehaene <olivier@huggingface.co>"]

View File

@ -39,7 +39,7 @@ if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
V2 = False V2 = False
log_once( log_once(
logger.warning, logger.warning,
"Disabling exllama v2 and using v1 instead because there are issues when sharding" "Disabling exllama v2 and using v1 instead because there are issues when sharding",
) )
if os.getenv("DISABLE_EXLLAMA") == "True": if os.getenv("DISABLE_EXLLAMA") == "True":

View File

@ -2,5 +2,5 @@ from functools import lru_cache
@lru_cache(10) @lru_cache(10)
def log_once(log, msg:str): def log_once(log, msg: str):
log(msg) log(msg)

View File

@ -215,7 +215,9 @@ class Weights:
bits, groupsize, desc_act = self._get_gptq_params() bits, groupsize, desc_act = self._get_gptq_params()
from text_generation_server.utils.layers import HAS_EXLLAMA from text_generation_server.utils.layers import HAS_EXLLAMA
use_exllama = bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act use_exllama = (
bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act
)
weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
else: else:
w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes] w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]
@ -281,14 +283,11 @@ class Weights:
if CAN_EXLLAMA: if CAN_EXLLAMA:
log_once( log_once(
logger.warning, logger.warning,
"Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True" "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True",
) )
use_exllama = False use_exllama = False
else: else:
log_once( log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}")
logger.info,
f"Using exllama kernels v{HAS_EXLLAMA}"
)
g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0) g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)