Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-24 08:22:07 +00:00
v2.0.0 (#1736)
This commit is contained in:
parent
6ad5aa7180
commit
c6a31b9e2b
Cargo.lock (generated): 12 changed lines
@@ -3406,7 +3406,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-benchmark"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "average",
  "clap",
@@ -3427,7 +3427,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "futures",
  "grpc-metadata",
@@ -3444,7 +3444,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "clap",
  "ctrlc",
@@ -3462,7 +3462,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "async-stream",
  "axum",
@@ -4657,9 +4657,9 @@ dependencies = [
 
 [[package]]
 name = "zeroize"
-version = "1.8.0"
+version = "1.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63381fa6624bf92130a6b87c0d07380116f80b565c42cf0d754136f0238359ef"
+checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
 
 [[package]]
 name = "zip"
@@ -9,7 +9,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "1.4.5"
+version = "2.0.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.4.5"
+    "version": "2.0.0"
   },
   "paths": {
     "/": {
@@ -17,7 +17,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 100,
     "prompt_tokens": 60,
@@ -31,7 +31,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 29,
     "prompt_tokens": 316,
@@ -31,7 +31,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 29,
     "prompt_tokens": 316,
@@ -30,7 +30,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 21,
     "prompt_tokens": 187,
@@ -23,5 +23,5 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native"
+  "system_fingerprint": "2.0.0-native"
 }
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.4.5"
+version = "2.0.0"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry <nicolas@huggingface.co>"]
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.4.5"
+version = "2.0.0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
 
@@ -28,6 +28,10 @@ class ExceptionInterceptor(AsyncServerInterceptor):
             method_name = method_name.split("/")[-1]
             logger.exception(f"Method {method_name} encountered an error.")
 
+            # Runtime Error cannot be recovered from
+            if isinstance(err, RuntimeError):
+                exit(1)
+
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
 
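Note: with this hunk, a RuntimeError raised by any RPC now terminates the server process instead of only being logged; other exceptions still just free cached GPU memory. A minimal, self-contained sketch of that error policy (the call_with_error_policy wrapper is illustrative and not part of the repository; it assumes torch and loguru are installed, as in the server's environment):

import sys

import torch
from loguru import logger


def call_with_error_policy(method_name: str, handler, *args):
    """Illustrative wrapper mirroring the interceptor's new error handling."""
    try:
        return handler(*args)
    except Exception as err:
        method_name = method_name.split("/")[-1]
        logger.exception(f"Method {method_name} encountered an error.")

        # A RuntimeError (e.g. a CUDA failure) cannot be recovered from:
        # stop the process so it can be restarted in a clean state.
        if isinstance(err, RuntimeError):
            sys.exit(1)

        # Other errors: release cached GPU memory and propagate to the caller.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise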
@@ -55,9 +55,10 @@ class CacheManager:
     ):
         # Get free blocks indices by finding values in mask that are not set to 0
         free_block_indices = self.free_block_mask.nonzero()
-        assert (
-            len(free_block_indices) >= blocks
-        ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
+        if blocks > len(free_block_indices):
+            raise RuntimeError(
+                f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
+            )
 
         # Slice by the number of required blocks
         block_indices = free_block_indices[:blocks]
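Note: running out of KV-cache blocks now raises a RuntimeError (which the interceptor change above turns into a clean shutdown) rather than failing an assert, which would be stripped under python -O. A rough sketch of the free-block bookkeeping under these assumptions; TinyBlockAllocator and its sizes are illustrative stand-ins, not the real CacheManager:

import torch


class TinyBlockAllocator:
    """Illustrative KV-cache block allocator with the post-commit error handling."""

    def __init__(self, num_blocks: int):
        # 1 = free, 0 = taken, mirroring a free_block_mask-style tensor.
        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32)

    def allocate(self, blocks: int) -> torch.Tensor:
        # Indices of blocks whose mask value is non-zero (i.e. still free).
        free_block_indices = self.free_block_mask.nonzero()
        if blocks > len(free_block_indices):
            raise RuntimeError(
                f"Out of available cache blocks: asked {blocks}, "
                f"only {len(free_block_indices)} free blocks"
            )

        # Take the first `blocks` free blocks and mark them as used.
        block_indices = free_block_indices[:blocks]
        self.free_block_mask[block_indices] = 0
        return block_indices.flatten()


# Example: over-asking now raises instead of tripping an assert.
alloc = TinyBlockAllocator(num_blocks=4)
print(alloc.allocate(2))  # tensor([0, 1])
try:
    alloc.allocate(8)
except RuntimeError as e:
    print(e)  # Out of available cache blocks: asked 8, only 2 free blocks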
@@ -503,6 +503,10 @@ class MedusaHeadV1(nn.Module):
         self, input: torch.Tensor
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         logits = self.lm_head(input)
+        # If we have too many tokens, we skip speculative logits
+        if input.shape[0] > 128:
+            return logits, None
+
         speculative_logits = self.medusa(input)
         return logits, speculative_logits
 
@@ -549,6 +553,11 @@ class MedusaHeadV2(nn.Module):
         self.lm_head = TensorParallelHead.load(config, prefix, weights)
 
     def forward(self, x):
+        # If we have too many tokens, we skip speculative logits
+        if x.shape[0] > 128:
+            logits = self.lm_head(x)
+            return logits, None
+
         size = x.shape[-1]
         block_size = (size + self.world_size - 1) // self.world_size
         start = self.rank * block_size
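Note: both Medusa heads now return plain logits with no speculative logits whenever the batch carries more than 128 tokens, so speculation is skipped for large prefill batches and only runs for small decode batches. A minimal sketch of that guard, with toy linear layers standing in for the real lm_head and Medusa blocks (names and sizes are illustrative assumptions):

from typing import Optional, Tuple

import torch
from torch import nn


class ToyMedusaHead(nn.Module):
    """Illustrative head showing the >128-token guard added in this commit."""

    def __init__(self, hidden: int = 64, vocab: int = 100, n_heads: int = 3):
        super().__init__()
        self.lm_head = nn.Linear(hidden, vocab)
        # One extra projection per speculative head (stand-in for the Medusa MLPs).
        self.medusa = nn.ModuleList([nn.Linear(hidden, vocab) for _ in range(n_heads)])

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(x)
        # Too many tokens (large prefill): skip the speculative heads entirely.
        if x.shape[0] > 128:
            return logits, None
        speculative_logits = torch.stack([head(x) for head in self.medusa], dim=1)
        return logits, speculative_logits


head = ToyMedusaHead()
_, spec = head(torch.randn(4, 64))    # small decode batch -> speculation runs
print(spec.shape)                     # torch.Size([4, 3, 100])
_, spec = head(torch.randn(256, 64))  # large prefill -> speculation skipped
print(spec)                           # None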