diff --git a/Cargo.lock b/Cargo.lock
index ca041d98..e5f47197 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3406,7 +3406,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-benchmark"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "average",
  "clap",
@@ -3427,7 +3427,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "futures",
  "grpc-metadata",
@@ -3444,7 +3444,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "clap",
  "ctrlc",
@@ -3462,7 +3462,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "async-stream",
  "axum",
@@ -4657,9 +4657,9 @@ dependencies = [
 
 [[package]]
 name = "zeroize"
-version = "1.8.0"
+version = "1.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63381fa6624bf92130a6b87c0d07380116f80b565c42cf0d754136f0238359ef"
+checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
 
 [[package]]
 name = "zip"
diff --git a/Cargo.toml b/Cargo.toml
index a9e7f1c8..ecb4878f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "1.4.5"
+version = "2.0.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
diff --git a/docs/openapi.json b/docs/openapi.json
index fdf1c804..34b030f2 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.4.5"
+    "version": "2.0.0"
   },
   "paths": {
     "/": {
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json
index 543be115..153a508d 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json
@@ -17,7 +17,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 100,
     "prompt_tokens": 60,
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
index 728e90a4..56920b3e 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json
@@ -31,7 +31,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 29,
     "prompt_tokens": 316,
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
index 2e0efb86..fe679362 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json
@@ -31,7 +31,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 29,
     "prompt_tokens": 316,
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
index 91854223..e48a1e7d 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json
@@ -30,7 +30,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 21,
     "prompt_tokens": 187,
diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
index e0c7aed6..cfebc05f 100644
--- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
+++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json
@@ -23,5 +23,5 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native"
+  "system_fingerprint": "2.0.0-native"
 }
diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml
index ad217072..6923ff23 100644
--- a/integration-tests/pyproject.toml
+++ b/integration-tests/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.4.5"
+version = "2.0.0"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry "]
 
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 6f892c14..57f3590e 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.4.5"
+version = "2.0.0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene "]
 
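The version bump above is mechanical, but it is observable at runtime: TGI reports its version through the /info endpoint and stamps OpenAI-compatible responses with a system_fingerprint, which is why the integration-test snapshots change. A minimal sketch of checking a deployed server, assuming an instance listening on localhost:8080 and the requests package (not part of this change set):

import requests

# /info exposes the server build, including the crate version bumped above.
info = requests.get("http://localhost:8080/info").json()
assert info["version"] == "2.0.0"

# The OpenAI-compatible route stamps responses with "<version>-<build>",
# e.g. "2.0.0-native", as pinned by the snapshots above.
resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "tgi",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 8,
    },
).json()
assert resp["system_fingerprint"].startswith("2.0.0")
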
"TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.5-native", + "system_fingerprint": "2.0.0-native", "usage": { "completion_tokens": 29, "prompt_tokens": 316, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json index 91854223..e48a1e7d 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json @@ -30,7 +30,7 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.5-native", + "system_fingerprint": "2.0.0-native", "usage": { "completion_tokens": 21, "prompt_tokens": 187, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json index e0c7aed6..cfebc05f 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json @@ -23,5 +23,5 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.5-native" + "system_fingerprint": "2.0.0-native" } diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index ad217072..6923ff23 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-integration-tests" -version = "1.4.5" +version = "2.0.0" description = "Text Generation Inference integration tests" authors = ["Nicolas Patry "] diff --git a/server/pyproject.toml b/server/pyproject.toml index 6f892c14..57f3590e 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-server" -version = "1.4.5" +version = "2.0.0" description = "Text Generation Inference Python gRPC Server" authors = ["Olivier Dehaene "] diff --git a/server/text_generation_server/interceptor.py b/server/text_generation_server/interceptor.py index cde71de3..05339282 100644 --- a/server/text_generation_server/interceptor.py +++ b/server/text_generation_server/interceptor.py @@ -28,6 +28,10 @@ class ExceptionInterceptor(AsyncServerInterceptor): method_name = method_name.split("/")[-1] logger.exception(f"Method {method_name} encountered an error.") + # Runtime Error cannot be recovered from + if isinstance(err, RuntimeError): + exit(1) + if torch.cuda.is_available(): torch.cuda.empty_cache() diff --git a/server/text_generation_server/models/cache_manager.py b/server/text_generation_server/models/cache_manager.py index 4be8b1b9..85e1b19b 100644 --- a/server/text_generation_server/models/cache_manager.py +++ b/server/text_generation_server/models/cache_manager.py @@ -55,9 +55,10 @@ class CacheManager: ): # Get free blocks indices by finding values in mask that are not set to 0 free_block_indices = self.free_block_mask.nonzero() - assert ( - len(free_block_indices) >= blocks - ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks" + if blocks > len(free_block_indices): + raise RuntimeError( + f"Out of available cache blocks: asked {blocks}, only 
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index 2c12984b..9cf5c80f 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -503,6 +503,10 @@ class MedusaHeadV1(nn.Module):
         self, input: torch.Tensor
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         logits = self.lm_head(input)
+        # If we have too many tokens, we skip speculative logits
+        if input.shape[0] > 128:
+            return logits, None
+
         speculative_logits = self.medusa(input)
         return logits, speculative_logits
 
@@ -549,6 +553,11 @@ class MedusaHeadV2(nn.Module):
         self.lm_head = TensorParallelHead.load(config, prefix, weights)
 
     def forward(self, x):
+        # If we have too many tokens, we skip speculative logits
+        if x.shape[0] > 128:
+            logits = self.lm_head(x)
+            return logits, None
+
         size = x.shape[-1]
         block_size = (size + self.world_size - 1) // self.world_size
         start = self.rank * block_size
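Both Medusa head variants get the same guard: past 128 input rows the speculative heads are skipped and only the regular logits are returned. Speculative decoding mainly pays off when decoding is memory-bound at small batch sizes; with a large flattened batch the extra head computation buys little and costs memory. A self-contained sketch of the gating pattern, with a hypothetical class name and an assumed input shape of [num_tokens, hidden_size]:

from typing import Optional, Tuple

import torch
import torch.nn as nn

class GatedSpeculativeHead(nn.Module):
    """Wraps an lm_head and a speculative head behind a token-count gate."""

    def __init__(self, lm_head: nn.Module, medusa: nn.Module, max_tokens: int = 128):
        super().__init__()
        self.lm_head = lm_head
        self.medusa = medusa
        self.max_tokens = max_tokens

    def forward(self, hidden: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(hidden)
        # Large batches keep the GPU busy already; skip speculation and
        # return None in place of the speculative logits.
        if hidden.shape[0] > self.max_tokens:
            return logits, None
        return logits, self.medusa(hidden)
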