Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-24 08:22:07 +00:00
v2.0.0 (#1736)
This commit is contained in:
parent
6ad5aa7180
commit
c6a31b9e2b
Cargo.lock (generated): 12 changed lines
@@ -3406,7 +3406,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-benchmark"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "average",
  "clap",
@@ -3427,7 +3427,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "futures",
  "grpc-metadata",
@@ -3444,7 +3444,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "clap",
  "ctrlc",
@@ -3462,7 +3462,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router"
-version = "1.4.5"
+version = "2.0.0"
 dependencies = [
  "async-stream",
  "axum",
@@ -4657,9 +4657,9 @@ dependencies = [
 
 [[package]]
 name = "zeroize"
-version = "1.8.0"
+version = "1.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "63381fa6624bf92130a6b87c0d07380116f80b565c42cf0d754136f0238359ef"
+checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
 
 [[package]]
 name = "zip"
@@ -9,7 +9,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "1.4.5"
+version = "2.0.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.4.5"
+    "version": "2.0.0"
   },
   "paths": {
     "/": {
@@ -17,7 +17,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 100,
     "prompt_tokens": 60,
@@ -31,7 +31,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 29,
     "prompt_tokens": 316,
@@ -31,7 +31,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 29,
     "prompt_tokens": 316,
@@ -30,7 +30,7 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native",
+  "system_fingerprint": "2.0.0-native",
   "usage": {
     "completion_tokens": 21,
     "prompt_tokens": 187,
@@ -23,5 +23,5 @@
   "id": "",
   "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
   "object": "text_completion",
-  "system_fingerprint": "1.4.5-native"
+  "system_fingerprint": "2.0.0-native"
 }
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.4.5"
+version = "2.0.0"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry <nicolas@huggingface.co>"]
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.4.5"
+version = "2.0.0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
 
@@ -28,6 +28,10 @@ class ExceptionInterceptor(AsyncServerInterceptor):
             method_name = method_name.split("/")[-1]
             logger.exception(f"Method {method_name} encountered an error.")
 
+            # Runtime Error cannot be recovered from
+            if isinstance(err, RuntimeError):
+                exit(1)
+
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
 
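Note: with this hunk, a RuntimeError raised by any RPC now terminates the server process instead of only being logged; other exceptions still just free cached GPU memory. A minimal, self-contained sketch of that error policy (the call_with_error_policy wrapper is illustrative and not part of the repository; it assumes torch and loguru are installed, as in the server's environment):

import sys

import torch
from loguru import logger


def call_with_error_policy(method_name: str, handler, *args):
    """Illustrative wrapper mirroring the interceptor's new error handling."""
    try:
        return handler(*args)
    except Exception as err:
        method_name = method_name.split("/")[-1]
        logger.exception(f"Method {method_name} encountered an error.")

        # A RuntimeError (e.g. a CUDA failure) cannot be recovered from:
        # stop the process so it can be restarted in a clean state.
        if isinstance(err, RuntimeError):
            sys.exit(1)

        # Other errors: release cached GPU memory and propagate to the caller.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise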
@@ -55,9 +55,10 @@ class CacheManager:
     ):
         # Get free blocks indices by finding values in mask that are not set to 0
         free_block_indices = self.free_block_mask.nonzero()
-        assert (
-            len(free_block_indices) >= blocks
-        ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
+        if blocks > len(free_block_indices):
+            raise RuntimeError(
+                f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks"
+            )
 
         # Slice by the number of required blocks
         block_indices = free_block_indices[:blocks]
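Note: running out of KV-cache blocks now raises a RuntimeError (which the interceptor change above turns into a clean shutdown) rather than failing an assert, which would be stripped under python -O. A rough sketch of the free-block bookkeeping under these assumptions; TinyBlockAllocator and its sizes are illustrative stand-ins, not the real CacheManager:

import torch


class TinyBlockAllocator:
    """Illustrative KV-cache block allocator with the post-commit error handling."""

    def __init__(self, num_blocks: int):
        # 1 = free, 0 = taken, mirroring a free_block_mask-style tensor.
        self.free_block_mask = torch.ones(num_blocks, dtype=torch.int32)

    def allocate(self, blocks: int) -> torch.Tensor:
        # Indices of blocks whose mask value is non-zero (i.e. still free).
        free_block_indices = self.free_block_mask.nonzero()
        if blocks > len(free_block_indices):
            raise RuntimeError(
                f"Out of available cache blocks: asked {blocks}, "
                f"only {len(free_block_indices)} free blocks"
            )

        # Take the first `blocks` free blocks and mark them as used.
        block_indices = free_block_indices[:blocks]
        self.free_block_mask[block_indices] = 0
        return block_indices.flatten()


# Example: over-asking now raises instead of tripping an assert.
alloc = TinyBlockAllocator(num_blocks=4)
print(alloc.allocate(2))  # tensor([0, 1])
try:
    alloc.allocate(8)
except RuntimeError as e:
    print(e)  # Out of available cache blocks: asked 8, only 2 free blocks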
@@ -503,6 +503,10 @@ class MedusaHeadV1(nn.Module):
         self, input: torch.Tensor
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         logits = self.lm_head(input)
+        # If we have too many tokens, we skip speculative logits
+        if input.shape[0] > 128:
+            return logits, None
+
         speculative_logits = self.medusa(input)
         return logits, speculative_logits
 
@@ -549,6 +553,11 @@ class MedusaHeadV2(nn.Module):
         self.lm_head = TensorParallelHead.load(config, prefix, weights)
 
     def forward(self, x):
+        # If we have too many tokens, we skip speculative logits
+        if x.shape[0] > 128:
+            logits = self.lm_head(x)
+            return logits, None
+
         size = x.shape[-1]
         block_size = (size + self.world_size - 1) // self.world_size
         start = self.rank * block_size
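Note: both Medusa heads now return plain logits with no speculative logits whenever the batch carries more than 128 tokens, so speculation is skipped for large prefill batches and only runs for small decode batches. A minimal sketch of that guard, with toy linear layers standing in for the real lm_head and Medusa blocks (names and sizes are illustrative assumptions):

from typing import Optional, Tuple

import torch
from torch import nn


class ToyMedusaHead(nn.Module):
    """Illustrative head showing the >128-token guard added in this commit."""

    def __init__(self, hidden: int = 64, vocab: int = 100, n_heads: int = 3):
        super().__init__()
        self.lm_head = nn.Linear(hidden, vocab)
        # One extra projection per speculative head (stand-in for the Medusa MLPs).
        self.medusa = nn.ModuleList([nn.Linear(hidden, vocab) for _ in range(n_heads)])

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        logits = self.lm_head(x)
        # Too many tokens (large prefill): skip the speculative heads entirely.
        if x.shape[0] > 128:
            return logits, None
        speculative_logits = torch.stack([head(x) for head in self.medusa], dim=1)
        return logits, speculative_logits


head = ToyMedusaHead()
_, spec = head(torch.randn(4, 64))    # small decode batch -> speculation runs
print(spec.shape)                     # torch.Size([4, 3, 100])
_, spec = head(torch.randn(256, 64))  # large prefill -> speculation skipped
print(spec)                           # None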