Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-23 16:02:10 +00:00)

Commit 3d4c50f028: Merge branch 'main' into moe

@@ -101,6 +101,47 @@
         }
       }
     },
+    "/chat_tokenize": {
+      "post": {
+        "tags": [
+          "Text Generation Inference"
+        ],
+        "summary": "Template and tokenize ChatRequest",
+        "operationId": "get_chat_tokenize",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/ChatRequest"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "Templated and tokenized ChatRequest",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ChatTokenizeResponse"
+                }
+              }
+            }
+          },
+          "404": {
+            "description": "Failed to tokenize ChatRequest",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
     "/generate": {
       "post": {
         "tags": [
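
For orientation, a minimal client-side sketch of how the new route could be exercised (not part of the diff). The base URL, model name, and message content are placeholder assumptions; only the /chat_tokenize path and the ChatRequest / ChatTokenizeResponse / ErrorResponse schemas come from the spec above.

# Hypothetical client sketch for the new /chat_tokenize route; assumes a TGI server
# listening on http://localhost:8080 and the `requests` package being installed.
import requests

def chat_tokenize(base_url: str, messages: list) -> dict:
    # The body follows the ChatRequest schema referenced above; "model" and the
    # message content are illustrative values, not required by the diff itself.
    payload = {"model": "tgi", "messages": messages}
    resp = requests.post(f"{base_url}/chat_tokenize", json=payload, timeout=30)
    resp.raise_for_status()  # a 404 carries an ErrorResponse body per the spec
    return resp.json()       # ChatTokenizeResponse: templated_text + tokenize_response

if __name__ == "__main__":
    result = chat_tokenize("http://localhost:8080", [{"role": "user", "content": "Hello!"}])
    print(result["templated_text"])
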
@@ -1092,6 +1133,21 @@
           }
         }
       },
+      "ChatTokenizeResponse": {
+        "type": "object",
+        "required": [
+          "tokenize_response",
+          "templated_text"
+        ],
+        "properties": {
+          "templated_text": {
+            "type": "string"
+          },
+          "tokenize_response": {
+            "$ref": "#/components/schemas/TokenizeResponse"
+          }
+        }
+      },
       "Chunk": {
         "type": "object",
         "required": [
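
As a companion sketch (again not part of the diff), the response defined above could be unpacked as follows. It assumes tokenize_response deserializes to a list of token objects exposing at least an "id" field, as the TokenizeResponse schema is used elsewhere in this API; the helper and its field handling are illustrative.

# Hypothetical helper mirroring the ChatTokenizeResponse schema above.
from dataclasses import dataclass
from typing import List

@dataclass
class ChatTokenizeResult:
    templated_text: str   # chat template applied to the messages
    token_ids: List[int]  # ids pulled out of tokenize_response

def parse_chat_tokenize(body: dict) -> ChatTokenizeResult:
    tokens = body["tokenize_response"]
    return ChatTokenizeResult(
        templated_text=body["templated_text"],
        token_ids=[t["id"] for t in tokens],
    )
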
@@ -978,16 +978,15 @@
         "nixpkgs": "nixpkgs_6"
       },
       "locked": {
-        "lastModified": 1729761651,
-        "narHash": "sha256-GYykQ9Fxji2EuXCGcPn0dx8Qx8VQBJTkRdcCytp4A/k=",
+        "lastModified": 1730724647,
+        "narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=",
         "owner": "huggingface",
         "repo": "text-generation-inference-nix",
-        "rev": "f7e3c4fa67d70590ed9ee47feeab645bd9ba81b1",
+        "rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
-        "ref": "marlin-kernels-0.3.1",
         "repo": "text-generation-inference-nix",
         "type": "github"
       }
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.1";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
@@ -1687,13 +1687,6 @@ fn main() -> Result<(), LauncherError> {
     let max_position_embeddings = if let Some(config) = &config {
         if let Some(max_position_embeddings) = config.max_position_embeddings {
             if max_position_embeddings > max_default {
-                let max = max_position_embeddings;
-                if args.max_input_tokens.is_none()
-                    && args.max_total_tokens.is_none()
-                    && args.max_batch_prefill_tokens.is_none()
-                {
-                    tracing::info!("Model supports up to {max} but tgi will now set its default to {max_default} instead. This is to save VRAM by refusing large prompts in order to allow more users on the same hardware. You can increase that size using `--max-batch-prefill-tokens={} --max-total-tokens={max} --max-input-tokens={}`.", max + 50, max - 1);
-                }
                 max_default
             } else {
                 max_position_embeddings
@@ -181,12 +181,16 @@ async fn openai_get_model_info(info: Extension<Info>) -> Json<ModelsInfo> {
     })
 }

 /// Template and tokenize ChatRequest
 #[utoipa::path(
     post,
     tag = "Text Generation Inference",
     path = "/chat_tokenize",
     request_body = ChatRequest,
-    responses((status = 200, description = "Templated and tokenized ChatRequest", body = ChatTokenizeResponse))
+    responses(
+        (status = 200, description = "Templated and tokenized ChatRequest", body = ChatTokenizeResponse),
+        (status = 404, description = "Failed to tokenize ChatRequest", body = ErrorResponse),
+    )
 )]
 async fn get_chat_tokenize(
     Extension(infer): Extension<Infer>,
@@ -1501,6 +1505,7 @@ tokenize,
         metrics,
         openai_get_model_info,
         sagemaker_compatibility,
+        get_chat_tokenize,
     ),
     components(
         schemas(
@@ -1558,6 +1563,7 @@ Function,
             FunctionDefinition,
             ToolChoice,
             ModelInfo,
+            ChatTokenizeResponse,
         )
     ),
     tags(
@@ -44,5 +44,4 @@ class WQLinear(nn.Module):
     def forward(self, x):
         out_shape = x.shape[:-1] + (self.out_features,)
         out = self.woq_linear(x.reshape(-1, x.shape[-1]))
-        out = out + self.bias if self.bias is not None else out
         return out.reshape(out_shape)
@@ -122,5 +122,4 @@ class QuantLinear(nn.Module):
     def forward(self, x):
         out_shape = x.shape[:-1] + (self.outfeatures,)
         out = self.woq_linear(x.reshape(-1, x.shape[-1]))
-        out = out + self.bias if self.bias is not None else out
         return out.reshape(out_shape)
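
Both forward methods above follow the same flatten-matmul-restore pattern; below is a generic illustration of that pattern (not the quantized kernels themselves, and woq_linear is deliberately replaced by a plain linear).

# Collapse all leading dimensions, run a 2D linear, then restore the original
# shape with the output feature dimension swapped in.
import torch

def flatten_linear_restore(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    out_features = weight.shape[0]
    out_shape = x.shape[:-1] + (out_features,)
    out = torch.nn.functional.linear(x.reshape(-1, x.shape[-1]), weight)
    return out.reshape(out_shape)

x = torch.randn(2, 5, 16)  # (batch, seq, in_features)
w = torch.randn(32, 16)    # (out_features, in_features)
print(flatten_linear_restore(x, w).shape)  # torch.Size([2, 5, 32])
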
@@ -887,11 +887,12 @@ class FlashCausalLMBatch(Batch):
             fsm_grammar_states=fsm_grammar_states,
         )

-        speculative_ids = (
-            torch.cat([b.speculative_ids for b in batches], dim=0)
-            if batches[0].speculative_ids is not None
-            else None
-        )
+        # We skip computing the speculative_ids when the batch size is too large, so
+        # we must check that all batches have them, otherwise they must be discarded
+        if get_speculate() > 0 and all(b.speculative_ids is not None for b in batches):
+            speculative_ids = torch.cat([b.speculative_ids for b in batches], dim=0)
+        else:
+            speculative_ids = None

         if adapter_segment_builder is not None:
             adapter_segments, adapter_segment_indices = adapter_segment_builder.build()
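
A toy illustration of the merge guard introduced above (not repository code): speculative ids are only concatenated when every batch being merged carries them, otherwise they are discarded. The helper name and the standalone speculate argument are assumptions for the example.

import torch

def merge_speculative_ids(spec_ids: list, speculate: int):
    # spec_ids holds one tensor per batch, or None for batches where speculation was skipped
    if speculate > 0 and all(ids is not None for ids in spec_ids):
        return torch.cat(spec_ids, dim=0)
    return None

print(merge_speculative_ids([torch.zeros(2, 3), None], speculate=3))                    # None
print(merge_speculative_ids([torch.zeros(2, 3), torch.ones(1, 3)], speculate=3).shape)  # torch.Size([3, 3])
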
@@ -1532,8 +1533,6 @@ class FlashCausalLM(Model):
             self.kv_cache_dtype,
             self.device,
         )
-        max_bt = batch.max_blocks
-        max_s = max_bt * BLOCK_SIZE
         batch_num_blocks = batch.num_blocks

         if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
@@ -1651,7 +1650,7 @@ class FlashCausalLM(Model):
                 # Warmup cuda graphs
                 for bs in CUDA_GRAPHS:
                     if self.speculate is None or self.speculate + 1 <= bs:
-                        self.cuda_graph_warmup(bs, max_s, max_bt)
+                        self.cuda_graph_warmup(bs, max_total_tokens, max_total_tokens)
             except torch.cuda.OutOfMemoryError:
                 logger.exception("Decode cuda graph warmup failed")
         else:
@@ -1726,7 +1725,15 @@ class FlashCausalLM(Model):
             new_position_ids = (
                 position_ids.unsqueeze(-1).expand(B, new_length) + arange
             ).view(-1)
-            slots = (slots.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1)
+
+            # Slots can be discontiguous when prefix caching is enabled, so we need to expand the slot_indices,
+            # then update the slots with the additional indices to ensure we're grabbing the ones that have been
+            # allocated
+            slot_indices = (
+                batch.slot_indices.unsqueeze(-1).expand(B, new_length) + arange_int
+            ).view(-1)
+            slots = batch.slots[slot_indices]
+
             input_lengths = (
                 input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int
             ).view(-1)
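
A small, self-contained illustration of the slot gathering above (not repository code): when prefix caching leaves the allocated slots discontiguous, the per-sequence slot indices are expanded across the speculative positions and then used to gather from the slot table, rather than assuming consecutive slot numbers. The tensor values are made up for the example.

import torch

B, new_length = 2, 3
slots = torch.tensor([10, 11, 12, 40, 41, 42, 43])  # allocated slots, not contiguous across sequences
slot_indices = torch.tensor([0, 4])                  # current slot index for each sequence
arange_int = torch.arange(new_length)

expanded = (slot_indices.unsqueeze(-1).expand(B, new_length) + arange_int).view(-1)
print(slots[expanded])  # tensor([10, 11, 12, 41, 42, 43])
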
@@ -55,7 +55,7 @@ def block_tables_to_ragged(
     cache_lengths: List[int],
     input_lengths_tensor: torch.Tensor,
     cache_lengths_tensor: torch.Tensor,
-    max_current_length: int
+    max_current_length: int,
 ) -> torch.Tensor:
     """Convert block table to ragged format compatible with FlashInfer."""
     assert len(input_lengths) == len(cache_lengths)
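
A rough sketch of the "ragged format" the docstring refers to (an illustration, not the repository implementation): each sequence contributes a variable number of block-table entries, and the per-sequence rows of a padded 2D block table are packed back to back into one flat tensor. The helper and its total_lengths argument are assumptions for the example.

from typing import List
import torch

def to_ragged_sketch(block_tables: torch.Tensor, total_lengths: List[int]) -> torch.Tensor:
    # block_tables: padded (num_seqs, max_blocks) tensor; total_lengths: entries used per sequence
    parts = [block_tables[i, :length] for i, length in enumerate(total_lengths)]
    return torch.cat(parts)

bt = torch.tensor([[1, 2, 3, 0], [7, 8, 0, 0]])
print(to_ragged_sketch(bt, [3, 2]))  # tensor([1, 2, 3, 7, 8])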