Compare commits


6 Commits
v3.3.5 ... main

Author SHA1 Message Date
drbh
1b90c508af
Revert "Revert "feat: bump flake including transformers and huggingfa… (#3326)
Revert "Revert "feat: bump flake including transformers and huggingface_hub v…"

This reverts commit 9dedeb89ac.
2025-09-09 10:44:25 -04:00
Eliott C.
d2ad7c484e
Update iframe sources for streaming demo (#3327)
2025-09-09 15:36:19 +02:00
Daniël de Kok
c6071749db
Fix mask passed to flashinfer (#3324)
Custom masks are padded to the shape `[batch_size, max_len, max_len]`.
However, flashinfer expects an unpadded mask of the shape
`[sum(q_len[i] * k_len[i] for i in range(batch_size))]`.

This change unpads the custom mask (currently only used by Gemma 3)
to this shape (assuming q_len == k_len, since we only use the custom
mask during prefill).
2025-09-08 13:47:03 -04:00
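
To make the shape arithmetic in the commit message above concrete, here is a minimal self-contained sketch (it mirrors the `unpad_2d_mask` helper this commit adds in its diff below; the example batch is illustrative):

```python
import torch

def unpad_2d_mask(attention_mask: torch.Tensor, seq_lengths: torch.Tensor) -> torch.Tensor:
    # Keep the top-left length x length block of each padded mask and
    # flatten, concatenating everything into one packed 1-D tensor.
    return torch.cat(
        [attention_mask[i, :int(l), :int(l)].flatten() for i, l in enumerate(seq_lengths)]
    )

# Two prefill sequences of lengths 2 and 3, padded to max_len = 3.
padded = torch.ones(2, 3, 3, dtype=torch.bool)  # [batch_size, max_len, max_len]
packed = unpad_2d_mask(padded, torch.tensor([2, 3]))
print(packed.shape)  # torch.Size([13]) == 2*2 + 3*3
```
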
drbh
4f067c22c3
fix: remove azure (#3325)
2025-09-08 13:41:45 -04:00
drbh
9dedeb89ac
Revert "feat: bump flake including transformers and huggingface_hub versions" (#3323)
Revert "feat: bump flake including transformers and huggingface_hub versions …"

This reverts commit 356de85c29.
2025-09-08 12:17:29 +02:00
Phil
5739b5b088
Add missing backslash (#3311)
2025-09-06 09:50:14 +02:00
16 changed files with 47 additions and 31 deletions


@@ -175,13 +175,6 @@ jobs:
           registry: docker.io
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - name: Login to Azure Container Registry
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
-          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
-          registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
       # If pull request
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name == 'pull_request' }}
@@ -203,7 +196,6 @@ jobs:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
             ghcr.io/huggingface/text-generation-inference
-            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
             type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
             type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}

Cargo.lock (generated, 16 lines changed)

@@ -4650,7 +4650,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-backends-trtllm"
-version = "3.3.5"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "clap 4.5.32",
@@ -4671,7 +4671,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-benchmark"
-version = "3.3.5"
+version = "3.3.5-dev0"
 dependencies = [
  "average",
  "clap 4.5.32",
@@ -4691,7 +4691,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "3.3.5"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -4709,7 +4709,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "3.3.5"
+version = "3.3.5-dev0"
 dependencies = [
  "clap 4.5.32",
  "ctrlc",
@@ -4730,7 +4730,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router"
-version = "3.3.5"
+version = "3.3.5-dev0"
 dependencies = [
  "anyhow",
  "async-stream",
@@ -4782,7 +4782,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-llamacpp"
-version = "3.3.5"
+version = "3.3.5-dev0"
 dependencies = [
  "async-trait",
  "bindgen 0.71.1",
@@ -4800,7 +4800,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-v2"
-version = "3.3.5"
+version = "3.3.5-dev0"
 dependencies = [
  "async-stream",
  "async-trait",
@@ -4849,7 +4849,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-v3"
-version = "3.3.5"
+version = "3.3.5-dev0"
 dependencies = [
  "async-stream",
  "async-trait",


@@ -21,7 +21,7 @@ default-members = [
 resolver = "2"
 
 [workspace.package]
-version = "3.3.5"
+version = "3.3.5-dev0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"


@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "3.3.5"
+    "version": "3.3.5-dev0"
   },
   "paths": {
     "/": {


@@ -83,7 +83,7 @@ docker run \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
     tgi-llamacpp \
-    --n-gpu-layers 99
+    --n-gpu-layers 99 \
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```


@@ -27,14 +27,14 @@ For example, a system can generate 100 tokens per second. If the system generate
 <div class="block dark:hidden">
     <iframe
-        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
+        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
         width="850"
         height="350"
     ></iframe>
 </div>
 
 <div class="hidden dark:block">
     <iframe
-        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
+        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
         width="850"
         height="350"
     ></iframe>


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 42,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 62,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 67,
     "prompt_tokens": 277,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 72,
     "prompt_tokens": 275,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 80,
     "prompt_tokens": 279,


@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 35,
     "prompt_tokens": 32,


@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 44,
     "prompt_tokens": 37,


@@ -18,7 +18,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,
@@ -44,7 +44,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,


@@ -17,7 +17,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-native",
+  "system_fingerprint": "3.3.5-dev0-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,


@@ -1,6 +1,7 @@
 from typing import Optional
 from contextvars import ContextVar
 from contextlib import contextmanager
+import math
 
 import flashinfer
 import torch
@@ -20,6 +21,20 @@ decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = Contex
 workspace: Optional[torch.Tensor] = None
 
 
+def unpad_2d_mask(
+    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
+) -> torch.Tensor:
+    # Like torch unpad_sequence, but for 2D masks.
+    unpadded_tensors = []
+    for i, length in enumerate(seq_lengths):
+        unpadded_matrix = attention_mask[i, :length, :length]
+        unpadded_tensors.append(unpadded_matrix.flatten())
+
+    packed_tensor = torch.cat(unpadded_tensors)
+    return packed_tensor
+
+
 def get_workspace(device):
     """Get shared flashinfer workspace."""
     global workspace
@@ -83,6 +98,15 @@ def use_prefill_with_paged_kv_state(
         last_page_len += 1
 
     token = prefill_with_paged_kv_state.set(state)
+
+    # Attention masks are padded, unpad.
+    if custom_mask is not None:
+        bs = input_lengths.shape[0]
+        seq_len = math.isqrt(custom_mask.numel() // bs)
+        custom_mask = unpad_2d_mask(
+            custom_mask.reshape(bs, seq_len, seq_len), input_lengths
+        )
+
     try:
         state.plan(
             qo_indptr=cu_seqlens,
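
As a side note on the hunk above, a hedged sketch of why `math.isqrt` recovers the padded length, assuming the incoming flattened mask really has `bs * max_len * max_len` elements as the commit message describes:

```python
import math
import torch

bs, max_len = 2, 3
# Flattened padded mask as passed in: bs * max_len * max_len elements.
custom_mask = torch.ones(bs * max_len * max_len, dtype=torch.bool)
seq_len = math.isqrt(custom_mask.numel() // bs)
assert seq_len == max_len  # numel // bs == max_len**2, so isqrt recovers max_len
```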