Compare commits

...

6 Commits
v3.3.5 ... main

Author SHA1 Message Date
drbh
1b90c508af
Revert "Revert "feat: bump flake including transformers and huggingfa… (#3326)
Revert "Revert "feat: bump flake including transformers and huggingface_hub v…"

This reverts commit 9dedeb89ac.
2025-09-09 10:44:25 -04:00
Eliott C.
d2ad7c484e
Update iframe sources for streaming demo (#3327) 2025-09-09 15:36:19 +02:00
Daniël de Kok
c6071749db
Fix mask passed to flashinfer (#3324)
Custom masks are padded to the shape `[batch_size, max_len, max_len]`.
However, flashinfer expects an unpadded mask of the shape
`[sum(q_len[i] * k_len[i] for i in range(batch_size))]`.

This change unpads the custom mask (currently only used by Gemma 3)
to this shape (assuming q_len == k_len, since we only use the custom
mask during prefill).
2025-09-08 13:47:03 -04:00
drbh
4f067c22c3
fix: remove azure (#3325) 2025-09-08 13:41:45 -04:00
drbh
9dedeb89ac
Revert "feat: bump flake including transformers and huggingface_hub versions" (#3323)
Revert "feat: bump flake including transformers and huggingface_hub versions …"

This reverts commit 356de85c29.
2025-09-08 12:17:29 +02:00
Phil
5739b5b088
Add missing backslash (#3311) 2025-09-06 09:50:14 +02:00
4 changed files with 27 additions and 11 deletions

View File

@ -175,13 +175,6 @@ jobs:
registry: docker.io
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Login to Azure Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
username: ${{ secrets.AZURE_DOCKER_USERNAME }}
password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
# If pull request
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name == 'pull_request' }}
@ -203,7 +196,6 @@ jobs:
images: |
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
ghcr.io/huggingface/text-generation-inference
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
tags: |
type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}

View File

@ -83,7 +83,7 @@ docker run \
-e "HF_TOKEN=$HF_TOKEN" \
-v "$HOME/models:/app/models" \
tgi-llamacpp \
--n-gpu-layers 99
--n-gpu-layers 99 \
--model-id "Qwen/Qwen2.5-3B-Instruct"
```

View File

@ -27,14 +27,14 @@ For example, a system can generate 100 tokens per second. If the system generate
<div class="block dark:hidden">
<iframe
src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
width="850"
height="350"
></iframe>
</div>
<div class="hidden dark:block">
<iframe
src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
width="850"
height="350"
></iframe>

View File

@ -1,6 +1,7 @@
from typing import Optional
from contextvars import ContextVar
from contextlib import contextmanager
import math
import flashinfer
import torch
@ -20,6 +21,20 @@ decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = Contex
workspace: Optional[torch.Tensor] = None
def unpad_2d_mask(
    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
) -> torch.Tensor:
    """Pack a batch of padded square 2D masks into one flat 1D tensor.

    Analogous to torch's unpad_sequence, but operating on per-batch 2D
    masks: for each batch element i, the top-left ``length x length``
    sub-matrix of ``attention_mask[i]`` is kept, flattened, and the
    per-element pieces are concatenated in batch order.

    Args:
        attention_mask: padded masks of shape ``[batch, max_len, max_len]``.
        seq_lengths: per-batch valid lengths used to crop each mask.

    Returns:
        A 1D tensor of length ``sum(l * l for l in seq_lengths)``.
    """
    return torch.cat(
        [
            attention_mask[idx, :n, :n].reshape(-1)
            for idx, n in enumerate(seq_lengths)
        ]
    )
def get_workspace(device):
"""Get shared flashinfer workspace."""
global workspace
@ -83,6 +98,15 @@ def use_prefill_with_paged_kv_state(
last_page_len += 1
token = prefill_with_paged_kv_state.set(state)
# Attention masks are padded, unpad.
if custom_mask is not None:
bs = input_lengths.shape[0]
seq_len = math.isqrt(custom_mask.numel() // bs)
custom_mask = unpad_2d_mask(
custom_mask.reshape(bs, seq_len, seq_len), input_lengths
)
try:
state.plan(
qo_indptr=cu_seqlens,