Compare commits


6 Commits
v3.3.5 ... main

drbh · 1b90c508af · 2025-09-09 10:44:25 -04:00
Revert "Revert "feat: bump flake including transformers and huggingfa… (#3326)

    This reverts commit 9dedeb89ac.
Eliott C. · d2ad7c484e · 2025-09-09 15:36:19 +02:00
Update iframe sources for streaming demo (#3327)
Daniël de Kok · c6071749db · 2025-09-08 13:47:03 -04:00
Fix mask passed to flashinfer (#3324)

    Custom masks are padded to the shape `[batch_size, max_len, max_len]`.
    However, flashinfer expects an unpadded mask of the shape
    `[sum(q_len[i] * k_len[i] for i in range(batch_size))]`.

    This change unpads the custom mask (currently only used by Gemma 3)
    to this shape (assuming q_len == k_len, since we only use the custom
    mask during prefill).
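As a quick illustration of the shapes this commit message describes, here is a minimal sketch with invented lengths (plain `torch` only; the actual change is in the flashinfer diff further down):

```python
import torch

# Invented example: two prefill sequences of lengths 3 and 5.
seq_lengths = torch.tensor([3, 5])
batch_size, max_len = seq_lengths.numel(), int(seq_lengths.max())

# Padded custom mask: [batch_size, max_len, max_len].
padded = torch.ones(batch_size, max_len, max_len, dtype=torch.bool)

# Packed layout flashinfer expects: sum(q_len[i] * k_len[i]) elements,
# with q_len == k_len during prefill.
packed = torch.cat([padded[i, :n, :n].flatten() for i, n in enumerate(seq_lengths)])

assert padded.numel() == 2 * 5 * 5       # 50 padded elements
assert packed.numel() == 3 * 3 + 5 * 5   # 34 packed elements
```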
drbh · 4f067c22c3 · 2025-09-08 13:41:45 -04:00
fix: remove azure (#3325)
drbh · 9dedeb89ac · 2025-09-08 12:17:29 +02:00
Revert "feat: bump flake including transformers and huggingface_hub versions" (#3323)

    This reverts commit 356de85c29.
Phil · 5739b5b088 · 2025-09-06 09:50:14 +02:00
Add missing backslash (#3311)
4 changed files with 27 additions and 11 deletions

.github/workflows/build.yaml · View File

@@ -175,13 +175,6 @@ jobs:
           registry: docker.io
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - name: Login to Azure Container Registry
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
-          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
-          registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
       # If pull request
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name == 'pull_request' }}
@@ -203,7 +196,6 @@ jobs:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
             ghcr.io/huggingface/text-generation-inference
-            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
             type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
             type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}

docs/source/backends/llamacpp.md · View File

@@ -83,7 +83,7 @@ docker run \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
     tgi-llamacpp \
-    --n-gpu-layers 99
+    --n-gpu-layers 99 \
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```

docs/source/conceptual/streaming.md · View File

@@ -27,14 +27,14 @@ For example, a system can generate 100 tokens per second. If the system generate
 <div class="block dark:hidden">
     <iframe
-        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
+        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
         width="850"
         height="350"
     ></iframe>
 </div>
 <div class="hidden dark:block">
     <iframe
-        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
+        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
         width="850"
         height="350"
     ></iframe>

server/text_generation_server/layers/attention/flashinfer.py · View File

@@ -1,6 +1,7 @@
 from typing import Optional
 from contextvars import ContextVar
 from contextlib import contextmanager
+import math
 import flashinfer
 import torch
@@ -20,6 +21,20 @@ decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = Contex
 workspace: Optional[torch.Tensor] = None

+
+def unpad_2d_mask(
+    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
+) -> torch.Tensor:
+    # Like torch unpad_sequence, but for 2D masks.
+    unpadded_tensors = []
+    for i, length in enumerate(seq_lengths):
+        unpadded_matrix = attention_mask[i, :length, :length]
+        unpadded_tensors.append(unpadded_matrix.flatten())
+    packed_tensor = torch.cat(unpadded_tensors)
+    return packed_tensor
+
+
 def get_workspace(device):
     """Get shared flashinfer workspace."""
     global workspace
@@ -83,6 +98,15 @@ def use_prefill_with_paged_kv_state(
         last_page_len += 1

     token = prefill_with_paged_kv_state.set(state)
+
+    # Attention masks are padded, unpad.
+    if custom_mask is not None:
+        bs = input_lengths.shape[0]
+        seq_len = math.isqrt(custom_mask.numel() // bs)
+        custom_mask = unpad_2d_mask(
+            custom_mask.reshape(bs, seq_len, seq_len), input_lengths
+        )
+
     try:
         state.plan(
             qo_indptr=cu_seqlens,
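As a self-contained sanity check of the unpadding added above, the sketch below copies `unpad_2d_mask` from the hunk and runs it on an invented batch; the packed length matches `sum(q_len[i] * k_len[i])`:

```python
import torch

def unpad_2d_mask(
    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
) -> torch.Tensor:
    # Like torch unpad_sequence, but for 2D masks: keep only the
    # top-left length x length block of each padded mask, flattened.
    unpadded_tensors = []
    for i, length in enumerate(seq_lengths):
        unpadded_matrix = attention_mask[i, :length, :length]
        unpadded_tensors.append(unpadded_matrix.flatten())
    return torch.cat(unpadded_tensors)

# Invented batch: three sequences padded to max_len = 4.
input_lengths = torch.tensor([2, 4, 3])
mask = torch.ones(3, 4, 4, dtype=torch.bool)

packed = unpad_2d_mask(mask, input_lengths)
assert packed.shape == (2 * 2 + 4 * 4 + 3 * 3,)  # 29 = sum(q_len[i] * k_len[i])
```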