Compare commits

1 Commit
main...v3.3.5

Author        SHA1        Message                Date
Alvaro Moran  8d029d2fc3  chore: release v3.3.5  2025-09-02 16:58:41 +02:00
16 changed files with 31 additions and 47 deletions

@@ -175,6 +175,13 @@ jobs:
           registry: docker.io
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      - name: Login to Azure Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
+          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
+          registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
       # If pull request
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name == 'pull_request' }}
@@ -196,6 +203,7 @@ jobs:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
             ghcr.io/huggingface/text-generation-inference
+            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
             type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
             type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}
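
The images and tags lists above combine multiplicatively: docker/metadata-action produces every tag pattern for every listed registry, so adding the Azure registry is all it takes to publish release images there as well. A toy Python sketch of that expansion (an illustration of the config's effect, not the action's actual implementation), assuming a release tag of v3.3.5 and an empty LABEL_EXTENSION:

# Toy expansion of the metadata-action config above (assumptions: git tag
# v3.3.5, empty LABEL_EXTENSION). Not the real action, just its tag arithmetic.
version = "3.3.5"  # metadata-action strips the leading "v" from the git tag
images = [
    "registry.internal.huggingface.tech/api-inference/community/text-generation-inference",
    "ghcr.io/huggingface/text-generation-inference",
    "db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference",
]
major, minor, _ = version.split(".")
for image in images:
    for tag in (version, f"{major}.{minor}"):  # {{version}}, {{major}}.{{minor}}
        print(f"{image}:{tag}")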

Cargo.lock (generated; 16 lines changed)

@@ -4650,7 +4650,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-backends-trtllm"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-trait",
  "clap 4.5.32",
@@ -4671,7 +4671,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-benchmark"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "average",
  "clap 4.5.32",
@@ -4691,7 +4691,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-client"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -4709,7 +4709,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-launcher"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "clap 4.5.32",
  "ctrlc",
@@ -4730,7 +4730,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "anyhow",
  "async-stream",
@@ -4782,7 +4782,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-llamacpp"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-trait",
  "bindgen 0.71.1",
@@ -4800,7 +4800,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-v2"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-stream",
  "async-trait",
@@ -4849,7 +4849,7 @@ dependencies = [
 
 [[package]]
 name = "text-generation-router-v3"
-version = "3.3.5-dev0"
+version = "3.3.5"
 dependencies = [
  "async-stream",
  "async-trait",

@@ -21,7 +21,7 @@ default-members = [
 resolver = "2"
 
 [workspace.package]
-version = "3.3.5-dev0"
+version = "3.3.5"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
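
This single [workspace.package] bump drives all of the Cargo.lock changes above, since the member crates inherit the workspace version. A hedged sanity-check sketch for such a release commit (the file path and the crate-name prefix are assumptions, not part of this diff):

# Hedged release check: every text-generation-* crate recorded in Cargo.lock
# should carry the new workspace version. Assumes it runs at the repo root.
import re

EXPECTED = "3.3.5"
lock = open("Cargo.lock").read()
# Cargo.lock places each package's version line directly after its name line.
pairs = re.findall(r'name = "(text-generation-[^"]+)"\nversion = "([^"]+)"', lock)
for name, version in pairs:
    assert version == EXPECTED, f"{name} is at {version}, expected {EXPECTED}"
print(f"{len(pairs)} workspace crates at {EXPECTED}")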

@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "3.3.5-dev0"
+    "version": "3.3.5"
   },
   "paths": {
     "/": {

@@ -83,7 +83,7 @@ docker run \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
     tgi-llamacpp \
-    --n-gpu-layers 99 \
+    --n-gpu-layers 99
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```

@@ -27,14 +27,14 @@ For example, a system can generate 100 tokens per second. If the system generate
 <div class="block dark:hidden">
 	<iframe
-		src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
+		src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
 		width="850"
 		height="350"
 	></iframe>
 </div>
 <div class="hidden dark:block">
 	<iframe
-		src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
+		src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
 		width="850"
 		height="350"
 	></iframe>

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 42,
     "prompt_tokens": 277,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 62,
     "prompt_tokens": 277,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 67,
     "prompt_tokens": 277,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 72,
     "prompt_tokens": 275,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 80,
     "prompt_tokens": 279,

@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 35,
     "prompt_tokens": 32,

@@ -14,7 +14,7 @@
   "id": "",
   "model": "google/gemma-3-4b-it",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 44,
     "prompt_tokens": 37,

@@ -18,7 +18,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,
@@ -44,7 +44,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,

@@ -17,7 +17,7 @@
   "id": "",
   "model": "unsloth/Llama-3.2-11B-Vision-Instruct",
   "object": "chat.completion",
-  "system_fingerprint": "3.3.5-dev0-native",
+  "system_fingerprint": "3.3.5-native",
   "usage": {
     "completion_tokens": 10,
     "prompt_tokens": 45,

@@ -1,7 +1,6 @@
 from typing import Optional
 from contextvars import ContextVar
 from contextlib import contextmanager
-import math
 
 import flashinfer
 import torch
@@ -21,20 +20,6 @@ decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = Contex
 
 workspace: Optional[torch.Tensor] = None
 
-
-def unpad_2d_mask(
-    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
-) -> torch.Tensor:
-    # Like torch unpad_sequence, but for 2D masks.
-    unpadded_tensors = []
-    for i, length in enumerate(seq_lengths):
-        unpadded_matrix = attention_mask[i, :length, :length]
-        unpadded_tensors.append(unpadded_matrix.flatten())
-    packed_tensor = torch.cat(unpadded_tensors)
-    return packed_tensor
-
-
 def get_workspace(device):
     """Get shared flashinfer workspace."""
     global workspace
@@ -98,15 +83,6 @@ def use_prefill_with_paged_kv_state(
         last_page_len += 1
 
     token = prefill_with_paged_kv_state.set(state)
 
-    # Attention masks are padded, unpad.
-    if custom_mask is not None:
-        bs = input_lengths.shape[0]
-        seq_len = math.isqrt(custom_mask.numel() // bs)
-        custom_mask = unpad_2d_mask(
-            custom_mask.reshape(bs, seq_len, seq_len), input_lengths
-        )
-
     try:
         state.plan(
             qo_indptr=cu_seqlens,
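
For reference, the helper removed above packed a batch of padded square attention masks into the flat layout used for flashinfer custom masks: for each sequence, only its length-by-length block survives, and the blocks are concatenated. A self-contained demo of that behavior on toy data (the helper body is copied from the removed code; the toy tensors are illustrative):

import torch

def unpad_2d_mask(
    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
) -> torch.Tensor:
    # Like torch unpad_sequence, but for 2D masks (body as removed above).
    unpadded_tensors = []
    for i, length in enumerate(seq_lengths):
        unpadded_tensors.append(attention_mask[i, :length, :length].flatten())
    return torch.cat(unpadded_tensors)

masks = torch.zeros(2, 4, 4, dtype=torch.bool)  # batch of 2, padded to 4x4
masks[0, :2, :2] = True  # sequence 0 really has length 2
masks[1, :3, :3] = True  # sequence 1 really has length 3
packed = unpad_2d_mask(masks, torch.tensor([2, 3]))
print(packed.shape)  # torch.Size([13]) == 2*2 + 3*3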