mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-13 13:24:53 +00:00
Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
8d029d2fc3 |
8
.github/workflows/build.yaml
vendored
8
.github/workflows/build.yaml
vendored
@ -175,6 +175,13 @@ jobs:
|
|||||||
registry: docker.io
|
registry: docker.io
|
||||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||||
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
password: ${{ secrets.DOCKERHUB_PASSWORD }}
|
||||||
|
- name: Login to Azure Container Registry
|
||||||
|
if: github.event_name != 'pull_request'
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.AZURE_DOCKER_USERNAME }}
|
||||||
|
password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
|
||||||
|
registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
|
||||||
# If pull request
|
# If pull request
|
||||||
- name: Extract metadata (tags, labels) for Docker
|
- name: Extract metadata (tags, labels) for Docker
|
||||||
if: ${{ github.event_name == 'pull_request' }}
|
if: ${{ github.event_name == 'pull_request' }}
|
||||||
@ -196,6 +203,7 @@ jobs:
|
|||||||
images: |
|
images: |
|
||||||
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
|
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
|
||||||
ghcr.io/huggingface/text-generation-inference
|
ghcr.io/huggingface/text-generation-inference
|
||||||
|
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
|
||||||
tags: |
|
tags: |
|
||||||
type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
|
type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
|
||||||
type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}
|
type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}
|
||||||
|
16
Cargo.lock
generated
16
Cargo.lock
generated
@ -4650,7 +4650,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-backends-trtllm"
|
name = "text-generation-backends-trtllm"
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"clap 4.5.32",
|
"clap 4.5.32",
|
||||||
@ -4671,7 +4671,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-benchmark"
|
name = "text-generation-benchmark"
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"average",
|
"average",
|
||||||
"clap 4.5.32",
|
"clap 4.5.32",
|
||||||
@ -4691,7 +4691,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-client"
|
name = "text-generation-client"
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"base64 0.22.1",
|
"base64 0.22.1",
|
||||||
@ -4709,7 +4709,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-launcher"
|
name = "text-generation-launcher"
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap 4.5.32",
|
"clap 4.5.32",
|
||||||
"ctrlc",
|
"ctrlc",
|
||||||
@ -4730,7 +4730,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-router"
|
name = "text-generation-router"
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-stream",
|
"async-stream",
|
||||||
@ -4782,7 +4782,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-router-llamacpp"
|
name = "text-generation-router-llamacpp"
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bindgen 0.71.1",
|
"bindgen 0.71.1",
|
||||||
@ -4800,7 +4800,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-router-v2"
|
name = "text-generation-router-v2"
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-stream",
|
"async-stream",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
@ -4849,7 +4849,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-router-v3"
|
name = "text-generation-router-v3"
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-stream",
|
"async-stream",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
|
@ -21,7 +21,7 @@ default-members = [
|
|||||||
resolver = "2"
|
resolver = "2"
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "3.3.5-dev0"
|
version = "3.3.5"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
authors = ["Olivier Dehaene"]
|
authors = ["Olivier Dehaene"]
|
||||||
homepage = "https://github.com/huggingface/text-generation-inference"
|
homepage = "https://github.com/huggingface/text-generation-inference"
|
||||||
|
@ -10,7 +10,7 @@
|
|||||||
"name": "Apache 2.0",
|
"name": "Apache 2.0",
|
||||||
"url": "https://www.apache.org/licenses/LICENSE-2.0"
|
"url": "https://www.apache.org/licenses/LICENSE-2.0"
|
||||||
},
|
},
|
||||||
"version": "3.3.5-dev0"
|
"version": "3.3.5"
|
||||||
},
|
},
|
||||||
"paths": {
|
"paths": {
|
||||||
"/": {
|
"/": {
|
||||||
|
@ -83,7 +83,7 @@ docker run \
|
|||||||
-e "HF_TOKEN=$HF_TOKEN" \
|
-e "HF_TOKEN=$HF_TOKEN" \
|
||||||
-v "$HOME/models:/app/models" \
|
-v "$HOME/models:/app/models" \
|
||||||
tgi-llamacpp \
|
tgi-llamacpp \
|
||||||
--n-gpu-layers 99 \
|
--n-gpu-layers 99
|
||||||
--model-id "Qwen/Qwen2.5-3B-Instruct"
|
--model-id "Qwen/Qwen2.5-3B-Instruct"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -27,14 +27,14 @@ For example, a system can generate 100 tokens per second. If the system generate
|
|||||||
|
|
||||||
<div class="block dark:hidden">
|
<div class="block dark:hidden">
|
||||||
<iframe
|
<iframe
|
||||||
src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
|
src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
|
||||||
width="850"
|
width="850"
|
||||||
height="350"
|
height="350"
|
||||||
></iframe>
|
></iframe>
|
||||||
</div>
|
</div>
|
||||||
<div class="hidden dark:block">
|
<div class="hidden dark:block">
|
||||||
<iframe
|
<iframe
|
||||||
src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
|
src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
|
||||||
width="850"
|
width="850"
|
||||||
height="350"
|
height="350"
|
||||||
></iframe>
|
></iframe>
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "google/gemma-3-4b-it",
|
"model": "google/gemma-3-4b-it",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 42,
|
"completion_tokens": 42,
|
||||||
"prompt_tokens": 277,
|
"prompt_tokens": 277,
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "google/gemma-3-4b-it",
|
"model": "google/gemma-3-4b-it",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 62,
|
"completion_tokens": 62,
|
||||||
"prompt_tokens": 277,
|
"prompt_tokens": 277,
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "google/gemma-3-4b-it",
|
"model": "google/gemma-3-4b-it",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 67,
|
"completion_tokens": 67,
|
||||||
"prompt_tokens": 277,
|
"prompt_tokens": 277,
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "google/gemma-3-4b-it",
|
"model": "google/gemma-3-4b-it",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 72,
|
"completion_tokens": 72,
|
||||||
"prompt_tokens": 275,
|
"prompt_tokens": 275,
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "google/gemma-3-4b-it",
|
"model": "google/gemma-3-4b-it",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 80,
|
"completion_tokens": 80,
|
||||||
"prompt_tokens": 279,
|
"prompt_tokens": 279,
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "google/gemma-3-4b-it",
|
"model": "google/gemma-3-4b-it",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 35,
|
"completion_tokens": 35,
|
||||||
"prompt_tokens": 32,
|
"prompt_tokens": 32,
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "google/gemma-3-4b-it",
|
"model": "google/gemma-3-4b-it",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 44,
|
"completion_tokens": 44,
|
||||||
"prompt_tokens": 37,
|
"prompt_tokens": 37,
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "unsloth/Llama-3.2-11B-Vision-Instruct",
|
"model": "unsloth/Llama-3.2-11B-Vision-Instruct",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 10,
|
"completion_tokens": 10,
|
||||||
"prompt_tokens": 45,
|
"prompt_tokens": 45,
|
||||||
@ -44,7 +44,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "unsloth/Llama-3.2-11B-Vision-Instruct",
|
"model": "unsloth/Llama-3.2-11B-Vision-Instruct",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 10,
|
"completion_tokens": 10,
|
||||||
"prompt_tokens": 45,
|
"prompt_tokens": 45,
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
"id": "",
|
"id": "",
|
||||||
"model": "unsloth/Llama-3.2-11B-Vision-Instruct",
|
"model": "unsloth/Llama-3.2-11B-Vision-Instruct",
|
||||||
"object": "chat.completion",
|
"object": "chat.completion",
|
||||||
"system_fingerprint": "3.3.5-dev0-native",
|
"system_fingerprint": "3.3.5-native",
|
||||||
"usage": {
|
"usage": {
|
||||||
"completion_tokens": 10,
|
"completion_tokens": 10,
|
||||||
"prompt_tokens": 45,
|
"prompt_tokens": 45,
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
from contextvars import ContextVar
|
from contextvars import ContextVar
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
import math
|
|
||||||
|
|
||||||
import flashinfer
|
import flashinfer
|
||||||
import torch
|
import torch
|
||||||
@ -21,20 +20,6 @@ decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = Contex
|
|||||||
workspace: Optional[torch.Tensor] = None
|
workspace: Optional[torch.Tensor] = None
|
||||||
|
|
||||||
|
|
||||||
def unpad_2d_mask(
|
|
||||||
attention_mask: torch.Tensor, seq_lengths: torch.Tensor
|
|
||||||
) -> torch.Tensor:
|
|
||||||
# Like torch unpad_sequence, but for 2D masks.
|
|
||||||
unpadded_tensors = []
|
|
||||||
for i, length in enumerate(seq_lengths):
|
|
||||||
unpadded_matrix = attention_mask[i, :length, :length]
|
|
||||||
unpadded_tensors.append(unpadded_matrix.flatten())
|
|
||||||
|
|
||||||
packed_tensor = torch.cat(unpadded_tensors)
|
|
||||||
|
|
||||||
return packed_tensor
|
|
||||||
|
|
||||||
|
|
||||||
def get_workspace(device):
|
def get_workspace(device):
|
||||||
"""Get shared flashinfer workspace."""
|
"""Get shared flashinfer workspace."""
|
||||||
global workspace
|
global workspace
|
||||||
@ -98,15 +83,6 @@ def use_prefill_with_paged_kv_state(
|
|||||||
last_page_len += 1
|
last_page_len += 1
|
||||||
|
|
||||||
token = prefill_with_paged_kv_state.set(state)
|
token = prefill_with_paged_kv_state.set(state)
|
||||||
|
|
||||||
# Attention masks are padded, unpad.
|
|
||||||
if custom_mask is not None:
|
|
||||||
bs = input_lengths.shape[0]
|
|
||||||
seq_len = math.isqrt(custom_mask.numel() // bs)
|
|
||||||
custom_mask = unpad_2d_mask(
|
|
||||||
custom_mask.reshape(bs, seq_len, seq_len), input_lengths
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
state.plan(
|
state.plan(
|
||||||
qo_indptr=cu_seqlens,
|
qo_indptr=cu_seqlens,
|
||||||
|
Loading…
Reference in New Issue
Block a user