Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-09-13 05:14:52 +00:00.

Compare commits (6 commits)

Author | SHA1 | Date
---|---|---
 | 1b90c508af | 
 | d2ad7c484e | 
 | c6071749db | 
 | 4f067c22c3 | 
 | 9dedeb89ac | 
 | 5739b5b088 | 
.github/workflows/build.yaml (vendored): 8 changes
The Azure Container Registry login step and the corresponding image target are removed:

```diff
@@ -175,13 +175,6 @@ jobs:
           registry: docker.io
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - name: Login to Azure Container Registry
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
-          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
-          registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
       # If pull request
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name == 'pull_request' }}
@@ -203,7 +196,6 @@ jobs:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
             ghcr.io/huggingface/text-generation-inference
-            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
             type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
             type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}
```
In the llama.cpp backend docs, the `docker run` example gains a missing line-continuation backslash after `--n-gpu-layers 99`, without which the shell would treat `--model-id` as a separate command:

````diff
@@ -83,7 +83,7 @@ docker run \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
     tgi-llamacpp \
-    --n-gpu-layers 99
+    --n-gpu-layers 99 \
     --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```
 
````
In the streaming docs, the embedded demo Space moves from the `osanseviero` namespace to `huggingface`:

```diff
@@ -27,14 +27,14 @@ For example, a system can generate 100 tokens per second. If the system generate
 
 <div class="block dark:hidden">
     <iframe
-        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
+        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
         width="850"
         height="350"
     ></iframe>
 </div>
 <div class="hidden dark:block">
     <iframe
-        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
+        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
         width="850"
         height="350"
     ></iframe>
```
In the flashinfer attention module, `math` is imported and a helper that packs padded 2D attention masks is added:

```diff
@@ -1,6 +1,7 @@
 from typing import Optional
 from contextvars import ContextVar
 from contextlib import contextmanager
+import math
 
 import flashinfer
 import torch
@@ -20,6 +21,20 @@ decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = Contex
 workspace: Optional[torch.Tensor] = None
 
 
+def unpad_2d_mask(
+    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
+) -> torch.Tensor:
+    # Like torch unpad_sequence, but for 2D masks.
+    unpadded_tensors = []
+    for i, length in enumerate(seq_lengths):
+        unpadded_matrix = attention_mask[i, :length, :length]
+        unpadded_tensors.append(unpadded_matrix.flatten())
+
+    packed_tensor = torch.cat(unpadded_tensors)
+
+    return packed_tensor
+
+
 def get_workspace(device):
     """Get shared flashinfer workspace."""
     global workspace
```
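To make the helper's behavior concrete, here is a minimal usage sketch. It assumes `unpad_2d_mask` from the diff above is in scope; the shapes and lengths are made up for illustration:

```python
import torch

# Hypothetical batch of 2 causal masks padded to length 3,
# with true sequence lengths 2 and 3.
seq_lengths = torch.tensor([2, 3])
padded = torch.tril(torch.ones(2, 3, 3, dtype=torch.bool))

# Each mask is cropped to its top-left (length x length) block and
# flattened, then all blocks are concatenated into one packed tensor:
# 2*2 + 3*3 = 13 elements instead of the padded 2*3*3 = 18.
packed = unpad_2d_mask(padded, seq_lengths)
assert packed.numel() == sum(l * l for l in seq_lengths.tolist())
```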
In `use_prefill_with_paged_kv_state`, the custom mask is unpadded before the plan call:

```diff
@@ -83,6 +98,15 @@ def use_prefill_with_paged_kv_state(
         last_page_len += 1
 
     token = prefill_with_paged_kv_state.set(state)
+
+    # Attention masks are padded, unpad.
+    if custom_mask is not None:
+        bs = input_lengths.shape[0]
+        seq_len = math.isqrt(custom_mask.numel() // bs)
+        custom_mask = unpad_2d_mask(
+            custom_mask.reshape(bs, seq_len, seq_len), input_lengths
+        )
+
     try:
         state.plan(
             qo_indptr=cu_seqlens,
```
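The `math.isqrt` line recovers the padded sequence length: a flattened batch of square masks has `bs * seq_len * seq_len` elements, so dividing by the batch size and taking the integer square root inverts the flattening exactly. A small self-contained check, with made-up values:

```python
import math
import torch

bs, seq_len = 4, 7
# A flattened batch of square masks, as custom_mask arrives here.
custom_mask = torch.ones(bs * seq_len * seq_len, dtype=torch.bool)

# numel // bs == seq_len**2, and isqrt inverts the square exactly.
recovered = math.isqrt(custom_mask.numel() // bs)
assert recovered == seq_len
```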