Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-09-13 05:14:52 +00:00).
Compare commits: 6 commits

- 1b90c508af
- d2ad7c484e
- c6071749db
- 4f067c22c3
- 9dedeb89ac
- 5739b5b088
.github/workflows/build.yaml (vendored, 8 lines changed)
The workflow change drops the "Login to Azure Container Registry" step and stops publishing the azurecr.io image:

```diff
@@ -175,13 +175,6 @@ jobs:
           registry: docker.io
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      - name: Login to Azure Container Registry
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
-          password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
-          registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
       # If pull request
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ github.event_name == 'pull_request' }}
@@ -203,7 +196,6 @@ jobs:
           images: |
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
             ghcr.io/huggingface/text-generation-inference
-            db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
           tags: |
             type=semver,pattern={{version}}${{ env.LABEL_EXTENSION }}
             type=semver,pattern={{major}}.{{minor}}${{ env.LABEL_EXTENSION }}
```
The next hunk fixes the `docker run` example in the docs (judging by the `tgi-llamacpp` image, the llama.cpp backend guide): the last flag was missing its line continuation, and an explicit `--model-id` is added.

````diff
@@ -83,7 +83,7 @@ docker run \
     -e "HF_TOKEN=$HF_TOKEN" \
     -v "$HOME/models:/app/models" \
     tgi-llamacpp \
-    --n-gpu-layers 99
+    --n-gpu-layers 99 \
+    --model-id "Qwen/Qwen2.5-3B-Instruct"
 ```
````
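As a quick smoke test of a server started this way, you can hit TGI's `/generate` endpoint. A minimal sketch, assuming the container was run with a port mapping such as `-p 8080:80` (the mapping is not shown in the hunk above; adjust to your setup):

```python
# Smoke test for a running TGI server (the port mapping is an assumption).
import requests

resp = requests.post(
    "http://localhost:8080/generate",
    json={
        "inputs": "What is deep learning?",
        "parameters": {"max_new_tokens": 20},
    },
)
print(resp.json()["generated_text"])
```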
This hunk updates the streaming-vs-non-streaming demo embeds in the docs, pointing the light and dark iframes at the Space's new location under the `huggingface` namespace:

```diff
@@ -27,14 +27,14 @@ For example, a system can generate 100 tokens per second. If the system generate
 
 <div class="block dark:hidden">
     <iframe
-        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
+        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=light"
         width="850"
         height="350"
     ></iframe>
 </div>
 <div class="hidden dark:block">
     <iframe
-        src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
+        src="https://huggingface-streaming-vs-non-streaming.hf.space?__theme=dark"
         width="850"
         height="350"
     ></iframe>
```
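The docs around this hunk compare streaming and non-streaming responses (per the hunk header, the example is a system generating 100 tokens per second). A minimal sketch of consuming TGI's streaming API with `huggingface_hub` (the server URL is a placeholder):

```python
# Stream tokens from a TGI server as they are generated
# (the base URL is a placeholder; point it at your own deployment).
from huggingface_hub import InferenceClient

client = InferenceClient("http://localhost:8080")
for token in client.text_generation(
    "What is deep learning?", max_new_tokens=20, stream=True
):
    print(token, end="", flush=True)
```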
The remaining hunks are in the Python server's flashinfer attention wrapper. First, `import math` is added for the mask unpadding logic below:

```diff
@@ -1,6 +1,7 @@
 from typing import Optional
 from contextvars import ContextVar
 from contextlib import contextmanager
+import math
 
 import flashinfer
 import torch
```
```diff
@@ -20,6 +21,20 @@ decode_state: ContextVar[flashinfer.BatchDecodeWithPagedKVCacheWrapper] = Contex
 workspace: Optional[torch.Tensor] = None
 
 
+def unpad_2d_mask(
+    attention_mask: torch.Tensor, seq_lengths: torch.Tensor
+) -> torch.Tensor:
+    # Like torch unpad_sequence, but for 2D masks.
+    unpadded_tensors = []
+    for i, length in enumerate(seq_lengths):
+        unpadded_matrix = attention_mask[i, :length, :length]
+        unpadded_tensors.append(unpadded_matrix.flatten())
+
+    packed_tensor = torch.cat(unpadded_tensors)
+
+    return packed_tensor
+
+
 def get_workspace(device):
     """Get shared flashinfer workspace."""
     global workspace
```
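A toy illustration of what the new helper produces (shapes and causal masks are made up for the example; it assumes `unpad_2d_mask` from the hunk above is in scope):

```python
import torch

lengths = torch.tensor([2, 3])  # true sequence lengths
bs, max_len = 2, 4              # batch of 2, padded to length 4
# Padded (bs, max_len, max_len) masks; only the top-left
# (length x length) corner of each is meaningful.
mask = torch.zeros(bs, max_len, max_len, dtype=torch.bool)
mask[0, :2, :2] = torch.tril(torch.ones(2, 2, dtype=torch.bool))
mask[1, :3, :3] = torch.tril(torch.ones(3, 3, dtype=torch.bool))

packed = unpad_2d_mask(mask, lengths)
print(packed.shape)  # torch.Size([13]): 2*2 + 3*3 elements, flattened and concatenated
```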
```diff
@@ -83,6 +98,15 @@ def use_prefill_with_paged_kv_state(
         last_page_len += 1
 
     token = prefill_with_paged_kv_state.set(state)
+
+    # Attention masks are padded, unpad.
+    if custom_mask is not None:
+        bs = input_lengths.shape[0]
+        seq_len = math.isqrt(custom_mask.numel() // bs)
+        custom_mask = unpad_2d_mask(
+            custom_mask.reshape(bs, seq_len, seq_len), input_lengths
+        )
+
     try:
         state.plan(
             qo_indptr=cu_seqlens,
```
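The `seq_len` recovery here works because the padded mask arrives flattened with `bs * seq_len * seq_len` elements, so `math.isqrt(numel // bs)` is exact. A quick sanity check of that arithmetic (toy numbers, not from the source):

```python
import math
import torch

bs, seq_len = 3, 5
custom_mask = torch.ones(bs * seq_len * seq_len)  # flattened padded mask
assert math.isqrt(custom_mask.numel() // bs) == seq_len  # 75 // 3 = 25; isqrt(25) = 5
```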