Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-24 00:12:08 +00:00)
feat: add neuron backend

commit 9c25afb832
parent 76bcb4948d

173  Dockerfile.neuron  Normal file
@@ -0,0 +1,173 @@
# Fetch and extract the TGI sources
FROM alpine AS tgi
RUN mkdir -p /tgi

# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
FROM alpine AS optimum-neuron
RUN mkdir -p /optimum-neuron
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note: we cannot use the cargo-chef base image as it uses python 3.11
FROM ubuntu:22.04 AS chef

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        curl ca-certificates build-essential \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked

WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY backends/neuron/Cargo.toml Cargo.toml
COPY Cargo.lock Cargo.lock
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        unzip python3-dev libssl-dev pkg-config \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY backends/neuron/Cargo.toml Cargo.toml
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.lock Cargo.lock
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY router router
COPY backends backends
COPY launcher launcher
# Remove this line once TGI has fixed the conflict
RUN cargo update ureq --precise 2.9.7
RUN cargo build --release

# Python base image
FROM ubuntu:22.04 AS base

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        python3-pip \
        python3-setuptools \
        python-is-python3 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

# Python server build image
FROM base AS pyserver

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        make \
        python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN install -d /pyserver
WORKDIR /pyserver
COPY backends/neuron/server server
COPY proto proto
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server package

# Neuron base image (used for deployment)
FROM base AS neuron

# Install system prerequisites
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        gnupg2 \
        wget \
        python3-dev \
        libexpat1 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

# Install neuronx packages
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
        aws-neuronx-dkms=2.18.20.0 \
        aws-neuronx-collectives=2.22.33.0-d2128d1aa \
        aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \
        aws-neuronx-tools=2.19.0.0 \
        libxml2 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

RUN pip3 install \
    neuronx-cc==2.15.143.0 \
    torch-neuronx==2.1.2.2.3.2 \
    transformers-neuronx==0.12.313 \
    neuronx-distributed==0.9.0 \
    libneuronxla==2.0.5347.0 \
    --extra-index-url=https://pip.repos.neuron.amazonaws.com

# Install HuggingFace packages
RUN pip3 install \
    hf_transfer huggingface_hub

# Install optimum-neuron
COPY --from=optimum-neuron /optimum-neuron optimum-neuron
RUN pip3 install ./optimum-neuron

# TGI base env
ENV HUGGINGFACE_HUB_CACHE=/tmp \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Disable color logs as they are not supported by CloudWatch
ENV LOGURU_COLORIZE=NO
ENV LOG_COLORIZE=0

# Install router
COPY --from=builder /usr/src/target/release/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz

# AWS Sagemaker compatible image
FROM neuron AS sagemaker

COPY backends/neuron/sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM neuron

COPY backends/neuron/tgi_env.py /tgi_env.py
COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
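For reference, a minimal sketch of how the final image might be run on a Neuron host. This is an assumption, not part of the commit: the image tag comes from the Makefile below, the environment variables are the ones read by the python server (see `model.py` further down), the port matches the `PORT=80` default set above, and the `--device` flags and model id are placeholders that depend on your host and model. It also assumes the `tgi-entrypoint.sh` (not shown in this diff) forwards its arguments to `text-generation-launcher`, as TGI images normally do.

```shell
# Hypothetical example only: serve a model with the image built from this Dockerfile.
# Adjust the device list, image tag, model id and sizing values to your setup.
docker run -p 8080:80 \
    --device=/dev/neuron0 \
    -e MAX_BATCH_SIZE=4 \
    -e MAX_TOTAL_TOKENS=4096 \
    -e HF_NUM_CORES=2 \
    -e HF_AUTO_CAST_TYPE=bf16 \
    text-generation-inference:latest-neuron \
    --model-id my-org/my-neuron-model
```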

47  backends/neuron/Cargo.toml  Normal file
@@ -0,0 +1,47 @@
[workspace]
members = [
    "backends/v2",
    "backends/grpc-metadata",
    "launcher",
    "router"
]
default-members = [
    "backends/v2",
    "backends/grpc-metadata",
    "launcher",
    "router"
]
resolver = "2"

[workspace.package]
version = "3.0.0"
edition = "2021"
authors = ["Olivier Dehaene"]
homepage = "https://github.com/huggingface/text-generation-inference"

[workspace.dependencies]
base64 = "0.22.0"
tokenizers = { version = "0.20.0", features = ["http"] }
hf-hub = { version = "0.3.1", features = ["tokio"] }
metrics = { version = "0.23.0" }
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
minijinja = { version = "2.2.0", features = ["json"] }
minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
pyo3 = { version = "0.22.2", features = ["auto-initialize"] }

[profile.release]
incremental = true

[profile.release-binary]
inherits = "release"
debug = 1
incremental = true
panic = "abort"

[profile.release-opt]
inherits = "release"
debug = 0
incremental = false
lto = "fat"
opt-level = 3
codegen-units = 1

28  backends/neuron/Makefile  Normal file
@@ -0,0 +1,28 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
root_dir := "${mkfile_dir}/../.."

.PHONY: image

VERSION := $(shell gawk 'match($$0, /^version = "(.*)"/, a) {print a[1]}' ${root_dir}/Cargo.toml)

image:
	docker build --rm -f ${root_dir}/Dockerfile.neuron \
	  --build-arg VERSION=$(VERSION) \
	  --ulimit nofile=100000:100000 \
	  -t text-generation-inference:$(VERSION)-neuron ${root_dir}
	docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron

25  backends/neuron/README.md  Normal file
@@ -0,0 +1,25 @@
# Text-generation-inference - Neuron backend for AWS Trainium and Inferentia2

## Description

This is the TGI backend for the AWS Neuron family of chips (Trainium and Inferentia).

This backend is composed of:
- the AWS Neuron SDK,
- the legacy v2 TGI launcher and router,
- a Neuron-specific inference server for text-generation.

## Usage

Please refer to the official [documentation](https://huggingface.co/docs/text-generation-inference/backends/neuron).

## Build your own image

The simplest way to build TGI with the neuron backend is to use the provided `Makefile`:

```shell
$ make -C backends/neuron image
```

Alternatively, you can build the image directly from the top directory using a command similar to the one defined
in the `Makefile` under the `image` target, for example:
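As a concrete illustration of that alternative, the sketch below expands the `image` target by hand; it is run from the repository root, and the `3.0.0` version string is taken from the workspace `Cargo.toml` above, so treat it as an assumption if your checkout differs.

```shell
# Sketch of the docker build behind `make -C backends/neuron image`.
# Adjust VERSION to match the version field in Cargo.toml.
docker build --rm -f Dockerfile.neuron \
    --build-arg VERSION=3.0.0 \
    --ulimit nofile=100000:100000 \
    -t text-generation-inference:3.0.0-neuron .
```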

22  backends/neuron/sagemaker-entrypoint.sh  Normal file
@@ -0,0 +1,22 @@
#!/bin/bash

if [[ -z "${HF_MODEL_ID}" ]]; then
    echo "HF_MODEL_ID must be set"
    exit 1
fi
export MODEL_ID="${HF_MODEL_ID}"

if [[ -n "${HF_MODEL_REVISION}" ]]; then
    export REVISION="${HF_MODEL_REVISION}"
fi

if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then
    export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}"
fi

if [[ -z "${MAX_BATCH_SIZE}" ]]; then
    echo "MAX_BATCH_SIZE must be set to the model static batch size"
    exit 1
fi

text-generation-launcher --port 8080
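To make the variable mapping above concrete, here is a hedged sketch of the environment a SageMaker-style deployment might provide before this entrypoint runs; the values are placeholders, only the variable names and the required/optional split come from the script itself.

```shell
# Illustrative environment for the sagemaker entrypoint (values are placeholders).
export HF_MODEL_ID="my-org/my-neuron-model"   # required, re-exported as MODEL_ID
export HF_MODEL_REVISION="main"               # optional, re-exported as REVISION
export HF_MODEL_TRUST_REMOTE_CODE="false"     # optional, re-exported as TRUST_REMOTE_CODE
export MAX_BATCH_SIZE="4"                     # required, must match the compiled static batch size
./entrypoint.sh                               # ends with: text-generation-launcher --port 8080
```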

1  backends/neuron/server/.gitignore  vendored  Normal file
@@ -0,0 +1 @@
build

57  backends/neuron/server/Makefile  Normal file
@@ -0,0 +1,57 @@
# Initialize base variables
pkg_name := text_generation_server
BUILDDIR ?= $(CURDIR)/build
VERSION ?= 0.0.1
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
mkfile_dir := $(dir $(mkfile_path))
pkg_dir := $(BUILDDIR)/$(pkg_name)
pkg_dist := ${BUILDDIR}/dist/${pkg_name}-${VERSION}.tar.gz

clean:
	rm -rf $(BUILDDIR)/*

# List static sources to be deployed in the package
src_dir := $(mkfile_dir)/$(pkg_name)
sources := $(wildcard $(src_dir)/*.py)
deployed_sources := $(subst $(src_dir), $(pkg_dir), $(sources))

# Static files are just copied

define COPY
	cp -f $< $@
endef

$(BUILDDIR)/pyproject.toml: $(mkfile_dir)/pyproject.toml
	mkdir -p $(BUILDDIR)
	$(COPY)
	sed -i -e 's/version = "VERSION"/version = \"${VERSION}\"/' $@

$(pkg_dir)/%.py: $(src_dir)/%.py
	mkdir -p $(pkg_dir)
	$(COPY)

# Generated files are produced by grpcio tools

# If not provided, get local proto files
ifndef PROTODIR
PROTODIR := $(mkfile_dir)/../../../proto
endif

# Three python files are generated for each protobuf
protobufs := $(PROTODIR)/generate.proto
pkg_pb_dir := $(pkg_dir)/pb
generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py))
generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi))
generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py))

$(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PROTODIR)/%.proto
	mkdir -p $(pkg_pb_dir)
	python -m grpc_tools.protoc -I$(PROTODIR) --python_out=$(pkg_pb_dir) \
		--grpc_python_out=$(pkg_pb_dir) --mypy_out=$(pkg_pb_dir) $^
	sed -i -e 's/^\(import.*pb2\)/from . \1/g' $(pkg_pb_dir)/$*_pb2_grpc.py

${pkg_dist}: $(BUILDDIR)/pyproject.toml $(deployed_sources) $(generated_sources)
	python -m build $(BUILDDIR)

package: ${pkg_dist}
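This is the same packaging step the `pyserver` stage of `Dockerfile.neuron` runs. A hedged sketch of invoking it locally, assuming you are at the repository root with the build requirements installed; the build directory path is an arbitrary choice:

```shell
# Sketch of building the python server package outside of Docker
# (mirrors the pyserver stage in Dockerfile.neuron).
pip install -r backends/neuron/server/build-requirements.txt
VERBOSE=1 BUILDDIR=$(pwd)/build PROTODIR=$(pwd)/proto \
    make -C backends/neuron/server package
# The resulting sdist lands in $BUILDDIR/dist and can then be installed:
pip install build/dist/text_generation_server-*.tar.gz
```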

3  backends/neuron/server/build-requirements.txt  Normal file
@@ -0,0 +1,3 @@
build
grpcio-tools==1.53.0
mypy-protobuf

25  backends/neuron/server/pyproject.toml  Normal file
@@ -0,0 +1,25 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "text-generation-server"
version = "VERSION"
authors = [{name="David Corvoysier", email="david@huggingface.co"}]
description = "TGI compatible inference server for AWS Neuronx platforms"
dependencies = [
    'protobuf > 3.20.1, < 4',
    'grpcio == 1.57.0',
    'grpcio-status == 1.48.2',
    'grpcio-reflection == 1.48.2',
    'grpc-interceptor == 0.15.2',
    'typer == 0.6.1',
    'safetensors',
    'loguru == 0.6.0'
]

[tool.setuptools]
packages = ["text_generation_server", "text_generation_server.pb"]

[project.scripts]
text-generation-server = 'text_generation_server.cli:app'

111  backends/neuron/server/text_generation_server/cli.py  Normal file
@@ -0,0 +1,111 @@
import sys
from typing import Optional

import typer
from loguru import logger


app = typer.Typer()


@app.command()
def serve(
    model_id: str,
    revision: Optional[str] = None,
    sharded: bool = False,
    trust_remote_code: bool = None,
    uds_path: str = "/tmp/text-generation-server",
    logger_level: str = "INFO",
    json_output: bool = False,
    otlp_endpoint: Optional[str] = None,
    otlp_service_name: str = "text-generation-inference.server",
    max_input_tokens: Optional[int] = None,
):
    """This is the main entry-point for the server CLI.

    Args:
        model_id (`str`):
            The *model_id* of a model on the HuggingFace hub or the path to a local model.
        revision (`Optional[str]`, defaults to `None`):
            The revision of the model on the HuggingFace hub.
        sharded (`bool`):
            Whether the model must be sharded or not. Kept for compatibility with the
            text-generation-launcher, but must be set to False.
        trust_remote_code (`bool`):
            Kept for compatibility with text-generation-launcher. Ignored.
        uds_path (`Union[Path, str]`):
            The local path on which the server will expose its google RPC services.
        logger_level (`str`):
            The server logger level. Defaults to *INFO*.
        json_output (`bool`):
            Use JSON format for log serialization.
        otlp_endpoint (`Optional[str]`, defaults to `None`):
            The Open Telemetry endpoint to use.
        otlp_service_name (`Optional[str]`, defaults to `None`):
            The name to use when pushing data to the Open Telemetry endpoint.
        max_input_tokens (`Optional[int]`, defaults to `None`):
            The maximum number of input tokens each request should contain.
    """
    if sharded:
        raise ValueError("Sharding is not supported.")
    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    if trust_remote_code is not None:
        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")

    # Import here after the logger is added to log potential import exceptions
    from .server import serve

    serve(model_id, revision, uds_path)


@app.command()
def download_weights(
    model_id: str,
    revision: Optional[str] = None,
    logger_level: str = "INFO",
    json_output: bool = False,
    auto_convert: Optional[bool] = None,
    extension: Optional[str] = None,
    trust_remote_code: Optional[bool] = None,
    merge_lora: Optional[bool] = None,
):
    """Download the model weights.

    This command will be called by text-generation-launcher before serving the model.
    """
    # Remove default handler
    logger.remove()
    logger.add(
        sys.stdout,
        format="{message}",
        filter="text_generation_server",
        level=logger_level,
        serialize=json_output,
        backtrace=True,
        diagnose=False,
    )

    if extension is not None:
        logger.warning("'extension' argument is not supported and will be ignored.")
    if trust_remote_code is not None:
        logger.warning("'trust_remote_code' argument is not supported and will be ignored.")
    if auto_convert is not None:
        logger.warning("'auto_convert' argument is not supported and will be ignored.")
    if merge_lora is not None:
        logger.warning("'merge_lora' argument is not supported and will be ignored.")

    # Import here after the logger is added to log potential import exceptions
    from .model import fetch_model

    fetch_model(model_id, revision)
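For orientation, a hedged sketch of how the two commands above are typically invoked once the package is installed; normally `text-generation-launcher` calls them, not a human. The model id is a placeholder and the option spellings assume typer's default underscore-to-dash conversion.

```shell
# Hypothetical manual invocations of the console script defined in pyproject.toml.
# MODEL_ID is a placeholder for a hub model id or a local path.
text-generation-server download-weights "$MODEL_ID"
text-generation-server serve "$MODEL_ID" \
    --uds-path /tmp/text-generation-server \
    --logger-level INFO
```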

636  backends/neuron/server/text_generation_server/generator.py  Normal file
@@ -0,0 +1,636 @@
import copy
import logging
import time
from abc import ABC
from enum import Enum
from typing import List, Optional, Tuple

import torch
from loguru import logger
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
from transformers.generation import GenerationConfig

from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.generation import TokenSelector

from .model import get_export_kwargs_from_env
from .pb.generate_pb2 import (
    Batch,
    CachedBatch,
    FinishReason,
    GeneratedText,
    Generation,
    InfoResponse,
    Request,
    Tokens,
)


# Disable optimum-neuron warnings as it seems to block the server after a while
optimum_logger = logging.getLogger("optimum.neuron")
optimum_logger.setLevel("CRITICAL")


class Generator(ABC):
    """An abstract class to represent the workhorse behind TextGenerationService.

    Ideally, it should not rely on protobuf constructs, but in a first step it does.
    Implementations would typically need a model and a tokenizer to implement the Generator methods.
    """

    @property
    def info(self) -> InfoResponse:
        """This should simply return the expected InfoResponse"""
        raise NotImplementedError

    def warmup(self, batch: Batch) -> int:
        """Verify if the hardware can support the target load.

        Args:
            batch (`Batch`):
                A batch corresponding to the maximum number of concurrent requests.

        Return:
            The maximum number of tokens the model supports.
        """
        raise NotImplementedError

    def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
        """Prefill is called whenever new requests need to be added.

        When this method returns successfully, a decode method will follow
        with both the current and newly prefilled batch(es).

        Args:
            batch (`Batch`):
                A batch containing the new requests.

        Return:
            A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
        """
        raise NotImplementedError

    def decode(self, batches: List[Batch]) -> Tuple[List[Generation], CachedBatch]:
        """Decode after a prefill or another decode."""
        raise NotImplementedError

    def filter(self, batch_id: int, request_ids: List[int]) -> CachedBatch:
        """Remove requests that are not listed from the specified batch"""
        raise NotImplementedError

    def clear(self):
        """Remove all requests from the generator"""
        raise NotImplementedError

    @classmethod
    def from_pretrained(cls, model_id: str, revision: Optional[str]):
        """Factory method "a la transformers" """
        raise NotImplementedError


class Slot:
    """Represents a slot in a static batch"""

    class State(Enum):
        EMPTY = 0
        PAUSE = 1
        READY = 2

    def __init__(self, id: int, tokenizer: PreTrainedTokenizerBase):
        self._id = id
        self._tokenizer = tokenizer
        self.clear()

    def clear(self):
        """Clear the slot and mark it as available."""
        self._state = Slot.State.EMPTY
        self._batch_id = None
        self._request_id = None
        self._inputs = ""
        self._truncate = 0
        self._generation_config = None
        self._tokens = []
        self._mask = torch.tensor([])
        self._selector = None
        self._generated_tokens = 0
        self._next_text_token_start = 0
        self._next_text_token_end = 0
        self._generated_text = ""
        self._next_text = ""

    @property
    def id(self) -> int:
        return self._id

    @property
    def state(self) -> "Slot.State":
        return self._state

    @property
    def batch_id(self) -> int:
        return self._batch_id

    @property
    def request_id(self) -> int:
        return self._request_id

    @property
    def cached_text(self) -> str:
        return self._inputs + self._generated_text

    @property
    def generation_config(self) -> GenerationConfig:
        return self._generation_config

    @property
    def generated_tokens(self) -> int:
        return self._generated_tokens

    def assign(self, batch_id: int, request: Request, generation_config: GenerationConfig):
        """Assign a request to a slot.

        Args:
            request (`Request`):
                The request to be assigned. Contains the inputs and tokens selection parameters.
            generation_config (`transformers.GenerationConfig`):
                The base generation config (might be modified by the request generation parameters).
        """
        self._state = Slot.State.READY
        self._batch_id = batch_id
        self._request_id = request.id
        self._inputs = request.inputs
        if request.truncate:
            self._truncate = request.truncate
        self._generation_config = copy.deepcopy(generation_config)
        # Update generation config with request parameters
        self._generation_config.do_sample = request.parameters.do_sample
        if self._generation_config.do_sample:
            if request.parameters.temperature != 0:
                self._generation_config.temperature = request.parameters.temperature
            if request.parameters.top_k != 0:
                self._generation_config.top_k = request.parameters.top_k
            if request.parameters.top_p != 0:
                self._generation_config.top_p = request.parameters.top_p
            if request.parameters.typical_p != 0:
                self._generation_config.typical_p = request.parameters.typical_p
        if request.parameters.repetition_penalty != 0:
            self._generation_config.repetition_penalty = request.parameters.repetition_penalty
        self.seed = request.parameters.seed
        self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens
        self._max_new_tokens = self._generation_config.max_new_tokens
        stop_strings = request.stopping_parameters.stop_sequences
        if stop_strings:
            self._generation_config.stop_strings = stop_strings

    def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, selector: TokenSelector):
        """Reset the slot for the next generation.

        Args:
            input_ids: (`torch.LongTensor`):
                The new input_ids to use to generate the next token.
            attention_mask: (`torch.LongTensor`):
                The new attention_mask to use to generate the next token.
            selector: (`optimum.neuron.generation.TokenSelector`):
                An object implementing the updated token selection logic.
        """
        self._tokens = input_ids.clone()
        self._next_text_token_start = 0
        self._next_text_token_end = torch.numel(self._tokens)
        self._next_text = ""
        self._mask = attention_mask.clone()
        self._selector = selector

    def pause(self, reset_on_pause: bool):
        """Mark the current slot as paused for generation.

        Note that the KV cache for this slot will still be filled.
        """
        if reset_on_pause:
            # Drop the last token as it will be added back when resuming the slot
            self._generated_tokens -= 1
            # Since generated tokens are now part of the prefill, we need to reevaluate
            # max_new_tokens for the next generation
            self._generation_config.max_new_tokens = self._max_new_tokens - self._generated_tokens
        self._state = Slot.State.PAUSE

    def resume(self):
        """Mark the slot as ready for generation."""
        self._state = Slot.State.READY

    def _decode_next_tokens(
        self,
    ) -> str:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
        # We need to include the tokens that produced the last text to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        new_text = self._tokenizer.decode(self._tokens[self._next_text_token_start :], skip_special_tokens=False)
        if new_text.endswith("�"):
            # utf-8 char at the end means it's a potential unfinished byte sequence
            # from byte fallback tokenization.
            return ""

        # Compare the generated text with the one using only the tokens producing the last one
        last_text = self._tokenizer.decode(
            self._tokens[self._next_text_token_start : self._next_text_token_end],
            skip_special_tokens=False,
        )
        if len(new_text) == len(last_text):
            # Nothing new was actually generated
            return ""
        # Return the decoded text and store its token offsets
        self._next_text_token_start = self._next_text_token_end
        self._next_text_token_end = torch.numel(self._tokens)
        return new_text[len(last_text) :]

    def append(self, next_token: int) -> str:
        """Append a new generated token to this slot

        The new token is added to the list of generated tokens, which impacts
        directly the generated_text and stopped property.

        The new token is however not added immediately to the slot inputs: it will
        be added later on when it has effectively been used to produce the next token.

        Args:
            next_token (`int`):
                The newly generated token.

        Return:
            The corresponding decoded text (if any).
        """
        self._tokens = torch.cat([self._tokens, torch.LongTensor([next_token])])
        self._mask = torch.cat([self._mask, torch.LongTensor([1])])
        self._generated_tokens += 1
        next_text = self._decode_next_tokens()
        # Now that a new token has been generated, we can append the previous one to the generated text
        self._generated_text += self._next_text
        self._next_text = next_text
        return next_text

    def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor:
        """Select the next token from the candidate logits.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation (not used in all generation modes).
            logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The logits corresponding to the generated tokens.

        Return:
            `torch.LongTensor`: A scalar `torch.LongTensor` containing the selected token.
        """
        return self._selector.select(input_ids, logits)[0]

    @property
    def stopped(self) -> bool:
        # Transformers stopping criteria expects a batch of input ids
        input_ids = torch.unsqueeze(self._tokens, dim=0)
        return self._selector.stopping_criteria(input_ids, None)

    @property
    def generated_text(self) -> str:
        return self._generated_text + self._next_text

    @property
    def next_token(self) -> int:
        return None if len(self._tokens) == 0 else self._tokens[-1]

    @property
    def attention_mask(self) -> torch.LongTensor:
        return self._mask

    @property
    def max_token(self) -> int:
        return self._generation_config.max_length

    @property
    def max_new_tokens(self) -> int:
        # The current value of max_new_tokens: might be different from the target max_new_tokens
        # if the slot has been paused and resumed.
        return self._generation_config.max_new_tokens

    @property
    def truncate(self) -> int:
        return self._truncate


class NeuronGenerator(Generator):
    """A Generator for Neuron models."""

    def __init__(
        self,
        model: NeuronModelForCausalLM,
        tokenizer: PreTrainedTokenizerBase,
    ):
        self.model = model
        self.rebuild_cache_on_prefill = not self.model.continuous_batching
        # Specify padding and truncation options for decoder-only architecture
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.padding_side = "left"
        tokenizer.truncation_side = "left"
        self.tokenizer = tokenizer
        self.special_tokens = self.tokenizer.all_special_ids
        self.slots = [Slot(i, tokenizer) for i in range(self.model.batch_size)]
        self.batch_id = 0

    @property
    def info(self) -> InfoResponse:
        """Returns the expected InfoResponse."""
        dtype = getattr(self.model.config, "torch_dtype", "float32")
        return InfoResponse(
            requires_padding=True,
            dtype=str(dtype),
            device_type="xla",
        )

    def warmup(self, batch: Batch) -> int:
        """Verify if the hardware can support the target load.

        Args:
            batch (`Batch`):
                A batch corresponding to the maximum number of concurrent requests.

        Return:
            The maximum number of tokens the model supports.
        """
        # Just check that the warmup request parameters match the model capacity
        batch_size = self.model.batch_size
        if len(batch.requests) > batch_size:
            raise ValueError(
                f"Inconsistent batch_size configuration: Please make sure the batch_size in the compiled model (currently {batch_size}) matches the batch_size passed to TGI. The compiled model batch_size is usually in the neuron section of the model config.json file. You may also have passed it into optimum-cli during the compilation process. The batch size for TGI is usually set in the environment as MAX_BATCH_SIZE."
            )
        self.prefill(batch)
        self.clear()
        return self.model.batch_size * self.model.max_length

    def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
        """Prefill new requests.

        Args:
            batch (`Batch`):
                A batch containing the new requests.

        Return:
            A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
        """
        slots = {state: [] for state in Slot.State}
        for slot in self.slots:
            slots[slot.state].append(slot)
        active_slots = slots[Slot.State.READY]
        empty_slots = slots[Slot.State.EMPTY]
        if len(empty_slots) < len(batch.requests):
            raise ValueError(
                f"Cannot prefill {len(batch.requests)} new request(s) with only {len(empty_slots)} empty slots."
                f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
            )
        # Assign each request to an empty slot
        logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)")
        new_slots = []
        for request in batch.requests:
            slot = empty_slots.pop()
            slot.assign(self.batch_id, request, self.model.generation_config)
            new_slots.append(slot)
            logger.debug(
                f"Request {slot.request_id} assigned to slot {slot.id} with max_new_tokens {slot.max_new_tokens}"
            )
        if self.rebuild_cache_on_prefill:
            # We will clear pending slots and prefill all slots
            prefill_slots = self.slots
            seq_ids = None
        else:
            # We only need to pass inputs for the new requests
            prefill_slots = new_slots
            seq_ids = torch.tensor([slot.id for slot in prefill_slots])
        # Reconstruct the full inputs (without padding) as seen by the model.
        # This comprises:
        # - the inputs for new requests,
        # - only when rebuilding the cache, the inputs and the generated text that has already
        #   been cached (i.e. excluding the last generated token) for unfinished requests.
        inputs = []
        max_length = 0
        for slot in prefill_slots:
            inputs.append(slot.cached_text)
            # Apply truncation, making sure we fit into static dimensions
            if slot.truncate == 0:
                max_length = self.model.max_length
            elif slot.truncate > max_length and slot.truncate < self.model.max_length:
                max_length = slot.truncate
        # Tokenize with padding and truncation
        padded_inputs = self.tokenizer(
            inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
        )
        input_ids = padded_inputs.input_ids
        attention_mask = padded_inputs.attention_mask
        # Pause previously active slots during generation
        next_tokens = []
        for slot in active_slots:
            slot.pause(reset_on_pause=self.rebuild_cache_on_prefill)
            if self.rebuild_cache_on_prefill:
                # The slot will be reset, so we need to store its next token
                next_tokens.append(slot.next_token)
        # Each slot must be reset with the padded inputs and masks
        for i, slot in enumerate(prefill_slots):
            if slot.state != slot.state.EMPTY:
                if slot.truncate > 0 and slot.truncate < input_ids.shape[-1]:
                    # Apply per-request truncation
                    input_ids[i, : -slot.truncate] = self.tokenizer.pad_token_id
                    attention_mask[i, : -slot.truncate] = 0
                slot_input_ids = input_ids[i : i + 1, :]
                # Padded input ids are also required to set logits processors and stopping criteria
                selector = TokenSelector.create(
                    slot_input_ids,
                    slot.generation_config,
                    self.model,
                    self.model.max_length,
                    tokenizer=self.tokenizer,
                    seed=slot.seed,
                )
                slot_input_ids = slot_input_ids.squeeze(dim=0).type(torch.int64)
                slot_attention_mask = attention_mask[i]
                slot.reset(slot_input_ids, slot_attention_mask, selector)
        # Note: when rebuilding cache on prefill, the new tokens on paused slots will be ignored,
        # as they have already been generated and sent back in the last decode.
        model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask, seq_ids)
        logits = self.model(**model_inputs)[0]
        generation, next_batch = self._generate_token(prefill_slots, self.batch_id, logits, input_ids)
        self.batch_id += 1
        # Reactivate previously active slots for the next decode
        for i, slot in enumerate(active_slots):
            slot.resume()
            if self.rebuild_cache_on_prefill:
                # Append back the next token
                slot.append(next_tokens[i])
        logger.debug("Model ready for decoding")
        if next_batch is not None:
            logger.debug(f"Next batch is {next_batch.id} with requests: {next_batch.request_ids}")
        return generation, next_batch

    def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBatch]:
        """Decode the specified prefilled requests.

        Args:
            batches (`List[CachedBatch]`):
                A list of previous batches containing the prefilled requests.

        Return:
            A list of `Generation` for each request and a `CachedBatch` containing all pending requests.
        """
        # batches contains a list composed of:
        # - the batch id returned by the last decode,
        # - the batch id(s) returned by the last prefill(s)
        # Batches are always concatenated during prefill, so we can
        # just carry on with decoding. We adopt the id of the first
        # batch in the list as our next batch id.
        next_batch_id = batches[0].id
        request_ids = []
        for batch in batches:
            request_ids += batch.request_ids
        cleared_request_ids = []
        for slot in self.slots:
            if slot.state == slot.State.READY and slot.request_id not in request_ids:
                cleared_request_ids.append(slot.request_id)
                slot.clear()
        if len(cleared_request_ids) > 0:
            logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.")
        active_slots = [slot for slot in self.slots if slot.state == slot.State.READY]
        if len(active_slots) < len(request_ids):
            raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)")
        if self.model.continuous_batching:
            decode_slots = active_slots
            seq_ids = torch.tensor([slot.id for slot in decode_slots])
        else:
            decode_slots = self.slots
            seq_ids = None
        # Reconstruct input_ids and attention_mask from decode slots
        n_slots = len(decode_slots)
        input_ids = torch.full([n_slots, 1], fill_value=self.tokenizer.eos_token_id, dtype=torch.int64)
        max_length = 0
        for slot in decode_slots:
            max_length = max(max_length, slot.attention_mask.size(-1))
        attention_mask = torch.zeros([n_slots, max_length], dtype=torch.int64)
        for i, slot in enumerate(decode_slots):
            if slot.state != Slot.State.EMPTY:
                # input_ids are simply the tokens generated by the last decode or prefill requests (other tokens are cached)
                input_ids[i, 0] = slot.next_token
                attention_mask[i, : slot.attention_mask.size(-1)] = slot.attention_mask
        model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask, seq_ids)
        logits = self.model(**model_inputs)[0]
        return self._generate_token(decode_slots, next_batch_id, logits, input_ids)

    def _generate_token(
        self, slots: List[Slot], next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor
    ) -> Tuple[List[Generation], CachedBatch]:
        generations = []
        active_slots = False
        for i, slot in enumerate(slots):
            if slot.state != Slot.State.READY:
                continue
            request_id = slot.request_id
            next_token_logits = logits[i : i + 1, -1, :]
            slot_input_ids = input_ids[i : i + 1, :]
            next_token = slot.select(slot_input_ids, next_token_logits)
            next_token_text = slot.append(next_token)
            generated_text = None
            finish_reason = None
            if next_token == self.tokenizer.eos_token_id:
                finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN
            elif slot.stopped:
                if slot.generated_tokens == slot.max_new_tokens:
                    finish_reason = FinishReason.FINISH_REASON_LENGTH
                else:
                    finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE
            if finish_reason is not None:
                # We must include the generated text for each finished sequence in the response
                generated_text = GeneratedText(
                    text=slot.generated_text, generated_tokens=slot.generated_tokens, finish_reason=finish_reason
                )
                logger.debug(f"Decode complete for request {request_id} with {slot.generated_tokens} tokens")
                # mark the slot as available
                slot.clear()
            else:
                active_slots = True
            generations.append(
                Generation(
                    request_id=request_id,
                    prefill_tokens=None,
                    tokens=Tokens(
                        ids=[next_token],
                        logprobs=[0],
                        texts=[next_token_text],
                        is_special=[next_token in self.special_tokens],
                    ),
                    generated_text=generated_text,
                )
            )
        batch = None
        if active_slots:
            # Whatever initial batch these requests came from, we always return all pending requests in a single batch
            request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY]
            batch = self._cached_batch(next_batch_id, request_ids)
        else:
            logger.debug("No more pending requests")
        return generations, batch

    def _cached_batch(self, batch_id: int, request_ids: List):
        size = len(request_ids)
        max_tokens = size * self.model.max_length
        return CachedBatch(id=batch_id, request_ids=request_ids, size=size, max_tokens=max_tokens)

    def filter(self, batch_id: int, keep_request_ids: List[int]) -> CachedBatch:
        """Remove requests that are not listed from the specified batch

        Args:
            batch_id (`int`):
                The id of a cached batch.
            keep_request_ids (`List[int]`):
                The list of requests that must be kept.

        Return:
            A `CachedBatch` containing the pending requests.
        """
        keep_slot_ids = [slot.id for slot in self.slots if slot.request_id in keep_request_ids]
        self._clear(keep_slot_ids)
        return self._cached_batch(batch_id, keep_request_ids)

    def clear(self, batch_id: Optional[int] = None):
        """Remove a subset or all requests from the generator"""
        keep_ids = []
        if batch_id is not None:
            keep_ids = [slot.id for slot in self.slots if slot.batch_id != batch_id]
        return self._clear(keep_ids)

    def _clear(self, keep_slot_ids: List):
        for slot in self.slots:
            if slot.state != Slot.State.EMPTY and slot.id not in keep_slot_ids:
                logger.debug(f"Removing slot {slot.id} with request {slot.request_id}")
                slot.clear()

    @classmethod
    def from_pretrained(cls, model_id: str, revision: str = None):
        """Instantiate a NeuronGenerator.

        Args:
            model_id (`str`):
                A hub model id or the path to a local model. This path must also contain a Tokenizer.
            revision (`Optional[str]`, defaults to `None`):
                The revision of the model on the HuggingFace hub.

        Returns:
            A NeuronGenerator.
        """
        config = AutoConfig.from_pretrained(model_id)
        neuron_config = getattr(config, "neuron", None)
        start = time.time()
        if neuron_config is None:
            export_kwargs = get_export_kwargs_from_env()
            logger.info(f"Exporting model to neuron with config: {export_kwargs}.")
            model = NeuronModelForCausalLM.from_pretrained(
                model_id, revision=revision, low_cpu_mem_usage=True, export=True, **export_kwargs
            )
        else:
            logger.info("Loading model on neuron devices (this can take a few minutes).")
            model = NeuronModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, revision=revision)
        end = time.time()
        logger.info(f"Model successfully loaded in {end - start:.2f} s.")
        tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
        return cls(model, tokenizer)

27  backends/neuron/server/text_generation_server/interceptor.py  Normal file
@@ -0,0 +1,27 @@
from typing import Any, Callable

import grpc
from google.rpc import code_pb2, status_pb2
from grpc_interceptor.server import AsyncServerInterceptor
from grpc_status import rpc_status
from loguru import logger


class ExceptionInterceptor(AsyncServerInterceptor):
    async def intercept(
        self,
        method: Callable,
        request_or_iterator: Any,
        context: grpc.ServicerContext,
        method_name: str,
    ) -> Any:
        try:
            response = method(request_or_iterator, context)
            return await response
        except Exception as err:
            method_name = method_name.split("/")[-1]
            logger.exception(f"Method {method_name} encountered an error.")

            await context.abort_with_status(
                rpc_status.to_status(status_pb2.Status(code=code_pb2.INTERNAL, message=str(err)))
            )

118  backends/neuron/server/text_generation_server/model.py  Normal file
@@ -0,0 +1,118 @@
import os
import shutil
import time
from typing import Optional

from huggingface_hub import snapshot_download
from huggingface_hub.constants import HF_HUB_CACHE
from loguru import logger
from transformers import AutoConfig

from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.utils import get_hub_cached_entries


def get_export_kwargs_from_env():
    batch_size = os.environ.get("MAX_BATCH_SIZE", None)
    if batch_size is not None:
        batch_size = int(batch_size)
    sequence_length = os.environ.get("MAX_TOTAL_TOKENS", None)
    if sequence_length is not None:
        sequence_length = int(sequence_length)
    num_cores = os.environ.get("HF_NUM_CORES", None)
    if num_cores is not None:
        num_cores = int(num_cores)
    auto_cast_type = os.environ.get("HF_AUTO_CAST_TYPE", None)
    return {
        "task": "text-generation",
        "batch_size": batch_size,
        "sequence_length": sequence_length,
        "num_cores": num_cores,
        "auto_cast_type": auto_cast_type,
    }


def is_cached(model_id, neuron_config):
    # Look for cached entries for the specified model
    in_cache = False
    entries = get_hub_cached_entries(model_id, "inference")
    # Look for compatible entries
    for entry in entries:
        compatible = True
        for key, value in neuron_config.items():
            # Only weights can be different
            if key in ["checkpoint_id", "checkpoint_revision"]:
                continue
            if entry[key] != value:
                compatible = False
        if compatible:
            in_cache = True
            break
    return in_cache


def log_cache_size():
    path = HF_HUB_CACHE
    if os.path.exists(path):
        usage = shutil.disk_usage(path)
        gb = 2**30
        logger.info(f"Cache disk [{path}]: total = {usage.total / gb:.2f} G, free = {usage.free / gb:.2f} G")
    else:
        raise ValueError(f"The cache directory ({path}) does not exist.")


def fetch_model(
    model_id: str,
    revision: Optional[str] = None,
) -> str:
    """Fetch a neuron model.

    Args:
        model_id (`str`):
            The *model_id* of a model on the HuggingFace hub or the path to a local model.
        revision (`Optional[str]`, defaults to `None`):
            The revision of the model on the HuggingFace hub.

    Returns:
        A string corresponding to the model_id or path.
    """
    if not os.path.isdir("/sys/class/neuron_device/"):
        raise SystemError("No neuron cores detected on the host.")
    if os.path.isdir(model_id) and revision is not None:
        logger.warning("Revision {} ignored for local model at {}".format(revision, model_id))
        revision = None
    # Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
    # Note that the model may already be present in the cache.
    config = AutoConfig.from_pretrained(model_id, revision=revision)
    neuron_config = getattr(config, "neuron", None)
    if neuron_config is not None:
        if os.path.isdir(model_id):
            return model_id
        # Prefetch the neuron model from the Hub
        logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}")
        log_cache_size()
        return snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
    # Model needs to be exported: look for compatible cached entries on the hub
    export_kwargs = get_export_kwargs_from_env()
    export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs)
    neuron_config = export_config.neuron
    if not is_cached(model_id, neuron_config):
        hub_cache_url = "https://huggingface.co/aws-neuron/optimum-neuron-cache"
        neuron_export_url = "https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi"
        error_msg = (
            f"No cached version found for {model_id} with {neuron_config}. "
            f"You can start a discussion to request it on {hub_cache_url}. "
            f"Alternatively, you can export your own neuron model as explained in {neuron_export_url}."
        )
        raise ValueError(error_msg)
    logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
    if os.path.isdir(model_id):
        return model_id
    # Prefetch weights, tokenizer and generation config so that they are in cache
    log_cache_size()
    start = time.time()
    snapshot_download(model_id, revision=revision, ignore_patterns="*.bin")
    end = time.time()
    logger.info(f"Model weights fetched in {end - start:.2f} s.")
    log_cache_size()
    return model_id
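The export path above is driven entirely by environment variables. A hedged sketch of the values the launcher environment might set before `fetch_model` and the export run; the concrete numbers are illustrative, only the variable names come from `get_export_kwargs_from_env`.

```shell
# Illustrative export configuration read by get_export_kwargs_from_env (values are examples).
export MAX_BATCH_SIZE=4          # -> batch_size of the compiled model
export MAX_TOTAL_TOKENS=4096     # -> sequence_length of the compiled model
export HF_NUM_CORES=2            # -> number of neuron cores used for the export
export HF_AUTO_CAST_TYPE=bf16    # -> auto_cast_type passed to the exporter
text-generation-server download-weights "$MODEL_ID"   # pre-fetches weights or cached neuron artifacts
```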

89  backends/neuron/server/text_generation_server/server.py  Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
import asyncio
from pathlib import Path
from typing import List

from grpc import aio
from grpc_reflection.v1alpha import reflection
from loguru import logger

from .generator import Generator, NeuronGenerator
from .interceptor import ExceptionInterceptor
from .pb import generate_pb2, generate_pb2_grpc


class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
    def __init__(self, generator: Generator, server_urls: List[str]):
        self.generator = generator
        self.server_urls = server_urls

    async def Info(self, request, context):
        return self.generator.info

    async def Health(self, request, context):
        return generate_pb2.HealthResponse()

    async def ServiceDiscovery(self, request, context):
        return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)

    async def ClearCache(self, request, context):
        if request.HasField("id"):
            self.generator.clear(request.id)
        else:
            self.generator.clear()
        return generate_pb2.ClearCacheResponse()

    async def FilterBatch(self, request, context):
        filtered_batch = self.generator.filter(request.batch_id, request.request_ids)
        return generate_pb2.FilterBatchResponse(batch=filtered_batch)

    async def Warmup(self, request, context):
        max_tokens = self.generator.warmup(request.batch)
        return generate_pb2.WarmupResponse(max_supported_total_tokens=max_tokens)

    async def Prefill(self, request, context):
        generations, batch = self.generator.prefill(request.batch)
        return generate_pb2.PrefillResponse(generations=generations, batch=batch)

    async def Decode(self, request, context):
        generations, batch = self.generator.decode(request.batches)
        return generate_pb2.DecodeResponse(generations=generations, batch=batch)


def serve(
    model_id: str,
    revision: str,
    uds_path: Path,
):
    async def serve_inner(model_id: str, revision: str):
        unix_socket_template = "unix://{}-{}"
        local_url = unix_socket_template.format(uds_path, 0)
        server_urls = [local_url]

        try:
            generator = NeuronGenerator.from_pretrained(model_id, revision)
        except Exception:
            logger.exception("Error when initializing model")
            raise

        server = aio.server(interceptors=[ExceptionInterceptor()])
        generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
            TextGenerationService(generator, server_urls), server
        )
        SERVICE_NAMES = (
            generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
            reflection.SERVICE_NAME,
        )
        reflection.enable_server_reflection(SERVICE_NAMES, server)
        server.add_insecure_port(local_url)

        await server.start()

        logger.info("Server started at {}".format(local_url))

        try:
            await server.wait_for_termination()
        except KeyboardInterrupt:
            logger.info("Signal received. Shutting down")
            await server.stop(0)

    asyncio.run(serve_inner(model_id, revision))
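
Since reflection is enabled on the gRPC server above, the service can be probed without the proto files. The snippet below is a hypothetical sanity check using `grpcurl`; the socket path is an assumption derived from the `unix://{}-{}` template and the launcher's default `uds_path`, so adjust it to your setup.

```
# List the gRPC services exposed over the unix domain socket (requires grpcurl).
grpcurl -plaintext -unix /tmp/text-generation-server-0 list
```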
16
backends/neuron/tgi-entrypoint.sh
Executable file
@ -0,0 +1,16 @@
#!/bin/bash
set -e -o pipefail -u

export ENV_FILEPATH=$(mktemp)

trap "rm -f ${ENV_FILEPATH}" EXIT

touch $ENV_FILEPATH

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

${SCRIPT_DIR}/tgi_env.py $@

source $ENV_FILEPATH

exec text-generation-launcher $@
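
The entrypoint delegates to `tgi_env.py`, which writes `export` statements into the temporary file pointed to by `ENV_FILEPATH`; the entrypoint then sources that file before exec-ing the launcher. As an illustration only, for a model compiled with `batch_size=4`, `sequence_length=4096`, `fp16` on 2 cores, the generated file could look like this (actual values depend entirely on the model's neuron config):

```
# Hypothetical ${ENV_FILEPATH} content written by tgi_env.py (illustrative values).
export MAX_BATCH_SIZE=4
export MAX_TOTAL_TOKENS=4096
export HF_AUTO_CAST_TYPE=fp16
export HF_NUM_CORES=2
export MAX_INPUT_TOKENS=2048
export MAX_BATCH_PREFILL_TOKENS=8192
```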
226
backends/neuron/tgi_env.py
Executable file
@ -0,0 +1,226 @@
#!/usr/bin/env python

import argparse
import logging
import os
import sys
from typing import Any, Dict, List, Optional

from huggingface_hub import constants
from transformers import AutoConfig

from optimum.neuron.modeling_decoder import get_available_cores
from optimum.neuron.utils import get_hub_cached_entries
from optimum.neuron.utils.version_utils import get_neuronxcc_version


logger = logging.getLogger(__name__)

tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS", "MAX_BATCH_PREFILL_TOKENS"]
tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]

env_config_peering = [
    ("MAX_BATCH_SIZE", "batch_size"),
    ("MAX_TOTAL_TOKENS", "sequence_length"),
    ("HF_AUTO_CAST_TYPE", "auto_cast_type"),
    ("HF_NUM_CORES", "num_cores"),
]

# By the end of this script all env vars should be specified properly
env_vars = tgi_server_env_vars + tgi_router_env_vars

available_cores = get_available_cores()
neuronxcc_version = get_neuronxcc_version()


def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    if not argv:
        argv = sys.argv
    # All these are params passed to tgi and intercepted here
    parser.add_argument(
        "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
    )
    parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0))
    parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0))
    parser.add_argument("--max-batch-prefill-tokens", type=int, default=os.getenv("MAX_BATCH_PREFILL_TOKENS", 0))
    parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
    parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))

    args = parser.parse_known_args(argv)[0]

    if not args.model_id:
        raise Exception("No model id provided! Either specify it using the --model-id cmdline argument or the MODEL_ID env var")

    # Override env with cmdline params
    os.environ["MODEL_ID"] = args.model_id

    # Set all tgi router and tgi server values to consistent values as early as possible
    # from the order of the parser defaults, the tgi router value can override the tgi server ones
    if args.max_total_tokens > 0:
        os.environ["MAX_TOTAL_TOKENS"] = str(args.max_total_tokens)

    if args.max_input_tokens > 0:
        os.environ["MAX_INPUT_TOKENS"] = str(args.max_input_tokens)

    if args.max_batch_size > 0:
        os.environ["MAX_BATCH_SIZE"] = str(args.max_batch_size)

    if args.max_batch_prefill_tokens > 0:
        os.environ["MAX_BATCH_PREFILL_TOKENS"] = str(args.max_batch_prefill_tokens)

    if args.revision:
        os.environ["REVISION"] = str(args.revision)

    return args


def neuron_config_to_env(neuron_config):
    with open(os.environ["ENV_FILEPATH"], "w") as f:
        for env_var, config_key in env_config_peering:
            f.write("export {}={}\n".format(env_var, neuron_config[config_key]))
        max_input_tokens = os.getenv("MAX_INPUT_TOKENS")
        if not max_input_tokens:
            max_input_tokens = int(neuron_config["sequence_length"]) // 2
            if max_input_tokens == 0:
                raise Exception("Model sequence length should be greater than 1")
        f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
        max_batch_prefill_tokens = os.getenv("MAX_BATCH_PREFILL_TOKENS")
        if not max_batch_prefill_tokens:
            max_batch_prefill_tokens = int(neuron_config["batch_size"]) * int(max_input_tokens)
        f.write("export MAX_BATCH_PREFILL_TOKENS={}\n".format(max_batch_prefill_tokens))


def sort_neuron_configs(dictionary):
    return -dictionary["num_cores"], -dictionary["batch_size"]


def lookup_compatible_cached_model(model_id: str, revision: Optional[str]) -> Optional[Dict[str, Any]]:
    # Reuse the same mechanic as the one in use to configure the tgi server part
    # The only difference here is that we stay as flexible as possible on the compatibility part
    entries = get_hub_cached_entries(model_id, "inference")

    logger.debug("Found %d cached entries for model %s, revision %s", len(entries), model_id, revision)

    all_compatible = []
    for entry in entries:
        if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True):
            all_compatible.append(entry)

    if not all_compatible:
        logger.debug(
            "No compatible cached entry found for model %s, env %s, available cores %s, neuronxcc version %s",
            model_id,
            get_env_dict(),
            available_cores,
            neuronxcc_version,
        )
        return None

    logger.info("%d compatible neuron cached models found", len(all_compatible))

    all_compatible = sorted(all_compatible, key=sort_neuron_configs)

    entry = all_compatible[0]

    return entry


def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], check_compiler_version: bool) -> bool:
    logger.debug(
        "Checking the provided neuron config %s is compatible with the local setup and provided environment",
        neuron_config,
    )

    # Local setup compat checks
    if neuron_config["num_cores"] > available_cores:
        logger.debug("Not enough neuron cores available to run the provided neuron config")
        return False

    if check_compiler_version and neuron_config["compiler_version"] != neuronxcc_version:
        logger.debug(
            "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
            neuronxcc_version,
            neuron_config["compiler_version"],
        )
        return False

    for env_var, config_key in env_config_peering:
        neuron_config_value = str(neuron_config[config_key])
        env_value = os.getenv(env_var, str(neuron_config_value))
        if env_value != neuron_config_value:
            logger.debug(
                "The provided env var '%s' and the neuron config '%s' param differ (%s != %s)",
                env_var,
                config_key,
                env_value,
                neuron_config_value,
            )
            return False

    max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
    if max_input_tokens > 0:
        sequence_length = neuron_config["sequence_length"]
        if max_input_tokens >= sequence_length:
            logger.debug(
                "Specified max input tokens is not compatible with config sequence length (%s >= %s)",
                max_input_tokens,
                sequence_length,
            )
            return False

    return True


def get_env_dict() -> Dict[str, str]:
    d = {}
    for k in env_vars:
        d[k] = os.getenv(k)
    return d


def main():
    """
    This script determines proper default TGI env variables for the neuron precompiled models to
    work properly
    :return:
    """
    args = parse_cmdline_and_set_env()

    for env_var in env_vars:
        if not os.getenv(env_var):
            break
    else:
        logger.info("All env vars %s already set, skipping, the user knows what they are doing", env_vars)
        sys.exit(0)

    cache_dir = constants.HF_HUB_CACHE

    logger.info("Cache dir %s, model %s", cache_dir, args.model_id)

    config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
    neuron_config = getattr(config, "neuron", None)
    if neuron_config is not None:
        compatible = check_env_and_neuron_config_compatibility(neuron_config, check_compiler_version=False)
        if not compatible:
            env_dict = get_env_dict()
            msg = (
                "Invalid neuron config and env. Config {}, env {}, available cores {}, neuronxcc version {}"
            ).format(neuron_config, env_dict, available_cores, neuronxcc_version)
            logger.error(msg)
            raise Exception(msg)
    else:
        neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)

    if not neuron_config:
        msg = ("No compatible neuron config found. Provided env {}, available cores {}, neuronxcc version {}").format(
            get_env_dict(), available_cores, neuronxcc_version
        )
        logger.error(msg)
        raise Exception(msg)

    neuron_config_to_env(neuron_config)


if __name__ == "__main__":
    main()
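
Although it is normally invoked by the entrypoint, the script can also be run on its own to inspect the environment it would configure. A hypothetical manual invocation (the model id is a placeholder, and `ENV_FILEPATH` must be set because `neuron_config_to_env` writes to it):

```
# Run the configuration script standalone and inspect the generated exports.
export ENV_FILEPATH=$(mktemp)
./backends/neuron/tgi_env.py --model-id <organization>/<neuron-model>
cat $ENV_FILEPATH
```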
@ -50,6 +50,8 @@
    title: Train Medusa
  title: Tutorials
- sections:
  - local: backends/neuron
    title: Neuron
  - local: backends/trtllm
    title: TensorRT-LLM
  title: Backends
182
docs/source/backends/neuron.md
Normal file
@ -0,0 +1,182 @@
# Neuron backend for AWS Trainium and Inferentia

The Neuron backend allows the deployment of TGI on the AWS Trainium and Inferentia family of chips.

The following hardware targets are supported:
- Trainium 1,
- Inferentia 2.

## Features

The basic TGI features are supported:

- continuous batching,
- token streaming,
- greedy search and multinomial sampling using [transformers](https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation).

## Deploy the service from the Hugging Face hub

The simplest way to deploy the NeuronX TGI service for a specific model is to follow the deployment instructions in the model card:

- click on the "Deploy" button on the right,
- select your deployment service ("Inference Endpoints" and "SageMaker" are supported),
- select "AWS Trainium & Inferentia",
- follow the instructions.
## Deploy the service on a dedicated host

The service is launched simply by running the text-generation-inference container with two sets of parameters:

```
docker run <system_parameters> ghcr.io/huggingface/text-generation-inference:latest-neuron <service_parameters>
```

- system parameters are used to map ports, volumes and devices between the host and the service,
- service parameters are forwarded to the `text-generation-launcher`.

When deploying a service, you will need a pre-compiled Neuron model. The Neuron TGI backend supports two main modes of operation:

- you can either deploy the service on a model that has already been exported to Neuron,
- or alternatively you can take advantage of the Neuron Model Cache to export your own model.

### Common system parameters

Whenever you launch a TGI service, we highly recommend that you mount a shared volume as `/data` in the container: this is where
the models will be cached to speed up further instantiations of the service.

Note also that enough neuron devices should be visible to the container. The simplest way to achieve that is to launch the service in `privileged` mode to get access to all neuron devices.
Alternatively, each device can be explicitly exposed using the `--device` option.

Finally, you might want to export the `HF_TOKEN` if you want to access gated repositories.

Here is an example of a service instantiation:

```
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --privileged \
       -e HF_TOKEN=${HF_TOKEN} \
       ghcr.io/huggingface/text-generation-inference:latest-neuron \
       <service_parameters>
```

If you only want to map the first device, the launch command becomes:

```
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --device=/dev/neuron0 \
       -e HF_TOKEN=${HF_TOKEN} \
       ghcr.io/huggingface/text-generation-inference:latest-neuron \
       <service_parameters>
```

### Using a standard model from the 🤗 [HuggingFace Hub](https://huggingface.co/aws-neuron) (recommended)

We maintain a Neuron Model Cache of the most popular architectures and deployment parameters under [aws-neuron/optimum-neuron-cache](https://huggingface.co/aws-neuron/optimum-neuron-cache).

If you just want to try the service quickly using a model that has not been exported to Neuron yet, this is still possible, provided that:
- you specify the export parameters when launching the service (or use default parameters),
- the model configuration is cached.

The snippet below shows how you can deploy a service from a hub standard model:

```
export HF_TOKEN=<YOUR_TOKEN>
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --privileged \
       -e HF_TOKEN=${HF_TOKEN} \
       -e HF_AUTO_CAST_TYPE="fp16" \
       -e HF_NUM_CORES=2 \
       ghcr.io/huggingface/text-generation-inference:latest-neuron \
       --model-id meta-llama/Meta-Llama-3-8B \
       --max-batch-size 1 \
       --max-input-length 3164 \
       --max-total-tokens 4096
```

### Using a model exported to a local path

Alternatively, you can first [export the model to neuron format](https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model#exporting-neuron-models-using-neuronx-tgi) locally.

You can then deploy the service inside the shared volume:

```
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --privileged \
       ghcr.io/huggingface/text-generation-inference:latest-neuron \
       --model-id /data/<neuron_model_path>
```

Note: You don't need to specify any service parameters, as they will all be deduced from the model export configuration.

### Using a neuron model from the 🤗 [HuggingFace Hub](https://huggingface.co/)

The easiest way to share a neuron model inside your organization is to push it on the Hugging Face hub, so that it can be deployed directly without requiring an export.

The snippet below shows how you can deploy a service from a hub neuron model:

```
docker run -p 8080:80 \
       -v $(pwd)/data:/data \
       --privileged \
       -e HF_TOKEN=${HF_TOKEN} \
       ghcr.io/huggingface/text-generation-inference:latest-neuron \
       --model-id <organization>/<neuron-model>
```

### Choosing service parameters

Use the following command to list the available service parameters:

```
docker run ghcr.io/huggingface/text-generation-inference:latest-neuron --help
```

The configuration of an inference endpoint is always a compromise between throughput and latency: serving more requests in parallel will allow a higher throughput, but it will increase the latency.

The neuron models have static input dimensions `[batch_size, max_length]`.

This adds several restrictions to the following parameters:

- `--max-batch-size` must be set to `batch_size`,
- `--max-input-length` must be lower than `max_length`,
- `--max-total-tokens` must be set to `max_length` (it is per-request).

Although not strictly necessary, the following is also important for efficient prefilling (see the example after this list):

- `--max-batch-prefill-tokens` should be set to `batch_size` * `max-input-length`.
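
For instance, assuming a model exported with static shapes `batch_size=4` and `max_length=4096` (illustrative values only), a consistent `<service_parameters>` section could be:

```
--model-id <exported_model> \
--max-batch-size 4 \
--max-input-length 2048 \
--max-total-tokens 4096 \
--max-batch-prefill-tokens 8192
```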

### Choosing the correct batch size

As seen in the previous paragraph, neuron model static batch size has a direct influence on the endpoint latency and throughput.

Please refer to [text-generation-inference](https://github.com/huggingface/text-generation-inference) for optimization hints.

Note that the main constraint is to be able to fit the model for the specified `batch_size` within the total device memory available
on your instance (16GB per neuron core, with 2 cores per device).
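
As a rough, purely illustrative sizing check (weights only, ignoring KV cache and runtime overhead), consider a hypothetical 8B-parameter model served in fp16:

```
# 8e9 parameters x 2 bytes (fp16)  ~= 16 GB of weights
# 2 neuron cores x 16 GB            = 32 GB of device memory
# => the weights fit with HF_NUM_CORES=2, leaving roughly 16 GB of headroom
#    for the KV cache, which bounds the batch_size you can compile for.
```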

## Query the service

You can query the model using either the `/generate` or `/generate_stream` routes:

```
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```

```
curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```

Note: replace 127.0.0.1:8080 with your actual IP address and port.
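
The standard TGI router shipped in this image also exposes an OpenAI-compatible chat route; assuming the served model has a chat template, a request along these lines should work:

```
curl 127.0.0.1:8080/v1/chat/completions \
    -X POST \
    -d '{"model":"tgi","messages":[{"role":"user","content":"What is Deep Learning?"}],"max_tokens":20}' \
    -H 'Content-Type: application/json'
```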