Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-23 16:02:10 +00:00
* feat: add neuron backend
* feat(neuron): add server standalone installation
* feat(neuron): add server and integration tests
* fix(neuron): increase ulimit when building image
  The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation.
* test(neuron): merge integration tests and fixtures
* test: add --neuron option
* review: do not use latest tag
* review: remove ureq pinned version
* review: --privileged should be the exception
* feat: add neuron case to build ci
* fix(neuron): export models from container in test fixtures
  The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are run for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI image itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind), it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required.
* refactor: remove sagemaker entry-point
  The SageMaker image is built differently anyway.
* fix(neuron): avoid using Levenshtein
* test(neuron): use smaller llama model
* feat(neuron): avoid installing CUDA in image
* test(neuron): no error anymore when requesting too many tokens
* ci: add a precompilation step (with a different token)
* test(neuron): avoid using image sha when exporting models
  We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. This has two benefits:
  - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub,
  - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them.
* test(neuron): add a small script to prune test models

Co-authored-by: drbh <david.richard.holtz@gmail.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
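A local build of this image from the repository root might look like the following (the file name and tag are illustrative, not taken from this page; CI drives the official builds):

    docker build -f Dockerfile.neuron -t tgi-neuron:dev .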
170 lines · 5.0 KiB · Docker
# Fetch and extract the TGI sources
FROM alpine AS tgi
RUN mkdir -p /tgi

# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
FROM alpine AS optimum-neuron
RUN mkdir -p /optimum-neuron
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note: we cannot use the cargo-chef base image as it uses python 3.11
FROM ubuntu:22.04 AS chef

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    curl ca-certificates build-essential \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain 1.80.1 --profile minimal -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked

WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY backends/neuron/Cargo.toml Cargo.toml
COPY Cargo.lock Cargo.lock
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json
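
# cargo-chef's `prepare` records only the dependency graph in recipe.json; the
# `cook` step in the builder stage below compiles those dependencies into a
# cacheable layer before the actual sources are copied in.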

FROM chef AS builder

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    unzip python3-dev libssl-dev pkg-config \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP
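
# protoc is needed at build time to generate the gRPC bindings from the
# definitions under proto/ (the router talks to the python server over gRPC).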

COPY backends/neuron/Cargo.toml Cargo.toml
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json
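
# `cook` builds dependencies only, so this layer is invalidated by changes to
# the manifests above but not by edits to the sources copied below.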

COPY Cargo.lock Cargo.lock
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY router router
COPY backends backends
COPY launcher launcher
RUN cargo build --release
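
# This produces the text-generation-router-v2 and text-generation-launcher
# binaries under target/release, copied into the deployment image below.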

# Python base image
FROM ubuntu:22.04 AS base

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    python3-pip \
    python3-setuptools \
    python-is-python3 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

# Python server build image
FROM base AS pyserver

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    make \
    python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN install -d /pyserver
WORKDIR /pyserver
COPY backends/neuron/server server
COPY proto proto
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server package
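
# The `package` target builds a text_generation_server sdist under
# /pyserver/build/dist; it is pip-installed in the neuron stage below.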

# Neuron base image (used for deployment)
FROM base AS neuron

# Install system prerequisites
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    gnupg2 \
    wget \
    python3-dev \
    libexpat1 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
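
# Note: `apt-key` is deprecated on Ubuntu 22.04; a keyring-based setup would be
# the usual replacement, e.g. (a sketch, not validated against the Neuron repo):
#   wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \
#     | gpg --dearmor -o /usr/share/keyrings/neuron.gpg
# with `deb [signed-by=/usr/share/keyrings/neuron.gpg] ...` in the sources list.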

# Install neuronx packages
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    aws-neuronx-dkms=2.18.20.0 \
    aws-neuronx-collectives=2.22.33.0-d2128d1aa \
    aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \
    aws-neuronx-tools=2.19.0.0 \
    libxml2 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

# Manually install the CPU-only torch build to avoid pulling in CUDA
RUN pip3 install \
    torch==2.1.2 \
    torchvision==0.16.2 \
    --index-url https://download.pytorch.org/whl/cpu

RUN pip3 install \
    neuronx-cc==2.15.143.0 \
    torch-neuronx==2.1.2.2.3.2 \
    transformers-neuronx==0.12.313 \
    neuronx-distributed==0.9.0 \
    libneuronxla==2.0.5347.0 \
    --extra-index-url=https://pip.repos.neuron.amazonaws.com
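
# These pip pins and the apt pins above come from the same Neuron SDK release;
# mixing versions across releases generally fails at runtime, so they should
# be bumped together.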

# Install HuggingFace packages
RUN pip3 install \
    hf_transfer huggingface_hub

# Install optimum-neuron
COPY --from=optimum-neuron /optimum-neuron optimum-neuron
RUN pip3 install ./optimum-neuron

# TGI base env
ENV HUGGINGFACE_HUB_CACHE=/tmp \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Disable color logs as they are not supported by CloudWatch
ENV LOGURU_COLORIZE=NO
ENV LOG_COLORIZE=0

# Install router
COPY --from=builder /usr/src/target/release/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz

# Final image
FROM neuron

COPY backends/neuron/tgi_env.py /tgi_env.py
COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh
RUN chmod +x /tgi-entrypoint.sh

ENTRYPOINT ["/tgi-entrypoint.sh"]
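
# Illustrative run command on a Neuron host (image tag, device mapping and
# model id are examples, not defined by this file):
#   docker run -p 8080:80 --device=/dev/neuron0 tgi-neuron:dev \
#     --model-id <exported-neuron-model>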