# text-generation-inference/Dockerfile.neuron

# Placeholder stage for the TGI sources: it only creates an empty /tgi directory,
# nothing is fetched or extracted here.
# NOTE(review): no stage visible in this file copies from `tgi` - confirm it is
# still needed before relying on it.
# The base image tag is pinned so rebuilds do not silently move to a new Alpine release.
FROM alpine:3.21 AS tgi
RUN mkdir -p /tgi

# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments.
# The base image tag is pinned so rebuilds do not silently move to a new Alpine release.
FROM alpine:3.21 AS optimum-neuron
RUN mkdir -p /optimum-neuron
ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz
# Unpack in place (dropping the archive's top-level directory), then delete the
# archive: the whole /optimum-neuron directory is copied into the deployment
# image later on, so a leftover tarball would be shipped along with the sources.
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1 \
 && rm -f /optimum-neuron/sources.tar.gz

# Build cargo components (adapted from TGI original Dockerfile)
# Note: we cannot use the cargo-chef base image as it uses python 3.11
FROM ubuntu:22.04 AS chef

# curl and the CA bundle are needed to download the Rust installer over HTTPS;
# build-essential provides the compiler and linker used by cargo builds.
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
    curl ca-certificates build-essential \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Install the pinned Rust toolchain with rustup.
# The installer is saved to a file before being executed instead of being piped
# into sh: with the default /bin/sh a failing curl on the left-hand side of a
# pipe would not fail the RUN step.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh \
 && sh /tmp/rustup-init.sh --default-toolchain 1.85.0 --profile minimal -y \
 && rm -f /tmp/rustup-init.sh
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cargo install cargo-chef --locked

# Working directory inherited by the planner and builder stages.
WORKDIR /usr/src

# Planner stage: produces the cargo-chef recipe (recipe.json) consumed by the builder stage.
FROM chef AS planner

# The neuron backend manifest is used as the top-level Cargo.toml of the build.
COPY backends/neuron/Cargo.toml ./Cargo.toml
COPY Cargo.lock rust-toolchain.toml ./

# Crate sources referenced by the build.
COPY proto ./proto
COPY router ./router
COPY backends ./backends
COPY launcher ./launcher

# Writes /usr/src/recipe.json (the working directory is inherited from the chef stage).
RUN cargo chef prepare --recipe-path recipe.json

# Builder stage: compiles the Rust binaries in release mode.
FROM chef AS builder

# Extra build-time packages on top of the chef stage; unzip is used below to unpack protoc.
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
      libssl-dev \
      pkg-config \
      python3-dev \
      unzip \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# Install the protoc 21.12 binary and its bundled include files under /usr/local,
# then remove the downloaded archive in the same layer.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip \
 && curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP \
 && unzip -o $PROTOC_ZIP -d /usr/local bin/protoc 'include/*' \
 && rm -f $PROTOC_ZIP

# First pass: only the manifest and the planner's recipe are present, so this
# layer is rebuilt only when one of those two files changes.
COPY backends/neuron/Cargo.toml ./Cargo.toml
COPY --from=planner /usr/src/recipe.json ./recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

# Second pass: bring in the lock file, toolchain pin and crate sources, then build.
COPY Cargo.lock rust-toolchain.toml ./
COPY proto ./proto
COPY router ./router
COPY backends ./backends
COPY launcher ./launcher
RUN cargo build --release

# Python base image, shared by the pyserver and neuron stages
FROM ubuntu:22.04 AS base

# System Python tooling; python-is-python3 makes `python` resolve to python3.
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
      python-is-python3 \
      python3-pip \
      python3-setuptools \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Replace the distribution pip with the latest release, without keeping a download cache.
RUN pip3 install --no-cache-dir --upgrade pip

# Python server build image: packages the neuron server into /pyserver/build
FROM base AS pyserver

# Tools required by the server packaging step.
RUN apt-get update -y \
 && apt-get install -y --no-install-recommends \
      make \
      python3-venv \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# WORKDIR creates /pyserver when it does not exist yet.
WORKDIR /pyserver
COPY backends/neuron/server ./server
COPY proto ./proto

# Install the build-time Python requirements, then run the `package` target of
# the server Makefile with the build and proto directories passed via the environment.
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server package

# Neuron base image (used for deployment)
# Every pip install in this stage passes --no-cache-dir so that pip's download
# cache is not stored in the layers of the deployed image.
FROM base AS neuron

# Install system prerequisites
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    gnupg2 \
    wget \
    python3-dev \
    libexpat1 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Register the AWS Neuron apt repository (jammy suite) and trust its signing key
RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

# Install neuronx packages (versions are pinned)
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    aws-neuronx-dkms=2.18.20.0 \
    aws-neuronx-collectives=2.22.33.0-d2128d1aa \
    aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \
    aws-neuronx-tools=2.19.0.0 \
    libxml2 \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

# Install manually torch CPU version to avoid pulling CUDA
RUN pip3 install --no-cache-dir \
    torch==2.1.2 \
    torchvision==0.16.2 \
    --index-url https://download.pytorch.org/whl/cpu

# Install the pinned Neuron SDK Python packages from the Neuron pip repository
RUN pip3 install --no-cache-dir \
    neuronx-cc==2.15.143.0 \
    torch-neuronx==2.1.2.2.3.2 \
    transformers-neuronx==0.12.313 \
    neuronx-distributed==0.9.0 \
    libneuronxla==2.0.5347.0 \
    --extra-index-url=https://pip.repos.neuron.amazonaws.com

# Install HuggingFace packages
RUN pip3 install --no-cache-dir \
    hf_transfer huggingface_hub

# Install optimum-neuron from the sources fetched in the optimum-neuron stage
COPY --from=optimum-neuron /optimum-neuron optimum-neuron
RUN pip3 install --no-cache-dir ./optimum-neuron

# TGI base env
ENV HUGGINGFACE_HUB_CACHE=/tmp \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Disable color logs as they are not supported by CloudWatch
ENV LOGURU_COLORIZE=NO
ENV LOG_COLORIZE=0

# Install router (the router-v2 binary is installed under the generic router name)
COPY --from=builder /usr/src/target/release/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server from the source distribution built in the pyserver stage
COPY --from=pyserver /pyserver/build/dist dist
RUN pip3 install --no-cache-dir dist/text_generation_server*.tar.gz

# Final image: the neuron stage plus the environment helper and the entrypoint script
FROM neuron

COPY backends/neuron/tgi_env.py /tgi_env.py
# The executable bit is set while copying (requires BuildKit) instead of in a
# follow-up `RUN chmod`, which would store a second copy of the script in an extra layer.
COPY --chmod=755 backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh

# Exec form: the entrypoint script is started directly, without a wrapping shell.
ENTRYPOINT ["/tgi-entrypoint.sh"]
Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. 
This has two benefits: - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. * test(neuron): added a small script to prune test models --------- Co-authored-by: drbh <david.richard.holtz@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2025-02-24 08:10:05 +00:00			`# Fetch and extract the TGI sources`
			`FROM alpine AS tgi`
			`RUN mkdir -p /tgi`

			`# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments`
			`FROM alpine AS optimum-neuron`
			`RUN mkdir -p /optimum-neuron`
			`ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz`
			`RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1`

			`# Build cargo components (adapted from TGI original Dockerfile)`
			`# Note: we cannot use the cargo-chef base image as it uses python 3.11`
			`FROM ubuntu:22.04 AS chef`

			`RUN apt-get update -y \`
			`&& apt-get install -y --no-install-recommends \`
			`curl ca-certificates build-essential \`
			`&& rm -rf /var/lib/apt/lists/* \`
			`&& apt-get clean`

fix(neuron): explicitly install toolchain (#3072) * fix(neuron): explicitly install toolchain * ci(neuron): trigger CI when Dockerfile is modified 2025-03-05 10:46:58 +00:00			`RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \| sh -s -- --default-toolchain 1.85.0 --profile minimal -y`
Add Neuron backend (#3033) * feat: add neuron backend * feat(neuron): add server standalone installation * feat(neuron): add server and integration tests * fix(neuron): increase ulimit when building image The base image used to compile the rust components seems to have a low ulimit for opened files, which leads to errors during compilation. * test(neuron): merge integration tests and fixtures * test: add --neuron option * review: do not use latest tag * review: remove ureq pinned version * review: --privileged should be the exception * feat: add neuron case to build ci * fix(neuron): export models from container in test fixtures The neuron tests require models to have been previously exported and cached on the hub. This is done automatically by the neuron.model fixture the first time the tests are ran for a specific version. This fixture used to export the models using optimum-neuron directly, but this package is not necessarily present on the system. Instead, it is now done through the neuron TGI itself, since it contains all the tools required to export the models. Note that since the CI runs docker in docker (dind) it does not seem possible to share a volume between the CI container and the container used to export the model. For that reason, a specific image with a modified entrypoint is built on-the-fly when a model export is required. * refactor: remove sagemaker entry-point The SageMaker image is built differently anyway. * fix(neuron): avoid using Levenshtein * test(neuron): use smaller llama model * feat(neuron): avoid installing CUDA in image * test(neuron): no error anymore when requesting too many tokens * ci: doing a precompilation step (with a different token). * test(neuron): avoid using image sha when exporting models We now manually evaluate the apparent hash of the neuron backend by combining the hash of the neuron backend directory and Dockerfile. This new hash is used to identify exported neuron models instead of the image sha. 
This has two benefits: - it changes less frequently (only when the neuron backend changes), which means fewer neuron models being pushed to the hub, - it can be evaluated locally, meaning that running the tests once locally will export the models before the CI uses them. * test(neuron): added a small script to prune test models --------- Co-authored-by: drbh <david.richard.holtz@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2025-02-24 08:10:05 +00:00			`ENV PATH="/root/.cargo/bin:${PATH}"`
			`RUN cargo install cargo-chef --locked`

			`WORKDIR /usr/src`

			`FROM chef AS planner`
			`COPY backends/neuron/Cargo.toml Cargo.toml`
			`COPY Cargo.lock Cargo.lock`
			`COPY rust-toolchain.toml rust-toolchain.toml`
			`COPY proto proto`
			`COPY router router`
			`COPY backends backends`
			`COPY launcher launcher`
			`RUN cargo chef prepare --recipe-path recipe.json`

			`FROM chef AS builder`

			`RUN apt-get update -y \`
			`&& apt-get install -y --no-install-recommends \`
			`unzip python3-dev libssl-dev pkg-config \`
			`&& rm -rf /var/lib/apt/lists/* \`
			`&& apt-get clean`

			`RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \`
			`curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \`
			`unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \`
			`unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \`
			`rm -f $PROTOC_ZIP`

			`COPY backends/neuron/Cargo.toml Cargo.toml`
			`COPY --from=planner /usr/src/recipe.json recipe.json`
			`RUN cargo chef cook --release --recipe-path recipe.json`

			`COPY Cargo.lock Cargo.lock`
			`COPY rust-toolchain.toml rust-toolchain.toml`
			`COPY proto proto`
			`COPY router router`
			`COPY backends backends`
			`COPY launcher launcher`
			`RUN cargo build --release`

			`# Python base image`
			`FROM ubuntu:22.04 AS base`

			`RUN apt-get update -y \`
			`&& apt-get install -y --no-install-recommends \`
			`python3-pip \`
			`python3-setuptools \`
			`python-is-python3 \`
			`&& rm -rf /var/lib/apt/lists/* \`
			`&& apt-get clean`
			`RUN pip3 --no-cache-dir install --upgrade pip`

			`# Python server build image`
			`FROM base AS pyserver`

			`RUN apt-get update -y \`
			`&& apt-get install -y --no-install-recommends \`
			`make \`
			`python3-venv \`
			`&& rm -rf /var/lib/apt/lists/* \`
			`&& apt-get clean`

			`RUN install -d /pyserver`
			`WORKDIR /pyserver`
			`COPY backends/neuron/server server`
			`COPY proto proto`
			`RUN pip3 install -r server/build-requirements.txt`
			`RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server package`

			`# Neuron base image (used for deployment)`
			`FROM base AS neuron`

			`# Install system prerequisites`
			`RUN apt-get update -y \`
			`&& apt-get install -y --no-install-recommends \`
			`gnupg2 \`
			`wget \`
			`python3-dev \`
			`libexpat1 \`
			`&& rm -rf /var/lib/apt/lists/* \`
			`&& apt-get clean`

			`RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list`
			`RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \| apt-key add -`

			`# Install neuronx packages`
			`RUN apt-get update -y \`
			`&& apt-get install -y --no-install-recommends \`
			`aws-neuronx-dkms=2.18.20.0 \`
			`aws-neuronx-collectives=2.22.33.0-d2128d1aa \`
			`aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \`
			`aws-neuronx-tools=2.19.0.0 \`
			`libxml2 \`
			`&& rm -rf /var/lib/apt/lists/* \`
			`&& apt-get clean`

			`ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"`

			`# Install manually torch CPU version to avoid pulling CUDA`
			`RUN pip3 install \`
			`torch==2.1.2 \`
			`torchvision==0.16.2 \`
			`--index-url https://download.pytorch.org/whl/cpu`

			`RUN pip3 install \`
			`neuronx-cc==2.15.143.0 \`
			`torch-neuronx==2.1.2.2.3.2 \`
			`transformers-neuronx==0.12.313 \`
			`neuronx-distributed==0.9.0 \`
			`libneuronxla==2.0.5347.0 \`
			`--extra-index-url=https://pip.repos.neuron.amazonaws.com`

			`# Install HuggingFace packages`
			`RUN pip3 install \`
			`hf_transfer huggingface_hub`

			`# Install optimum-neuron`
			`COPY --from=optimum-neuron /optimum-neuron optimum-neuron`
			`RUN pip3 install ./optimum-neuron`

			`# TGI base env`
			`ENV HUGGINGFACE_HUB_CACHE=/tmp \`
			`HF_HUB_ENABLE_HF_TRANSFER=1 \`
			`PORT=80`

			`# Disable color logs as they are not supported by CloudWatch`
			`ENV LOGURU_COLORIZE=NO`
			`ENV LOG_COLORIZE=0`

			`# Install router`
			`COPY --from=builder /usr/src/target/release/text-generation-router-v2 /usr/local/bin/text-generation-router`
			`# Install launcher`
			`COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher`
			`# Install python server`
			`COPY --from=pyserver /pyserver/build/dist dist`
			`RUN pip install dist/text_generation_server*.tar.gz`

			`# Final image`
			`FROM neuron`

			`COPY backends/neuron/tgi_env.py /tgi_env.py`
			`COPY backends/neuron/tgi-entrypoint.sh /tgi-entrypoint.sh`
			`RUN chmod +x /tgi-entrypoint.sh`

			`ENTRYPOINT ["/tgi-entrypoint.sh"]`