Update neuron backend (#3098)

* feat(neuron): use AWS Neuron SDK 2.21.1

* feat(neuron): bump optimum-neuron version

* feat(neuron): tag latest image for local tests

* test(neuron): simplify sampling test
David Corvoysier 2025-03-12 09:53:15 +01:00 committed by GitHub
parent 5c5528e362
commit f01dc9e743
3 changed files with 17 additions and 22 deletions


@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
FROM alpine AS optimum-neuron
RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1
# Build cargo components (adapted from TGI original Dockerfile)
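
Because the optimum-neuron sources are pinned to a GitHub tag rather than a PyPI release, it can be worth confirming that the pinned tag still resolves before a build. A minimal sketch, reusing the URL from the ADD line above:

    # HEAD request only (-I); -f turns HTTP errors into a non-zero exit code
    curl -fsSLI https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz -o /dev/null \
      && echo "tag v0.1.0 resolves" \
      || echo "tag v0.1.0 is missing"
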
@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
# Install neuronx packages
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
-aws-neuronx-dkms=2.18.20.0 \
-aws-neuronx-collectives=2.22.33.0-d2128d1aa \
-aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \
-aws-neuronx-tools=2.19.0.0 \
+aws-neuronx-dkms=2.19.64.0 \
+aws-neuronx-collectives=2.23.135.0-3e70920f2 \
+aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \
+aws-neuronx-tools=2.20.204.0 \
libxml2 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
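
Once the image builds, the apt-pinned Neuron packages can be sanity-checked inside the container. A minimal sketch, assuming nothing beyond the package names pinned above:

    # Print the installed version of each pinned Neuron system package
    dpkg-query -W -f='${Package}=${Version}\n' \
      aws-neuronx-dkms aws-neuronx-collectives \
      aws-neuronx-runtime-lib aws-neuronx-tools
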
@@ -120,16 +120,16 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"
# Install manually torch CPU version to avoid pulling CUDA
RUN pip3 install \
-torch==2.1.2 \
-torchvision==0.16.2 \
+torch==2.5.1 \
+torchvision==0.20.1 \
--index-url https://download.pytorch.org/whl/cpu
RUN pip3 install \
-neuronx-cc==2.15.143.0 \
-torch-neuronx==2.1.2.2.3.2 \
-transformers-neuronx==0.12.313 \
-neuronx-distributed==0.9.0 \
-libneuronxla==2.0.5347.0 \
+neuronx-cc==2.16.372.0 \
+torch-neuronx==2.5.1.2.4.0 \
+transformers-neuronx==0.13.322 \
+neuronx-distributed==0.10.1 \
+libneuronxla==2.1.681.0 \
--extra-index-url=https://pip.repos.neuron.amazonaws.com
# Install HuggingFace packages
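
The Python-side pins can be checked the same way against what pip actually resolved. A minimal sketch using only the packages pinned above:

    # Show the resolved version of each pinned pip package
    pip3 show neuronx-cc torch-neuronx transformers-neuronx neuronx-distributed libneuronxla \
      | grep -E '^(Name|Version):'
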


@@ -25,6 +25,7 @@ image:
--ulimit nofile=100000:100000 \
--build-arg VERSION=$(VERSION) \
-t text-generation-inference:$(VERSION)-neuron ${root_dir}
+docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron
install_server:
make -C ${mkfile_dir}/server install VERSION:=${VERSION}
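
The extra latest-neuron tag gives local test scripts a stable image name instead of a per-version one. A hedged usage sketch; the Neuron device path and the model id are illustrative placeholders, not part of this commit:

    # --device exposes the first Neuron accelerator; the model id is a placeholder
    docker run --rm -p 8080:80 \
      --device=/dev/neuron0 \
      text-generation-inference:latest-neuron \
      --model-id <a-neuron-exported-model>
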


@@ -49,17 +49,11 @@ async def test_model_single_request(tgi_service):
max_new_tokens=128,
seed=42,
)
-sample_expectations = {
-    "gpt2": "Deep Learning",
-    "llama": "Deep Learning",
-    "mistral": "Deep learning",
-    "qwen2": "Deep Learning",
-    "granite": "Deep learning",
-}
-assert sample_expectations[service_name] in response
# The response must be different
assert not response.startswith(greedy_expectations[service_name])
-# Sampling with stop sequence
-stop_sequence = sample_expectations[service_name][-5:]
+# Sampling with stop sequence (using one of the words returned from the previous test)
+stop_sequence = response.split(" ")[-5]
response = await tgi_service.client.text_generation(
"What is Deep Learning?",
do_sample=True,
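
The simplification drops the per-model sample_expectations table: rather than asserting a hard-coded substring, the test derives the stop sequence from a word the sampled response actually contained, so the stop sequence is guaranteed to be reachable. For reference, roughly the same request expressed as a raw call against TGI's /generate endpoint; the host, port, and stop word here are assumptions, not taken from this commit:

    # Same sampling parameters as the test; "stop" ends generation at the given word
    curl -s http://localhost:8080/generate \
      -H 'Content-Type: application/json' \
      -d '{"inputs": "What is Deep Learning?",
           "parameters": {"do_sample": true, "max_new_tokens": 128,
                          "seed": 42, "stop": ["Learning"]}}'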