Update neuron backend (#3098)

* feat(neuron): use AWS Neuron SDK 2.21.1

* feat(neuron): bump optimum-neuron version

* feat(neuron): tag latest image for local tests

* test(neuron): simplify sampling test
David Corvoysier 2025-03-12 09:53:15 +01:00 committed by GitHub
parent 5c5528e362
commit f01dc9e743
3 changed files with 17 additions and 22 deletions


@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
# Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
FROM alpine AS optimum-neuron
RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz
RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1
# Build cargo components (adapted from TGI original Dockerfile)
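
Because the optimum-neuron sources are pinned to a GitHub tag rather than a PyPI release, it can be worth confirming that the pinned tag still resolves before a build. A minimal sketch, reusing the URL from the ADD line above:

    # HEAD request only (-I); -f turns HTTP errors into a non-zero exit code
    curl -fsSLI https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz -o /dev/null \
      && echo "tag v0.1.0 resolves" \
      || echo "tag v0.1.0 is missing"
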
@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
# Install neuronx packages
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
-aws-neuronx-dkms=2.18.20.0 \
-aws-neuronx-collectives=2.22.33.0-d2128d1aa \
-aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \
-aws-neuronx-tools=2.19.0.0 \
+aws-neuronx-dkms=2.19.64.0 \
+aws-neuronx-collectives=2.23.135.0-3e70920f2 \
+aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \
+aws-neuronx-tools=2.20.204.0 \
libxml2 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
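
Once the image builds, the apt-pinned Neuron packages can be sanity-checked inside the container. A minimal sketch, assuming nothing beyond the package names pinned above:

    # Print the installed version of each pinned Neuron system package
    dpkg-query -W -f='${Package}=${Version}\n' \
      aws-neuronx-dkms aws-neuronx-collectives \
      aws-neuronx-runtime-lib aws-neuronx-tools
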
@@ -120,16 +120,16 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"
# Install manually torch CPU version to avoid pulling CUDA
RUN pip3 install \
-torch==2.1.2 \
-torchvision==0.16.2 \
+torch==2.5.1 \
+torchvision==0.20.1 \
--index-url https://download.pytorch.org/whl/cpu
RUN pip3 install \
-neuronx-cc==2.15.143.0 \
-torch-neuronx==2.1.2.2.3.2 \
-transformers-neuronx==0.12.313 \
-neuronx-distributed==0.9.0 \
-libneuronxla==2.0.5347.0 \
+neuronx-cc==2.16.372.0 \
+torch-neuronx==2.5.1.2.4.0 \
+transformers-neuronx==0.13.322 \
+neuronx-distributed==0.10.1 \
+libneuronxla==2.1.681.0 \
--extra-index-url=https://pip.repos.neuron.amazonaws.com
# Install HuggingFace packages
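
The Python-side pins can be checked the same way against what pip actually resolved. A minimal sketch using only the packages pinned above:

    # Show the resolved version of each pinned pip package
    pip3 show neuronx-cc torch-neuronx transformers-neuronx neuronx-distributed libneuronxla \
      | grep -E '^(Name|Version):'
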


@@ -25,6 +25,7 @@ image:
--ulimit nofile=100000:100000 \
--build-arg VERSION=$(VERSION) \
-t text-generation-inference:$(VERSION)-neuron ${root_dir}
+docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron
install_server:
make -C ${mkfile_dir}/server install VERSION:=${VERSION}
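
The extra latest-neuron tag gives local test scripts a stable image name instead of a per-version one. A hedged usage sketch; the Neuron device path and the model id are illustrative placeholders, not part of this commit:

    # --device exposes the first Neuron accelerator; the model id is a placeholder
    docker run --rm -p 8080:80 \
      --device=/dev/neuron0 \
      text-generation-inference:latest-neuron \
      --model-id <a-neuron-exported-model>
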


@@ -49,17 +49,11 @@ async def test_model_single_request(tgi_service):
max_new_tokens=128,
seed=42,
)
-sample_expectations = {
-    "gpt2": "Deep Learning",
-    "llama": "Deep Learning",
-    "mistral": "Deep learning",
-    "qwen2": "Deep Learning",
-    "granite": "Deep learning",
-}
-assert sample_expectations[service_name] in response
# The response must be different
assert not response.startswith(greedy_expectations[service_name])
-# Sampling with stop sequence
-stop_sequence = sample_expectations[service_name][-5:]
+# Sampling with stop sequence (using one of the words returned from the previous test)
+stop_sequence = response.split(" ")[-5]
response = await tgi_service.client.text_generation(
"What is Deep Learning?",
do_sample=True,
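
The simplification drops the per-model sample_expectations table: rather than asserting a hard-coded substring, the test derives the stop sequence from a word the sampled response actually contained, so the stop sequence is guaranteed to be reachable. For reference, roughly the same request expressed as a raw call against TGI's /generate endpoint; the host, port, and stop word here are assumptions, not taken from this commit:

    # Same sampling parameters as the test; "stop" ends generation at the given word
    curl -s http://localhost:8080/generate \
      -H 'Content-Type: application/json' \
      -d '{"inputs": "What is Deep Learning?",
           "parameters": {"do_sample": true, "max_new_tokens": 128,
                          "seed": 42, "stop": ["Learning"]}}'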