diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 5a22fab3d..b2e0eb2cf 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -5,7 +5,7 @@ RUN mkdir -p /tgi # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments FROM alpine AS optimum-neuron RUN mkdir -p /optimum-neuron -ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.0.28.tar.gz /optimum-neuron/sources.tar.gz +ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.1.0.tar.gz /optimum-neuron/sources.tar.gz RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1 # Build cargo components (adapted from TGI original Dockerfile) @@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU # Install neuronx packages RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ - aws-neuronx-dkms=2.18.20.0 \ - aws-neuronx-collectives=2.22.33.0-d2128d1aa \ - aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 \ - aws-neuronx-tools=2.19.0.0 \ + aws-neuronx-dkms=2.19.64.0 \ + aws-neuronx-collectives=2.23.135.0-3e70920f2 \ + aws-neuronx-runtime-lib=2.23.112.0-9b5179492 \ + aws-neuronx-tools=2.20.204.0 \ libxml2 \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean @@ -120,16 +120,16 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}" # Install manually torch CPU version to avoid pulling CUDA RUN pip3 install \ - torch==2.1.2 \ - torchvision==0.16.2 \ + torch==2.5.1 \ + torchvision==0.20.1 \ --index-url https://download.pytorch.org/whl/cpu RUN pip3 install \ - neuronx-cc==2.15.143.0 \ - torch-neuronx==2.1.2.2.3.2 \ - transformers-neuronx==0.12.313 \ - neuronx-distributed==0.9.0 \ - libneuronxla==2.0.5347.0 \ + neuronx-cc==2.16.372.0 \ + torch-neuronx==2.5.1.2.4.0 \ + transformers-neuronx==0.13.322 \ + neuronx-distributed==0.10.1 \ + libneuronxla==2.1.681.0 \ --extra-index-url=https://pip.repos.neuron.amazonaws.com # Install HuggingFace packages diff --git a/backends/neuron/Makefile b/backends/neuron/Makefile index 6c5002ce8..066749713 100644 --- a/backends/neuron/Makefile +++ b/backends/neuron/Makefile @@ -25,6 +25,7 @@ image: --ulimit nofile=100000:100000 \ --build-arg VERSION=$(VERSION) \ -t text-generation-inference:$(VERSION)-neuron ${root_dir} + docker tag text-generation-inference:$(VERSION)-neuron text-generation-inference:latest-neuron install_server: make -C ${mkfile_dir}/server install VERSION:=${VERSION} diff --git a/integration-tests/neuron/test_generate.py b/integration-tests/neuron/test_generate.py index 6a1b49904..f08043561 100644 --- a/integration-tests/neuron/test_generate.py +++ b/integration-tests/neuron/test_generate.py @@ -49,17 +49,11 @@ async def test_model_single_request(tgi_service): max_new_tokens=128, seed=42, ) - sample_expectations = { - "gpt2": "Deep Learning", - "llama": "Deep Learning", - "mistral": "Deep learning", - "qwen2": "Deep Learning", - "granite": "Deep learning", - } - assert sample_expectations[service_name] in response + # The response must be different + assert not response.startswith(greedy_expectations[service_name]) - # Sampling with stop sequence - stop_sequence = sample_expectations[service_name][-5:] + # Sampling with stop sequence (using one of the words returned from the previous test) + stop_sequence = response.split(" ")[-5] response = await tgi_service.client.text_generation( "What is Deep Learning?", do_sample=True,