diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 95d1f4e2..9ca7aad9 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
 # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
 FROM alpine AS optimum-neuron
 RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.2.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.3.0.tar.gz /optimum-neuron/sources.tar.gz
 RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1
 
 # Build cargo components (adapted from TGI original Dockerfile)
@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx packages
 RUN apt-get update -y \
  && apt-get install -y --no-install-recommends \
-    aws-neuronx-dkms=2.20.28.0 \
-    aws-neuronx-collectives=2.24.59.0-838c7fc8b \
-    aws-neuronx-runtime-lib=2.24.53.0-f239092cc \
-    aws-neuronx-tools=2.22.61.0 \
+    aws-neuronx-dkms=2.22.2.0 \
+    aws-neuronx-collectives=2.26.43.0-47cc904ea \
+    aws-neuronx-runtime-lib=2.26.42.0-2ff3b5c7d \
+    aws-neuronx-tools=2.24.54.0 \
     libxml2 \
  && rm -rf /var/lib/apt/lists/* \
  && apt-get clean
@@ -120,15 +120,15 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"
 
 # Install manually torch CPU version to avoid pulling CUDA
 RUN pip3 install \
-    torch==2.5.1 \
-    torchvision==0.20.1 \
+    torch==2.7.0 \
+    torchvision==0.22.0 \
     --index-url https://download.pytorch.org/whl/cpu
 
 RUN pip3 install \
-    neuronx-cc==2.17.194.0 \
-    torch-neuronx==2.5.1.2.6.0 \
-    neuronx-distributed==0.11.0 \
-    libneuronxla==2.2.1630.0 \
+    neuronx-cc==2.19.8089.0+8ab9f450 \
+    torch-neuronx==2.7.0.2.8.6734+ac864f72 \
+    neuronx-distributed==0.13.14393+b8569585 \
+    libneuronxla==2.2.4410.0+835a67fb \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com
 
 # Install HuggingFace packages
diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py
index b864e3ec..5bb00a84 100644
--- a/backends/neuron/tests/server/test_decode.py
+++ b/backends/neuron/tests/server/test_decode.py
@@ -11,7 +11,14 @@ def test_decode(neuron_model_config):
     for do_sample in [True, False]:
         mode = "sample" if do_sample else "greedy"
         print(f"{config_name}[{mode}]")
-        _test_decode(config_name, generator, do_sample)
+        generated_text = _test_decode(config_name, generator, do_sample)
+        if not do_sample:
+            expected_text = {
+                "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
+                "qwen2": " I was sitting in my room, staring at the clock, when a knock at the door. I",
I", + "granite": "\n\nThis opening line is from George Orwell's dystopian novel, \"1", + }[config_name] + assert generated_text == expected_text generator.clear() @@ -21,7 +28,11 @@ def _test_decode(config_name, generator, do_sample): ) max_new_tokens = 20 request = create_request( - id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample + id=0, + inputs=input_text, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=0.9, ) max_length = generator.model.neuron_config.sequence_length batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length) @@ -38,18 +49,4 @@ def _test_decode(config_name, generator, do_sample): output = generations[0].generated_text assert output.generated_tokens == max_new_tokens assert output.finish_reason == 0 - if do_sample: - expected_text = { - "llama": " I sat alone in the café", - "qwen2": " The air was so still", - "granite": "1984, George Orwell", - }[config_name] - assert expected_text in output.text - else: - print(output.text) - expected_text = { - "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility", - "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a", - "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198", - }[config_name] - assert output.text == expected_text + return output.text diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py index c9ecd1c8..796e4817 100644 --- a/backends/neuron/tests/server/test_prefill.py +++ b/backends/neuron/tests/server/test_prefill.py @@ -44,23 +44,17 @@ def _test_prefill(config_name, generator, batch_size, do_sample): # because of static batching assert next_batch.max_tokens == batch_size * max_length assert len(generations) == batch_size - if do_sample: - expectations = { - "llama": [358, " I"], - "qwen2": [576, " The"], - "granite": [308, " ("], - }[config_name] - else: - expectations = { - "llama": [578, " The"], - "qwen2": [358, " I"], - "granite": [203, "\n"], - }[config_name] - for g in generations: - tokens = g.tokens - assert tokens.ids[0] == expectations[0] - assert tokens.texts[0] == expectations[1] - + expectations = { + "llama": [578, " The"], + "qwen2": [358, " I"], + "granite": [203, "\n"], + }[config_name] + # Greedy mode should always generate the same output + if not do_sample: + for g in generations: + tokens = g.tokens + assert tokens.ids[0] == expectations[0] + assert tokens.texts[0] == expectations[1] def test_prefill_truncate(neuron_model_config): config_name = neuron_model_config["name"] @@ -88,8 +82,8 @@ def test_prefill_truncate(neuron_model_config): # be different because of the truncation expectations = { "llama": [" He", "iens", "\x08", " He"], - "qwen2": [" He", " The", " He", " He"], - "granite": ["\n", "\n", " I", " He"], + "qwen2": [" He", "<|endoftext|>", " ", " The"], + "granite": ["\n", "\n", "\n", "\n"], }[config_name] for i, g in enumerate(generations): tokens = g.tokens diff --git a/integration-tests/neuron/test_generate.py b/integration-tests/neuron/test_generate.py index d96aa36e..36957cb5 100644 --- a/integration-tests/neuron/test_generate.py +++ b/integration-tests/neuron/test_generate.py @@ -22,22 +22,22 @@ async def test_model_single_request(tgi_service): greedy_expectations = { "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial", "qwen2": " - Deep Learning is a subset of Machine 
-        "granite": "\n\nDeep learning is a subset of machine learning techniques based on artificial neural networks",
-        "qwen3": " A Deep Learning is a subset of machine learning that uses neural networks with multiple layers to",
+        "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
+        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
         "phi3": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
     }
     assert response.generated_text == greedy_expectations[service_name]
 
     # Greedy bounded with input
-    response = await tgi_service.client.text_generation(
+    greedy_response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
         max_new_tokens=17,
         return_full_text=True,
         details=True,
         decoder_input_details=True,
     )
-    assert response.details.generated_tokens == 17
-    assert response.generated_text == prompt + greedy_expectations[service_name]
+    assert greedy_response.details.generated_tokens == 17
+    assert greedy_response.generated_text == prompt + greedy_expectations[service_name]
 
     # Sampling
     response = await tgi_service.client.text_generation(
@@ -52,16 +52,12 @@ async def test_model_single_request(tgi_service):
     # The response must be different
     assert not response.startswith(greedy_expectations[service_name])
 
-    # Sampling with stop sequence (using one of the words returned from the previous test)
-    stop_sequence = response.split(" ")[-5]
+    # Greedy with stop sequence (using one of the words returned from the previous test)
+    stop_sequence = greedy_response.generated_text.split(" ")[-5]
     response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
-        do_sample=True,
-        top_k=50,
-        top_p=0.9,
-        repetition_penalty=1.2,
+        do_sample=False,
         max_new_tokens=128,
-        seed=42,
         stop_sequences=[stop_sequence],
     )
     assert response.endswith(stop_sequence)
@@ -81,8 +77,8 @@ async def test_model_multiple_requests(tgi_service, neuron_generate_load):
     expectations = {
         "llama": "Deep learning is a subset of machine learning that uses artificial",
         "qwen2": "Deep Learning is a subset of Machine Learning that involves",
-        "granite": "Deep learning is a subset of machine learning techniques",
-        "qwen3": "Deep Learning is a subset of machine learning that uses neural networks",
+        "granite": "Deep Learning is a subset of machine learning that is inspired by the structure and",
+        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
         "phi3": "Deep learning is a subfield of machine learning that focuses on creating",
     }
     expected = expectations[tgi_service.client.service_name]