Optimum neuron 0.3.0 (#3308)

* chore(neuron): update to optimum-neuron 0.3.0

Dependencies were updated accordingly, because the Neuron SDK moved to v2.24.
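
As a purely illustrative sanity check (not part of this change), the new Python-level pins from the Dockerfile diff below could be verified inside the built image; the package names and versions come straight from the diff, everything else is an assumption:

    # Hypothetical check that the Neuron SDK 2.24 Python pins are what the image installs.
    from importlib.metadata import version

    pins = {
        "torch": "2.7.0",
        "torchvision": "0.22.0",
        "neuronx-cc": "2.19.8089.0+8ab9f450",
        "torch-neuronx": "2.7.0.2.8.6734+ac864f72",
        "neuronx-distributed": "0.13.14393+b8569585",
        "libneuronxla": "2.2.4410.0+835a67fb",
    }
    for name, expected in pins.items():
        installed = version(name)
        assert installed == expected, f"{name}: expected {expected}, got {installed}"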

* test: sample is not deterministic

Also set the temperature to 0.9 in the decode test to avoid granite stopping
early.
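
The reasoning: with do_sample=True the generated text varies from run to run (and across Neuron SDK / compiled-graph versions), so exact-string expectations are only asserted for greedy decoding, while sampled runs are only checked for token count and finish reason. A minimal sketch of the idea against a running endpoint (placeholder URL; the actual test drives the Neuron generator directly through create_request / Batch):

    # Illustrative sketch, not the test itself.
    from huggingface_hub import InferenceClient

    client = InferenceClient("http://localhost:8080")  # placeholder endpoint
    prompt = "It was a bright cold day in April, and the clocks were striking thirteen."

    # Greedy decoding is reproducible, so exact matches are safe.
    greedy_1 = client.text_generation(prompt, max_new_tokens=20, do_sample=False)
    greedy_2 = client.text_generation(prompt, max_new_tokens=20, do_sample=False)
    assert greedy_1 == greedy_2

    # Sampling (temperature=0.9, as now passed in the decode test) is not
    # reproducible, so only the shape of the output is checked in that mode.
    sampled = client.text_generation(prompt, max_new_tokens=20, do_sample=True, temperature=0.9)
    assert len(sampled) > 0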

* test(neuron): adjust expectations after graph changes

* test(neuron): use greedy for stop sequences
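
Previously the stop word was taken from a sampled response, which made both the chosen word and the endswith assertion flaky; taking it from the bounded greedy response and generating greedily again guarantees the word reappears. A minimal sketch of that flow (async client and endpoint URL are assumptions; the real test goes through the tgi_service fixture's client):

    import asyncio
    from huggingface_hub import AsyncInferenceClient

    async def check_stop_sequence(url="http://localhost:8080"):  # placeholder URL
        client = AsyncInferenceClient(url)
        # Deterministic, bounded greedy generation to pick a word we know will reappear.
        greedy = await client.text_generation("What is Deep Learning?", max_new_tokens=17, details=True)
        stop_sequence = greedy.generated_text.split(" ")[-5]
        # Greedy again with that word as a stop sequence: the output must end with it.
        response = await client.text_generation(
            "What is Deep Learning?",
            do_sample=False,
            max_new_tokens=128,
            stop_sequences=[stop_sequence],
        )
        assert response.endswith(stop_sequence)

    asyncio.run(check_stop_sequence())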

---------

Co-authored-by: David Corvoysier <david@huggingface.co>
Authored by Alvaro Moran on 2025-08-26 11:07:47 +02:00, committed by GitHub
commit 8801ba12cf (parent d618424d50)
4 changed files with 48 additions and 61 deletions


@@ -5,7 +5,7 @@ RUN mkdir -p /tgi
 # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments
 FROM alpine AS optimum-neuron
 RUN mkdir -p /optimum-neuron
-ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.2.tar.gz /optimum-neuron/sources.tar.gz
+ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.3.0.tar.gz /optimum-neuron/sources.tar.gz
 RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1

 # Build cargo components (adapted from TGI original Dockerfile)

@@ -108,10 +108,10 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx packages
 RUN apt-get update -y \
   && apt-get install -y --no-install-recommends \
-  aws-neuronx-dkms=2.20.28.0 \
-  aws-neuronx-collectives=2.24.59.0-838c7fc8b \
-  aws-neuronx-runtime-lib=2.24.53.0-f239092cc \
-  aws-neuronx-tools=2.22.61.0 \
+  aws-neuronx-dkms=2.22.2.0 \
+  aws-neuronx-collectives=2.26.43.0-47cc904ea \
+  aws-neuronx-runtime-lib=2.26.42.0-2ff3b5c7d \
+  aws-neuronx-tools=2.24.54.0 \
   libxml2 \
   && rm -rf /var/lib/apt/lists/* \
   && apt-get clean

@@ -120,15 +120,15 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"
 # Install manually torch CPU version to avoid pulling CUDA
 RUN pip3 install \
-    torch==2.5.1 \
-    torchvision==0.20.1 \
+    torch==2.7.0 \
+    torchvision==0.22.0 \
     --index-url https://download.pytorch.org/whl/cpu
 RUN pip3 install \
-    neuronx-cc==2.17.194.0 \
-    torch-neuronx==2.5.1.2.6.0 \
-    neuronx-distributed==0.11.0 \
-    libneuronxla==2.2.1630.0 \
+    neuronx-cc==2.19.8089.0+8ab9f450 \
+    torch-neuronx==2.7.0.2.8.6734+ac864f72 \
+    neuronx-distributed==0.13.14393+b8569585 \
+    libneuronxla==2.2.4410.0+835a67fb \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com
 # Install HuggingFace packages


@@ -11,7 +11,14 @@ def test_decode(neuron_model_config):
     for do_sample in [True, False]:
         mode = "sample" if do_sample else "greedy"
         print(f"{config_name}[{mode}]")
-        _test_decode(config_name, generator, do_sample)
+        generated_text = _test_decode(config_name, generator, do_sample)
+        if not do_sample:
+            expected_text = {
+                "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
+                "qwen2": " I was sitting in my room, staring at the clock, when a knock at the door. I",
+                "granite": "\n\nThis opening line is from George Orwell's dystopian novel, \"1",
+            }[config_name]
+            assert generated_text == expected_text
         generator.clear()

@@ -21,7 +28,11 @@ def _test_decode(config_name, generator, do_sample):
     )
     max_new_tokens = 20
     request = create_request(
-        id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample
+        id=0,
+        inputs=input_text,
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
+        temperature=0.9,
     )
     max_length = generator.model.neuron_config.sequence_length
     batch = Batch(id=0, requests=[request], size=1, max_tokens=max_length)

@@ -38,18 +49,4 @@ def _test_decode(config_name, generator, do_sample):
     output = generations[0].generated_text
     assert output.generated_tokens == max_new_tokens
     assert output.finish_reason == 0
-    if do_sample:
-        expected_text = {
-            "llama": " I sat alone in the café",
-            "qwen2": " The air was so still",
-            "granite": "1984, George Orwell",
-        }[config_name]
-        assert expected_text in output.text
-    else:
-        print(output.text)
-        expected_text = {
-            "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
-            "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
-            "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
-        }[config_name]
-        assert output.text == expected_text
+    return output.text


@@ -44,23 +44,17 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     # because of static batching
     assert next_batch.max_tokens == batch_size * max_length
     assert len(generations) == batch_size
-    if do_sample:
-        expectations = {
-            "llama": [358, " I"],
-            "qwen2": [576, " The"],
-            "granite": [308, " ("],
-        }[config_name]
-    else:
-        expectations = {
-            "llama": [578, " The"],
-            "qwen2": [358, " I"],
-            "granite": [203, "\n"],
-        }[config_name]
-    for g in generations:
-        tokens = g.tokens
-        assert tokens.ids[0] == expectations[0]
-        assert tokens.texts[0] == expectations[1]
+    expectations = {
+        "llama": [578, " The"],
+        "qwen2": [358, " I"],
+        "granite": [203, "\n"],
+    }[config_name]
+    # Greedy mode should always generate the same output
+    if not do_sample:
+        for g in generations:
+            tokens = g.tokens
+            assert tokens.ids[0] == expectations[0]
+            assert tokens.texts[0] == expectations[1]


 def test_prefill_truncate(neuron_model_config):
     config_name = neuron_model_config["name"]

@@ -88,8 +82,8 @@ def test_prefill_truncate(neuron_model_config):
     # be different because of the truncation
     expectations = {
         "llama": [" He", "iens", "\x08", " He"],
-        "qwen2": [" He", " The", " He", " He"],
-        "granite": ["\n", "\n", " I", " He"],
+        "qwen2": [" He", "<|endoftext|>", " ", " The"],
+        "granite": ["\n", "\n", "\n", "\n"],
     }[config_name]
     for i, g in enumerate(generations):
         tokens = g.tokens


@@ -22,22 +22,22 @@ async def test_model_single_request(tgi_service):
     greedy_expectations = {
         "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial",
         "qwen2": " - Deep Learning is a subset of Machine Learning that involves the use of artificial neural networks",
-        "granite": "\n\nDeep learning is a subset of machine learning techniques based on artificial neural networks",
-        "qwen3": " A Deep Learning is a subset of machine learning that uses neural networks with multiple layers to",
+        "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
+        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
         "phi3": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
     }
     assert response.generated_text == greedy_expectations[service_name]

     # Greedy bounded with input
-    response = await tgi_service.client.text_generation(
+    greedy_response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
         max_new_tokens=17,
         return_full_text=True,
         details=True,
         decoder_input_details=True,
     )
-    assert response.details.generated_tokens == 17
-    assert response.generated_text == prompt + greedy_expectations[service_name]
+    assert greedy_response.details.generated_tokens == 17
+    assert greedy_response.generated_text == prompt + greedy_expectations[service_name]

     # Sampling
     response = await tgi_service.client.text_generation(

@@ -52,16 +52,12 @@ async def test_model_single_request(tgi_service):
     # The response must be different
     assert not response.startswith(greedy_expectations[service_name])

-    # Sampling with stop sequence (using one of the words returned from the previous test)
-    stop_sequence = response.split(" ")[-5]
+    # Greedy with stop sequence (using one of the words returned from the previous test)
+    stop_sequence = greedy_response.generated_text.split(" ")[-5]
     response = await tgi_service.client.text_generation(
         "What is Deep Learning?",
-        do_sample=True,
-        top_k=50,
-        top_p=0.9,
-        repetition_penalty=1.2,
+        do_sample=False,
         max_new_tokens=128,
-        seed=42,
         stop_sequences=[stop_sequence],
     )
     assert response.endswith(stop_sequence)

@@ -81,8 +77,8 @@ async def test_model_multiple_requests(tgi_service, neuron_generate_load):
     expectations = {
         "llama": "Deep learning is a subset of machine learning that uses artificial",
         "qwen2": "Deep Learning is a subset of Machine Learning that involves",
-        "granite": "Deep learning is a subset of machine learning techniques",
-        "qwen3": "Deep Learning is a subset of machine learning that uses neural networks",
+        "granite": "Deep Learning is a subset of machine learning that is inspired by the structure and",
+        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
         "phi3": "Deep learning is a subfield of machine learning that focuses on creating",
     }
     expected = expectations[tgi_service.client.service_name]