text-generation-inference/integration-tests/neuron/test_generate.py
Alvaro Moran 8801ba12cf
Optimum neuron 0.3.0 (#3308)
* chore(neuron): update to optimum-neuron 0.3.0

Dependencies were changed accordingly, because Neuron SDK was updated to
v2.24.

* test: sample is not deterministic

Also modify the temperature in decode test to avoid granite early
stopping.

* test(neuron): adjust expectations after graph changes

* test(neuron): use greedy for stop sequences

---------

Co-authored-by: David Corvoysier <david@huggingface.co>
2025-08-26 11:07:47 +02:00

88 lines
3.4 KiB
Python

import pytest
@pytest.fixture
async def tgi_service(neuron_launcher, neuron_model_config):
    """Launch a TGI service for the configured neuron model and yield it once healthy.

    The launcher context manager owns the service lifecycle; tests receive a
    client-ready service object.
    """
    model_path = neuron_model_config["neuron_model_path"]
    name = neuron_model_config["name"]
    with neuron_launcher(name, model_path) as service:
        # Give the service up to 600 seconds to come up before running tests.
        await service.health(600)
        yield service
@pytest.mark.asyncio
async def test_model_single_request(tgi_service):
    """Exercise a single client across greedy, sampled, and stop-sequence generation.

    Expected greedy continuations are keyed by service (model) name; sampling is
    only checked to diverge from greedy, since sampled output is not deterministic.
    """
    service_name = tgi_service.client.service_name
    prompt = "What is Deep Learning?"
    greedy_expectations = {
        "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial",
        "qwen2": " - Deep Learning is a subset of Machine Learning that involves the use of artificial neural networks",
        "granite": "\n\nDeep Learning is a subset of machine learning that is inspired by the structure and",
        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
        "phi3": "\n\nDeep learning is a subfield of machine learning that focuses on creating",
    }
    expected_greedy = greedy_expectations[service_name]

    # Greedy, bounded length, prompt not echoed back.
    response = await tgi_service.client.text_generation(
        prompt, max_new_tokens=17, details=True, decoder_input_details=True
    )
    assert response.details.generated_tokens == 17
    assert response.generated_text == expected_greedy

    # Greedy again, this time asking for the prompt to be included in the output.
    greedy_response = await tgi_service.client.text_generation(
        prompt,
        max_new_tokens=17,
        return_full_text=True,
        details=True,
        decoder_input_details=True,
    )
    assert greedy_response.details.generated_tokens == 17
    assert greedy_response.generated_text == prompt + expected_greedy

    # Sampling: output should not be the greedy continuation.
    response = await tgi_service.client.text_generation(
        prompt,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        max_new_tokens=128,
        seed=42,
    )
    assert not response.startswith(expected_greedy)

    # Greedy with a stop sequence taken from a word the model already produced,
    # so the sequence is guaranteed to appear in the deterministic output.
    stop_sequence = greedy_response.generated_text.split(" ")[-5]
    response = await tgi_service.client.text_generation(
        prompt,
        do_sample=False,
        max_new_tokens=128,
        stop_sequences=[stop_sequence],
    )
    assert response.endswith(stop_sequence)
@pytest.mark.asyncio
async def test_model_multiple_requests(tgi_service, neuron_generate_load):
    """Send several identical concurrent requests and verify each response.

    Every response must contain the expected per-model continuation and have
    exactly ``max_new_tokens`` generated tokens.
    """
    num_requests = 4
    responses = await neuron_generate_load(
        tgi_service.client,
        "What is Deep Learning?",
        max_new_tokens=17,
        n=num_requests,
    )
    # Fix: assert against num_requests rather than a duplicated magic 4, so the
    # assertion stays in sync if the request count changes.
    assert len(responses) == num_requests
    expectations = {
        "llama": "Deep learning is a subset of machine learning that uses artificial",
        "qwen2": "Deep Learning is a subset of Machine Learning that involves",
        "granite": "Deep Learning is a subset of machine learning that is inspired by the structure and",
        "qwen3": " And Why Should You Care?\n\nDeep learning is a subset of machine learning that uses neural",
        "phi3": "Deep learning is a subfield of machine learning that focuses on creating",
    }
    expected = expectations[tgi_service.client.service_name]
    for r in responses:
        assert r.details.generated_tokens == 17
        assert expected in r.generated_text