From 3d2e7c8fce3351cec07edcfa9357427f835ab273 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 3 Jul 2025 07:59:25 +0200 Subject: [PATCH] Optimum neuron 0.2.2 (#3281) * chore(neuron): use optimum-neuron 0.2.1 * test(neuron): adjust expectations Since the latest optimum-neuron uses a new modeling for granite and qwen, the greedy outputs are slightly different. * test(neuron): add phi3 and qwen3 tests * chore(neuron): use optimum-neuron 0.2.2 --- Dockerfile.neuron | 2 +- .../fixtures/neuron/export_models.py | 18 ++++++++++++++++++ integration-tests/neuron/test_generate.py | 12 ++++++++---- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 6228dbb7..95d1f4e2 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -5,7 +5,7 @@ RUN mkdir -p /tgi # Fetch the optimum-neuron sources directly to avoid relying on pypi deployments FROM alpine AS optimum-neuron RUN mkdir -p /optimum-neuron -ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.0.tar.gz /optimum-neuron/sources.tar.gz +ADD https://github.com/huggingface/optimum-neuron/archive/refs/tags/v0.2.2.tar.gz /optimum-neuron/sources.tar.gz RUN tar -C /optimum-neuron -xf /optimum-neuron/sources.tar.gz --strip-components=1 # Build cargo components (adapted from TGI original Dockerfile) diff --git a/integration-tests/fixtures/neuron/export_models.py b/integration-tests/fixtures/neuron/export_models.py index d4d0f01c..beee2ba7 100644 --- a/integration-tests/fixtures/neuron/export_models.py +++ b/integration-tests/fixtures/neuron/export_models.py @@ -46,6 +46,15 @@ MODEL_CONFIGURATIONS = { "auto_cast_type": "fp16", }, }, + "qwen3": { + "model_id": "Qwen/Qwen3-1.7B", + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, + }, "granite": { "model_id": "ibm-granite/granite-3.1-2b-instruct", "export_kwargs": { @@ -55,6 +64,15 @@ MODEL_CONFIGURATIONS = { "auto_cast_type": "bf16", }, 
}, + "phi3": { + "model_id": "microsoft/Phi-3-mini-4k-instruct", + "export_kwargs": { + "batch_size": 4, + "sequence_length": 4096, + "num_cores": 2, + "auto_cast_type": "bf16", + }, + }, } diff --git a/integration-tests/neuron/test_generate.py b/integration-tests/neuron/test_generate.py index 9108ce0e..d96aa36e 100644 --- a/integration-tests/neuron/test_generate.py +++ b/integration-tests/neuron/test_generate.py @@ -21,8 +21,10 @@ async def test_model_single_request(tgi_service): assert response.details.generated_tokens == 17 greedy_expectations = { "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial", - "qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on", - "granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art", + "qwen2": " - Deep Learning is a subset of Machine Learning that involves the use of artificial neural networks", + "granite": "\n\nDeep learning is a subset of machine learning techniques based on artificial neural networks", + "qwen3": " A Deep Learning is a subset of machine learning that uses neural networks with multiple layers to", + "phi3": "\n\nDeep learning is a subfield of machine learning that focuses on creating", } assert response.generated_text == greedy_expectations[service_name] @@ -78,8 +80,10 @@ async def test_model_multiple_requests(tgi_service, neuron_generate_load): assert len(responses) == 4 expectations = { "llama": "Deep learning is a subset of machine learning that uses artificial", - "qwen2": "Deep Learning is a subset of Machine Learning that is based on", - "granite": "Deep Learning is a subset of Machine Learning, which is a branch of Art", + "qwen2": "Deep Learning is a subset of Machine Learning that involves", + "granite": "Deep learning is a subset of machine learning techniques", + "qwen3": "Deep Learning is a subset of machine learning that uses neural networks", + "phi3": "Deep learning is a subfield of 
machine learning that focuses on creating", } expected = expectations[tgi_service.client.service_name] for r in responses: