From bf529ef476d024268b3f24a99ac2b18876597fa4 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 23 May 2025 10:13:29 +0000
Subject: [PATCH] test(neuron): update models and expectations

---
 backends/neuron/tests/fixtures/model.py      | 72 ++++----------------
 backends/neuron/tests/server/test_decode.py  | 10 +--
 backends/neuron/tests/server/test_prefill.py | 14 ++--
 3 files changed, 20 insertions(+), 76 deletions(-)

diff --git a/backends/neuron/tests/fixtures/model.py b/backends/neuron/tests/fixtures/model.py
index 435f2b06..ad41fd10 100644
--- a/backends/neuron/tests/fixtures/model.py
+++ b/backends/neuron/tests/fixtures/model.py
@@ -4,14 +4,12 @@ import subprocess
 import sys
 from tempfile import TemporaryDirectory
 
-import huggingface_hub
+import os
 import pytest
 from transformers import AutoTokenizer
-from optimum.neuron import NeuronModelForCausalLM
+
 from optimum.neuron.cache import synchronize_hub_cache
-from optimum.neuron.version import __sdk_version__ as sdk_version
-from optimum.neuron.version import __version__ as version
 
 
 logging.basicConfig(
@@ -21,30 +19,14 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__file__)
 
+
 OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
+
 # All model configurations below will be added to the neuron_model_config fixture
 MODEL_CONFIGURATIONS = {
-    "gpt2": {
-        "model_id": "gpt2",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 1024,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
     "llama": {
-        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 2048,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
-    "mistral": {
-        "model_id": "optimum/mistral-1.1b-testing",
+        "model_id": "unsloth/Llama-3.2-1B-Instruct",
         "export_kwargs": {
             "batch_size": 4,
             "sequence_length": 4096,
@@ -58,7 +40,7 @@ MODEL_CONFIGURATIONS = {
             "batch_size": 4,
             "sequence_length": 4096,
             "num_cores": 2,
-            "auto_cast_type": "fp16",
+            "auto_cast_type": "bf16",
         },
     },
     "granite": {
@@ -73,12 +55,6 @@ MODEL_CONFIGURATIONS = {
 }
 
 
-def get_hub_neuron_model_id(config_name: str):
-    return (
-        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
-    )
-
-
 def export_model(model_id, export_kwargs, neuron_model_path):
     export_command = [
         "optimum-cli",
@@ -104,57 +80,35 @@ def export_model(model_id, export_kwargs, neuron_model_path):
 def neuron_model_config(request):
     """Expose a pre-trained neuron model
 
-    The fixture first makes sure the following model artifacts are present on the hub:
-    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<name>,
-    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
-    If not, it will export the model and push it to the hub.
-
-    It then fetches the model locally and return a dictionary containing:
+    The fixture exports a model locally and returns a dictionary containing:
     - a configuration name,
     - the original model id,
     - the export parameters,
-    - the neuron model id,
     - the neuron model local path.
 
     For each exposed model, the local directory is maintained for the duration of the
     test session and cleaned up afterwards.
-    The hub model artifacts are never cleaned up and persist accross sessions.
-    They must be cleaned up manually when the optimum-neuron version changes.
     """
     config_name = request.param
    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
     model_id = model_config["model_id"]
     export_kwargs = model_config["export_kwargs"]
-    neuron_model_id = get_hub_neuron_model_id(config_name)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if hub.repo_exists(neuron_model_id):
-            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
-        else:
-            export_model(model_id, export_kwargs, neuron_model_path)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(neuron_model_path)
-            del tokenizer
-            # Create the test model on the hub
-            hub.create_repo(neuron_model_id, private=True)
-            hub.upload_folder(
-                folder_path=neuron_model_path,
-                repo_id=neuron_model_id,
-                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
-            )
-            # Make sure it is cached
-            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        export_model(model_id, export_kwargs, neuron_model_path)
+        synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.save_pretrained(neuron_model_path)
+        del tokenizer
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
-        model_config["neuron_model_id"] = neuron_model_id
         # Also add model configuration name to allow tests to adapt their expectations
         model_config["name"] = config_name
         # Yield instead of returning to keep a reference to the temporary directory.
         # It will go out of scope and be released only once all tests needing the fixture
         # have been completed.
         logger.info(f"{config_name} ready for testing ...")
+        os.environ["CUSTOM_CACHE_REPO"] = OPTIMUM_CACHE_REPO_ID
         yield model_config
         logger.info(f"Done with {config_name}")

diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py
index 377cbb23..94d1e95e 100644
--- a/backends/neuron/tests/server/test_decode.py
+++ b/backends/neuron/tests/server/test_decode.py
@@ -40,19 +40,15 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "gpt2": " The sun was set",
-            "llama": "George Orwell, 1984",
-            "mistral": "The sky was",
-            "qwen2": " A young woman with",
+            "llama": " In the stillness of the morning",
+            "qwen2": " The air was so still",
             "granite": "1984, George Orwell",
         }[config_name]
         assert expected_text in output.text
     else:
         print(output.text)
         expected_text = {
-            "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going',
-            "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story",
-            "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.",
+            "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
             "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
             "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
         }[config_name]

diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py
index 371946d9..48fd62ba 100644
--- a/backends/neuron/tests/server/test_prefill.py
+++ b/backends/neuron/tests/server/test_prefill.py
@@ -46,17 +46,13 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert len(generations) == batch_size
     if do_sample:
         expectations = {
-            "gpt2": [383, " The"],
-            "llama": [10058, " George"],
-            "mistral": [450, " The"],
-            "qwen2": [362, " A"],
+            "llama": [763, " In"],
+            "qwen2": [576, " The"],
             "granite": [308, " ("],
         }[config_name]
     else:
         expectations = {
-            "gpt2": [198, "\n"],
-            "llama": [10058, " George"],
-            "mistral": [13, "\n"],
+            "llama": [578, " The"],
             "qwen2": [358, " I"],
             "granite": [203, "\n"],
         }[config_name]
@@ -91,9 +87,7 @@ def test_prefill_truncate(neuron_model_config):
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
     expectations = {
-        "gpt2": [" He", " He", "\n", " He"],
-        "llama": [" —", " The", " He", " He"],
-        "mistral": [" He", "\n", " He", " He"],
+        "llama": [" He", " The", " He", " He"],
         "qwen2": [" He", " The", " He", " He"],
         "granite": ["\n", "\n", " I", " He"],
     }[config_name]