Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-06-18 07:12:10 +00:00)
test(neuron): update models and expectations
This commit is contained in:
parent 4e8ffec8ef
commit bf529ef476

backends/neuron/tests/fixtures/model.py (vendored), 72 changed lines
@@ -4,14 +4,12 @@ import subprocess
 import sys
 from tempfile import TemporaryDirectory
 
-import huggingface_hub
+import os
 import pytest
 from transformers import AutoTokenizer
 
-from optimum.neuron import NeuronModelForCausalLM
 from optimum.neuron.cache import synchronize_hub_cache
-from optimum.neuron.version import __sdk_version__ as sdk_version
-from optimum.neuron.version import __version__ as version
 
 
 logging.basicConfig(
@@ -21,30 +19,14 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__file__)
 
 
 OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
 
 
 # All model configurations below will be added to the neuron_model_config fixture
 MODEL_CONFIGURATIONS = {
-    "gpt2": {
-        "model_id": "gpt2",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 1024,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
     "llama": {
-        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 2048,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
-    "mistral": {
-        "model_id": "optimum/mistral-1.1b-testing",
+        "model_id": "unsloth/Llama-3.2-1B-Instruct",
         "export_kwargs": {
             "batch_size": 4,
             "sequence_length": 4096,
@@ -58,7 +40,7 @@ MODEL_CONFIGURATIONS = {
             "batch_size": 4,
             "sequence_length": 4096,
             "num_cores": 2,
-            "auto_cast_type": "fp16",
+            "auto_cast_type": "bf16",
         },
     },
     "granite": {
@@ -73,12 +55,6 @@ MODEL_CONFIGURATIONS = {
 }
 
 
-def get_hub_neuron_model_id(config_name: str):
-    return (
-        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
-    )
-
-
 def export_model(model_id, export_kwargs, neuron_model_path):
     export_command = [
         "optimum-cli",
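For orientation, here is a rough sketch of the kind of command that export_model could assemble from the export_kwargs entries shown in MODEL_CONFIGURATIONS (the export_command list above is truncated by the hunk). The exact optimum-cli flag spellings and the helper name sketch_export are assumptions, not the repository's actual code:

# Minimal sketch, assuming optimum-cli accepts flags named after the
# export_kwargs keys; not the actual code behind export_command above.
import subprocess

def sketch_export(model_id: str, export_kwargs: dict, output_dir: str) -> None:
    command = ["optimum-cli", "export", "neuron", "--model", model_id]
    for key, value in export_kwargs.items():
        # e.g. {"batch_size": 4} becomes ["--batch_size", "4"]
        command += [f"--{key}", str(value)]
    command.append(output_dir)
    subprocess.run(command, check=True)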
@@ -104,57 +80,35 @@ def export_model(model_id, export_kwargs, neuron_model_path):
 def neuron_model_config(request):
     """Expose a pre-trained neuron model
 
-    The fixture first makes sure the following model artifacts are present on the hub:
-    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<name>,
-    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
-    If not, it will export the model and push it to the hub.
-
-    It then fetches the model locally and return a dictionary containing:
+    The fixture exports a model locally and returns a dictionary containing:
     - a configuration name,
     - the original model id,
     - the export parameters,
-    - the neuron model id,
     - the neuron model local path.
 
     For each exposed model, the local directory is maintained for the duration of the
     test session and cleaned up afterwards.
-    The hub model artifacts are never cleaned up and persist accross sessions.
-    They must be cleaned up manually when the optimum-neuron version changes.
 
     """
     config_name = request.param
     model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
     model_id = model_config["model_id"]
     export_kwargs = model_config["export_kwargs"]
-    neuron_model_id = get_hub_neuron_model_id(config_name)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if hub.repo_exists(neuron_model_id):
-            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
-        else:
-            export_model(model_id, export_kwargs, neuron_model_path)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(neuron_model_path)
-            del tokenizer
-            # Create the test model on the hub
-            hub.create_repo(neuron_model_id, private=True)
-            hub.upload_folder(
-                folder_path=neuron_model_path,
-                repo_id=neuron_model_id,
-                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
-            )
-            # Make sure it is cached
-            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        export_model(model_id, export_kwargs, neuron_model_path)
+        synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.save_pretrained(neuron_model_path)
+        del tokenizer
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
-        model_config["neuron_model_id"] = neuron_model_id
         # Also add model configuration name to allow tests to adapt their expectations
         model_config["name"] = config_name
         # Yield instead of returning to keep a reference to the temporary directory.
         # It will go out of scope and be released only once all tests needing the fixture
         # have been completed.
         logger.info(f"{config_name} ready for testing ...")
+        os.environ["CUSTOM_CACHE_REPO"] = OPTIMUM_CACHE_REPO_ID
         yield model_config
         logger.info(f"Done with {config_name}")
 
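As a usage note, below is a minimal sketch of how a test could consume the neuron_model_config fixture after this change, assuming the fixture is registered in a conftest and parametrized over MODEL_CONFIGURATIONS as the comment above the dict suggests. The dictionary keys ("name", "neuron_model_path") come from the diff; the test name and assertions are illustrative only:

# Hypothetical consumer of the fixture defined above; not part of the commit.
import os

def test_exported_model_layout(neuron_model_config):
    # The fixture yields a dict; the keys used below appear in the diff.
    assert neuron_model_config["name"] in ("llama", "qwen2", "granite")
    model_path = neuron_model_config["neuron_model_path"]
    assert os.path.isdir(model_path)
    # AutoTokenizer.save_pretrained writes a tokenizer_config.json next to the
    # exported neuron artifacts, so the local directory should contain one.
    assert os.path.exists(os.path.join(model_path, "tokenizer_config.json"))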
@@ -40,19 +40,15 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "gpt2": " The sun was set",
-            "llama": "George Orwell, 1984",
-            "mistral": "The sky was",
-            "qwen2": " A young woman with",
+            "llama": " In the stillness of the morning",
+            "qwen2": " The air was so still",
             "granite": "1984, George Orwell",
         }[config_name]
         assert expected_text in output.text
     else:
         print(output.text)
         expected_text = {
-            "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going',
-            "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story",
-            "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.",
+            "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
             "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
             "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
         }[config_name]
@@ -46,17 +46,13 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert len(generations) == batch_size
     if do_sample:
         expectations = {
-            "gpt2": [383, " The"],
-            "llama": [10058, " George"],
-            "mistral": [450, " The"],
-            "qwen2": [362, " A"],
+            "llama": [763, " In"],
+            "qwen2": [576, " The"],
             "granite": [308, " ("],
         }[config_name]
     else:
         expectations = {
-            "gpt2": [198, "\n"],
-            "llama": [10058, " George"],
-            "mistral": [13, "\n"],
+            "llama": [578, " The"],
             "qwen2": [358, " I"],
             "granite": [203, "\n"],
         }[config_name]
@@ -91,9 +87,7 @@ def test_prefill_truncate(neuron_model_config):
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
     expectations = {
-        "gpt2": [" He", " He", "\n", " He"],
-        "llama": [" —", " The", " He", " He"],
-        "mistral": [" He", "\n", " He", " He"],
+        "llama": [" He", " The", " He", " He"],
         "qwen2": [" He", " The", " He", " He"],
         "granite": ["\n", "\n", " I", " He"],
     }[config_name]
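The comment kept in this hunk ("the first generated token might be different because of the truncation") can be illustrated with a toy, word-level stand-in for token truncation. The prompt below echoes the 1984 opening that several expectations reference, but it is an assumption rather than the exact test input:

# Toy illustration only: real truncation operates on tokens, not words.
prompt = "It was a bright cold day in April, and the clocks were striking thirteen."
words = prompt.split()
for truncate in (4, 8, len(words)):
    # Each request keeps only the last `truncate` words, so the model sees a
    # different suffix and may start its continuation with a different token.
    visible = " ".join(words[-truncate:])
    print(f"truncate={truncate}: ...{visible}")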