mirror of https://github.com/huggingface/text-generation-inference.git

test(neuron): update models and expectations

parent 787e28bf59
commit c8c5dcf352

backends/neuron/tests/fixtures/model.py (vendored): 72 changed lines
@@ -4,14 +4,12 @@ import subprocess
 import sys
 from tempfile import TemporaryDirectory

-import huggingface_hub
 import os
 import pytest
 from transformers import AutoTokenizer

-from optimum.neuron import NeuronModelForCausalLM
 from optimum.neuron.cache import synchronize_hub_cache
 from optimum.neuron.version import __sdk_version__ as sdk_version
 from optimum.neuron.version import __version__ as version


 logging.basicConfig(
@@ -21,30 +19,14 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__file__)

 OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"

 # All model configurations below will be added to the neuron_model_config fixture
 MODEL_CONFIGURATIONS = {
-    "gpt2": {
-        "model_id": "gpt2",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 1024,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
     "llama": {
-        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 2048,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
-    "mistral": {
-        "model_id": "optimum/mistral-1.1b-testing",
+        "model_id": "unsloth/Llama-3.2-1B-Instruct",
         "export_kwargs": {
             "batch_size": 4,
             "sequence_length": 4096,
@@ -58,7 +40,7 @@ MODEL_CONFIGURATIONS = {
             "batch_size": 4,
             "sequence_length": 4096,
             "num_cores": 2,
-            "auto_cast_type": "fp16",
+            "auto_cast_type": "bf16",
         },
     },
     "granite": {
@@ -73,12 +55,6 @@ MODEL_CONFIGURATIONS = {
 }


-def get_hub_neuron_model_id(config_name: str):
-    return (
-        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
-    )
-
-
 def export_model(model_id, export_kwargs, neuron_model_path):
     export_command = [
         "optimum-cli",
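
Note: the export_model body above (truncated in this hunk) builds an optimum-cli invocation from each configuration's export_kwargs. As a rough, hedged illustration only, assuming the optimum-neuron "optimum-cli export neuron" subcommand and that each export_kwargs key maps to a same-named command-line flag (these flag names are assumptions, not taken from this diff; build_export_command is a hypothetical helper):

def build_export_command(model_id, export_kwargs, neuron_model_path):
    # Assemble an "optimum-cli export neuron" command list from the export_kwargs
    # keys seen in MODEL_CONFIGURATIONS (batch_size, sequence_length, num_cores,
    # auto_cast_type). Flag names are assumed, not confirmed by this diff.
    command = ["optimum-cli", "export", "neuron", "--model", model_id]
    for kwarg, value in export_kwargs.items():
        command += [f"--{kwarg}", str(value)]
    command.append(neuron_model_path)
    return command

print(
    build_export_command(
        "unsloth/Llama-3.2-1B-Instruct",
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
        "/tmp/neuron-model",
    )
)
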
@@ -104,57 +80,35 @@ def export_model(model_id, export_kwargs, neuron_model_path):
 def neuron_model_config(request):
     """Expose a pre-trained neuron model

-    The fixture first makes sure the following model artifacts are present on the hub:
-    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<name>,
-    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
-    If not, it will export the model and push it to the hub.
-
-    It then fetches the model locally and return a dictionary containing:
+    The fixture exports a model locally and returns a dictionary containing:
     - a configuration name,
     - the original model id,
     - the export parameters,
     - the neuron model id,
     - the neuron model local path.

     For each exposed model, the local directory is maintained for the duration of the
     test session and cleaned up afterwards.
-    The hub model artifacts are never cleaned up and persist accross sessions.
-    They must be cleaned up manually when the optimum-neuron version changes.

     """
     config_name = request.param
     model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
     model_id = model_config["model_id"]
     export_kwargs = model_config["export_kwargs"]
-    neuron_model_id = get_hub_neuron_model_id(config_name)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if hub.repo_exists(neuron_model_id):
-            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
-        else:
-            export_model(model_id, export_kwargs, neuron_model_path)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(neuron_model_path)
-            del tokenizer
-            # Create the test model on the hub
-            hub.create_repo(neuron_model_id, private=True)
-            hub.upload_folder(
-                folder_path=neuron_model_path,
-                repo_id=neuron_model_id,
-                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
-            )
-            # Make sure it is cached
-            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        export_model(model_id, export_kwargs, neuron_model_path)
+        synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.save_pretrained(neuron_model_path)
+        del tokenizer
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
-        model_config["neuron_model_id"] = neuron_model_id
         # Also add model configuration name to allow tests to adapt their expectations
         model_config["name"] = config_name
         # Yield instead of returning to keep a reference to the temporary directory.
         # It will go out of scope and be released only once all tests needing the fixture
         # have been completed.
         logger.info(f"{config_name} ready for testing ...")
         os.environ["CUSTOM_CACHE_REPO"] = OPTIMUM_CACHE_REPO_ID
         yield model_config
         logger.info(f"Done with {config_name}")
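
For orientation, a test module consumes this fixture by declaring it as an argument and keying its expectations on the configuration name it carries. A minimal sketch under those assumptions (the test name and assertions are illustrative; only the dictionary keys "name", "export_kwargs" and "neuron_model_path" come from the fixture above, and fixture discovery by pytest is assumed):

def test_fixture_contents(neuron_model_config):
    # neuron_model_config is the dictionary yielded by the fixture: the
    # MODEL_CONFIGURATIONS entry plus the dynamic "name" and "neuron_model_path"
    # fields added before yielding.
    config_name = neuron_model_config["name"]
    export_kwargs = neuron_model_config["export_kwargs"]
    model_path = neuron_model_config["neuron_model_path"]
    # All configurations in this file export with a static batch size of 4.
    assert export_kwargs["batch_size"] == 4
    # The local export directory stays alive for the whole test session.
    assert model_path
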
@@ -40,19 +40,15 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "gpt2": " The sun was set",
-            "llama": "George Orwell, 1984",
-            "mistral": "The sky was",
-            "qwen2": " A young woman with",
+            "llama": " In the stillness of the morning",
+            "qwen2": " The air was so still",
             "granite": "1984, George Orwell",
         }[config_name]
         assert expected_text in output.text
     else:
         print(output.text)
         expected_text = {
-            "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going',
-            "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story",
-            "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.",
+            "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
             "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
             "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
         }[config_name]
@@ -46,17 +46,13 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert len(generations) == batch_size
     if do_sample:
         expectations = {
-            "gpt2": [383, " The"],
-            "llama": [10058, " George"],
-            "mistral": [450, " The"],
-            "qwen2": [362, " A"],
+            "llama": [763, " In"],
+            "qwen2": [576, " The"],
             "granite": [308, " ("],
         }[config_name]
     else:
         expectations = {
-            "gpt2": [198, "\n"],
-            "llama": [10058, " George"],
-            "mistral": [13, "\n"],
+            "llama": [578, " The"],
             "qwen2": [358, " I"],
             "granite": [203, "\n"],
         }[config_name]
@@ -91,9 +87,7 @@ def test_prefill_truncate(neuron_model_config):
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
     expectations = {
-        "gpt2": [" He", " He", "\n", " He"],
-        "llama": [" —", " The", " He", " He"],
-        "mistral": [" He", "\n", " He", " He"],
+        "llama": [" He", " The", " He", " He"],
        "qwen2": [" He", " The", " He", " He"],
        "granite": ["\n", "\n", " I", " He"],
     }[config_name]
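
All of the updated expectation tables follow the same pattern: a dictionary keyed by the fixture's configuration name, from which each test picks the values to compare against what the generator produced. A schematic, self-contained sketch of that selection-and-assert step (the helper name and the equality comparison are illustrative, not the tests' actual API; the values are copied from the truncation test above):

def check_first_tokens(config_name, first_tokens):
    # Pick the per-configuration expectation, as the tests above do, then compare
    # position by position. The real tests assert on the generator's output objects.
    expectations = {
        "llama": [" He", " The", " He", " He"],
        "qwen2": [" He", " The", " He", " He"],
        "granite": ["\n", "\n", " I", " He"],
    }[config_name]
    for expected, actual in zip(expectations, first_tokens):
        assert actual == expected

check_first_tokens("qwen2", [" He", " The", " He", " He"])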