From bf529ef476d024268b3f24a99ac2b18876597fa4 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 23 May 2025 10:13:29 +0000
Subject: [PATCH] test(neuron): update models and expectations

---
 backends/neuron/tests/fixtures/model.py      | 72 ++++----------------
 backends/neuron/tests/server/test_decode.py  | 10 +--
 backends/neuron/tests/server/test_prefill.py | 14 ++--
 3 files changed, 20 insertions(+), 76 deletions(-)

diff --git a/backends/neuron/tests/fixtures/model.py b/backends/neuron/tests/fixtures/model.py
index 435f2b06..ad41fd10 100644
--- a/backends/neuron/tests/fixtures/model.py
+++ b/backends/neuron/tests/fixtures/model.py
@@ -4,14 +4,12 @@ import subprocess
 import sys
 from tempfile import TemporaryDirectory
 
-import huggingface_hub
+import os
 import pytest
 from transformers import AutoTokenizer
-from optimum.neuron import NeuronModelForCausalLM
+
 from optimum.neuron.cache import synchronize_hub_cache
-from optimum.neuron.version import __sdk_version__ as sdk_version
-from optimum.neuron.version import __version__ as version
 
 
 logging.basicConfig(
@@ -21,30 +19,14 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__file__)
 
+
 OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
+
 # All model configurations below will be added to the neuron_model_config fixture
 MODEL_CONFIGURATIONS = {
-    "gpt2": {
-        "model_id": "gpt2",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 1024,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
     "llama": {
-        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 2048,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
-    "mistral": {
-        "model_id": "optimum/mistral-1.1b-testing",
+        "model_id": "unsloth/Llama-3.2-1B-Instruct",
         "export_kwargs": {
             "batch_size": 4,
             "sequence_length": 4096,
@@ -58,7 +40,7 @@ MODEL_CONFIGURATIONS = {
             "batch_size": 4,
             "sequence_length": 4096,
             "num_cores": 2,
-            "auto_cast_type": "fp16",
+            "auto_cast_type": "bf16",
         },
     },
     "granite": {
@@ -73,12 +55,6 @@ MODEL_CONFIGURATIONS = {
 }
 
 
-def get_hub_neuron_model_id(config_name: str):
-    return (
-        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
-    )
-
-
 def export_model(model_id, export_kwargs, neuron_model_path):
     export_command = [
         "optimum-cli",
@@ -104,57 +80,35 @@ def export_model(model_id, export_kwargs, neuron_model_path):
 def neuron_model_config(request):
     """Expose a pre-trained neuron model
 
-    The fixture first makes sure the following model artifacts are present on the hub:
-    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<name>,
-    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
-    If not, it will export the model and push it to the hub.
-
-    It then fetches the model locally and return a dictionary containing:
+    The fixture exports a model locally and returns a dictionary containing:
     - a configuration name,
     - the original model id,
     - the export parameters,
-    - the neuron model id,
     - the neuron model local path.
 
     For each exposed model, the local directory is maintained for the duration of the
     test session and cleaned up afterwards.
-    The hub model artifacts are never cleaned up and persist accross sessions.
-    They must be cleaned up manually when the optimum-neuron version changes.
     """
     config_name = request.param
    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
     model_id = model_config["model_id"]
     export_kwargs = model_config["export_kwargs"]
-    neuron_model_id = get_hub_neuron_model_id(config_name)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if hub.repo_exists(neuron_model_id):
-            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
-        else:
-            export_model(model_id, export_kwargs, neuron_model_path)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(neuron_model_path)
-            del tokenizer
-            # Create the test model on the hub
-            hub.create_repo(neuron_model_id, private=True)
-            hub.upload_folder(
-                folder_path=neuron_model_path,
-                repo_id=neuron_model_id,
-                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
-            )
-            # Make sure it is cached
-            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        export_model(model_id, export_kwargs, neuron_model_path)
+        synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.save_pretrained(neuron_model_path)
+        del tokenizer
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
-        model_config["neuron_model_id"] = neuron_model_id
         # Also add model configuration name to allow tests to adapt their expectations
         model_config["name"] = config_name
         # Yield instead of returning to keep a reference to the temporary directory.
         # It will go out of scope and be released only once all tests needing the fixture
         # have been completed.
         logger.info(f"{config_name} ready for testing ...")
+        os.environ["CUSTOM_CACHE_REPO"] = OPTIMUM_CACHE_REPO_ID
         yield model_config
         logger.info(f"Done with {config_name}")

diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py
index 377cbb23..94d1e95e 100644
--- a/backends/neuron/tests/server/test_decode.py
+++ b/backends/neuron/tests/server/test_decode.py
@@ -40,19 +40,15 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "gpt2": " The sun was set",
-            "llama": "George Orwell, 1984",
-            "mistral": "The sky was",
-            "qwen2": " A young woman with",
+            "llama": " In the stillness of the morning",
+            "qwen2": " The air was so still",
             "granite": "1984, George Orwell",
         }[config_name]
         assert expected_text in output.text
     else:
         print(output.text)
         expected_text = {
-            "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going',
-            "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story",
-            "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.",
+            "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
             "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
             "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
         }[config_name]

diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py
index 371946d9..48fd62ba 100644
--- a/backends/neuron/tests/server/test_prefill.py
+++ b/backends/neuron/tests/server/test_prefill.py
@@ -46,17 +46,13 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert len(generations) == batch_size
     if do_sample:
         expectations = {
-            "gpt2": [383, " The"],
-            "llama": [10058, " George"],
-            "mistral": [450, " The"],
-            "qwen2": [362, " A"],
+            "llama": [763, " In"],
+            "qwen2": [576, " The"],
             "granite": [308, " ("],
         }[config_name]
     else:
         expectations = {
-            "gpt2": [198, "\n"],
-            "llama": [10058, " George"],
-            "mistral": [13, "\n"],
+            "llama": [578, " The"],
             "qwen2": [358, " I"],
             "granite": [203, "\n"],
         }[config_name]
@@ -91,9 +87,7 @@ def test_prefill_truncate(neuron_model_config):
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
     expectations = {
-        "gpt2": [" He", " He", "\n", " He"],
-        "llama": [" —", " The", " He", " He"],
-        "mistral": [" He", "\n", " He", " He"],
+        "llama": [" He", " The", " He", " He"],
         "qwen2": [" He", " The", " He", " He"],
         "granite": ["\n", "\n", " I", " He"],
     }[config_name]