fix(neuron): adjust test expectations for llama on nxd

David Corvoysier 2025-05-26 13:55:20 +00:00
parent 2c8b0e37c4
commit d5bad17ed6
3 changed files with 7 additions and 5 deletions


@@ -40,7 +40,7 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "llama": " In the stillness of the morning",
+            "llama": " I sat alone in the café",
             "qwen2": " The air was so still",
             "granite": "1984, George Orwell",
         }[config_name]
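
Note on the change above: with do_sample=True the continuation is only reproducible because the test fixes the RNG seed, and moving llama inference to NxD (NeuronX Distributed) changes the numerical path enough to produce a different sampled trajectory. A minimal sketch of the seeded-sampling pattern, assuming a plain transformers model and an illustrative seed and length (the actual generator fixture is not shown in this diff):

from transformers import set_seed

def sample_once(model, inputs, seed=42):
    # With a fixed seed, sampling is deterministic for a given backend, but
    # a kernel change (e.g. llama moving to NxD) shifts the logits and can
    # yield a different continuation, which is what this hunk re-pins.
    set_seed(seed)  # seeds the Python, NumPy and torch RNGs in one call
    return model.generate(**inputs, do_sample=True, max_new_tokens=8)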


@@ -46,7 +46,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert len(generations) == batch_size
     if do_sample:
         expectations = {
-            "llama": [763, " In"],
+            "llama": [358, " I"],
             "qwen2": [576, " The"],
             "granite": [308, " ("],
         }[config_name]
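
Each expectation in this hunk is a [token_id, token_text] pair, so both members must be updated together: for llama the first sampled token moves from 763 (" In") to 358 (" I"). A quick self-consistency check for such a pair, sketched with an assumed model id since the config-to-checkpoint mapping is not part of this diff:

from transformers import AutoTokenizer

def check_pair(model_id: str, token_id: int, text: str) -> None:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Decoding the single id should reproduce the pinned text fragment.
    # Some tokenizers strip a leading space on decode, so treat this as a
    # sanity check rather than an exact contract.
    decoded = tokenizer.decode([token_id])
    assert decoded == text, f"expected [{text}], got [{decoded}]"

# e.g. check_pair("meta-llama/Meta-Llama-3-8B", 358, " I")  # illustrative id
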
@@ -87,10 +87,12 @@ def test_prefill_truncate(neuron_model_config):
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
     expectations = {
-        "llama": [" He", " The", " He", " He"],
+        "llama": [" He", "iens", "\x08", " He"],
         "qwen2": [" He", " The", " He", " He"],
         "granite": ["\n", "\n", " I", " He"],
     }[config_name]
     for i, g in enumerate(generations):
         tokens = g.tokens
-        assert tokens.texts[0] == expectations[i]
+        assert (
+            tokens.texts[0] == expectations[i]
+        ), f"Request {i} expected [{expectations[i]}], got [{tokens.texts[0]}]"


@@ -20,7 +20,7 @@ async def test_model_single_request(tgi_service):
     )
     assert response.details.generated_tokens == 17
     greedy_expectations = {
-        "llama": " and How Does it Work?\nDeep learning is a subset of machine learning that uses artificial",
+        "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial",
         "qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on",
         "granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art",
     }
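
Unlike the sampled cases above, this test decodes greedily, so the expectation should be fully deterministic for a given model and backend; even the capitalization-level drift recorded here (" and How Does it Work?" to " and how does it work?") means NxD shifted the llama logits enough to flip some argmax choices. A sketch of regenerating such an expectation off-device, assuming a plain transformers checkpoint rather than the tgi_service fixture the test actually drives:

from transformers import AutoModelForCausalLM, AutoTokenizer

def greedy_continuation(model_id: str, prompt: str, max_new_tokens: int = 17) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)
    inputs = tokenizer(prompt, return_tensors="pt")
    # do_sample=False takes the argmax token at every step, so the output is
    # determined entirely by the weights and the numerical backend.
    output = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
    # Drop the prompt tokens and keep only the generated continuation.
    return tokenizer.decode(output[0, inputs["input_ids"].shape[1]:])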