diff --git a/backends/neuron/tests/server/test_decode.py b/backends/neuron/tests/server/test_decode.py index 94d1e95e..b864e3ec 100644 --- a/backends/neuron/tests/server/test_decode.py +++ b/backends/neuron/tests/server/test_decode.py @@ -40,7 +40,7 @@ def _test_decode(config_name, generator, do_sample): assert output.finish_reason == 0 if do_sample: expected_text = { - "llama": " In the stillness of the morning", + "llama": " I sat alone in the café", "qwen2": " The air was so still", "granite": "1984, George Orwell", }[config_name] diff --git a/backends/neuron/tests/server/test_prefill.py b/backends/neuron/tests/server/test_prefill.py index 48fd62ba..c9ecd1c8 100644 --- a/backends/neuron/tests/server/test_prefill.py +++ b/backends/neuron/tests/server/test_prefill.py @@ -46,7 +46,7 @@ def _test_prefill(config_name, generator, batch_size, do_sample): assert len(generations) == batch_size if do_sample: expectations = { - "llama": [763, " In"], + "llama": [358, " I"], "qwen2": [576, " The"], "granite": [308, " ("], }[config_name] @@ -87,10 +87,12 @@ def test_prefill_truncate(neuron_model_config): # Even if the input text is identical for all requests, the first generated token might # be different because of the truncation expectations = { - "llama": [" He", " The", " He", " He"], + "llama": [" He", "iens", "\x08", " He"], "qwen2": [" He", " The", " He", " He"], "granite": ["\n", "\n", " I", " He"], }[config_name] for i, g in enumerate(generations): tokens = g.tokens - assert tokens.texts[0] == expectations[i] + assert ( + tokens.texts[0] == expectations[i] + ), f"Request {i} expected [{expectations[i]}], got [{tokens.texts[0]}]" diff --git a/integration-tests/neuron/test_generate.py b/integration-tests/neuron/test_generate.py index 555b4eaa..9108ce0e 100644 --- a/integration-tests/neuron/test_generate.py +++ b/integration-tests/neuron/test_generate.py @@ -20,7 +20,7 @@ async def test_model_single_request(tgi_service): ) assert response.details.generated_tokens == 17 greedy_expectations = { - "llama": " and How Does it Work?\nDeep learning is a subset of machine learning that uses artificial", + "llama": " and how does it work?\nDeep learning is a subset of machine learning that uses artificial", "qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on", "granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art", }