Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-06-18 07:12:10 +00:00)
test(neuron): update models and expectations
This commit is contained in:
parent 4e8ffec8ef
commit bf529ef476

backends/neuron/tests/fixtures/model.py (vendored), 72 changed lines
@@ -4,14 +4,12 @@ import subprocess
 import sys
 from tempfile import TemporaryDirectory
 
-import huggingface_hub
+import os
 import pytest
 from transformers import AutoTokenizer
 
-from optimum.neuron import NeuronModelForCausalLM
 from optimum.neuron.cache import synchronize_hub_cache
-from optimum.neuron.version import __sdk_version__ as sdk_version
-from optimum.neuron.version import __version__ as version
 
 
 logging.basicConfig(
@@ -21,30 +19,14 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__file__)
 
 
 OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
 
 
 # All model configurations below will be added to the neuron_model_config fixture
 MODEL_CONFIGURATIONS = {
-    "gpt2": {
-        "model_id": "gpt2",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 1024,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
     "llama": {
-        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
-        "export_kwargs": {
-            "batch_size": 4,
-            "sequence_length": 2048,
-            "num_cores": 2,
-            "auto_cast_type": "fp16",
-        },
-    },
-    "mistral": {
-        "model_id": "optimum/mistral-1.1b-testing",
+        "model_id": "unsloth/Llama-3.2-1B-Instruct",
         "export_kwargs": {
             "batch_size": 4,
             "sequence_length": 4096,
@@ -58,7 +40,7 @@ MODEL_CONFIGURATIONS = {
             "batch_size": 4,
             "sequence_length": 4096,
             "num_cores": 2,
-            "auto_cast_type": "fp16",
+            "auto_cast_type": "bf16",
         },
     },
     "granite": {
@@ -73,12 +55,6 @@ MODEL_CONFIGURATIONS = {
 }
 
 
-def get_hub_neuron_model_id(config_name: str):
-    return (
-        f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
-    )
-
-
 def export_model(model_id, export_kwargs, neuron_model_path):
     export_command = [
         "optimum-cli",
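For orientation, here is a rough sketch of the kind of command that export_model could assemble from the export_kwargs entries shown in MODEL_CONFIGURATIONS (the export_command list above is truncated by the hunk). The exact optimum-cli flag spellings and the helper name sketch_export are assumptions, not the repository's actual code:

# Minimal sketch, assuming optimum-cli accepts flags named after the
# export_kwargs keys; not the actual code behind export_command above.
import subprocess

def sketch_export(model_id: str, export_kwargs: dict, output_dir: str) -> None:
    command = ["optimum-cli", "export", "neuron", "--model", model_id]
    for key, value in export_kwargs.items():
        # e.g. {"batch_size": 4} becomes ["--batch_size", "4"]
        command += [f"--{key}", str(value)]
    command.append(output_dir)
    subprocess.run(command, check=True)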
@@ -104,57 +80,35 @@ def export_model(model_id, export_kwargs, neuron_model_path):
 def neuron_model_config(request):
     """Expose a pre-trained neuron model
 
-    The fixture first makes sure the following model artifacts are present on the hub:
-    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<name>,
-    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
-    If not, it will export the model and push it to the hub.
-
-    It then fetches the model locally and return a dictionary containing:
+    The fixture exports a model locally and returns a dictionary containing:
     - a configuration name,
     - the original model id,
     - the export parameters,
-    - the neuron model id,
     - the neuron model local path.
 
     For each exposed model, the local directory is maintained for the duration of the
     test session and cleaned up afterwards.
-    The hub model artifacts are never cleaned up and persist accross sessions.
-    They must be cleaned up manually when the optimum-neuron version changes.
 
     """
     config_name = request.param
     model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
     model_id = model_config["model_id"]
     export_kwargs = model_config["export_kwargs"]
-    neuron_model_id = get_hub_neuron_model_id(config_name)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if hub.repo_exists(neuron_model_id):
-            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
-        else:
-            export_model(model_id, export_kwargs, neuron_model_path)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(neuron_model_path)
-            del tokenizer
-            # Create the test model on the hub
-            hub.create_repo(neuron_model_id, private=True)
-            hub.upload_folder(
-                folder_path=neuron_model_path,
-                repo_id=neuron_model_id,
-                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
-            )
-            # Make sure it is cached
-            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        export_model(model_id, export_kwargs, neuron_model_path)
+        synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.save_pretrained(neuron_model_path)
+        del tokenizer
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
-        model_config["neuron_model_id"] = neuron_model_id
         # Also add model configuration name to allow tests to adapt their expectations
         model_config["name"] = config_name
         # Yield instead of returning to keep a reference to the temporary directory.
         # It will go out of scope and be released only once all tests needing the fixture
         # have been completed.
         logger.info(f"{config_name} ready for testing ...")
+        os.environ["CUSTOM_CACHE_REPO"] = OPTIMUM_CACHE_REPO_ID
         yield model_config
         logger.info(f"Done with {config_name}")
 
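As a usage note, below is a minimal sketch of how a test could consume the neuron_model_config fixture after this change, assuming the fixture is registered in a conftest and parametrized over MODEL_CONFIGURATIONS as the comment above the dict suggests. The dictionary keys ("name", "neuron_model_path") come from the diff; the test name and assertions are illustrative only:

# Hypothetical consumer of the fixture defined above; not part of the commit.
import os

def test_exported_model_layout(neuron_model_config):
    # The fixture yields a dict; the keys used below appear in the diff.
    assert neuron_model_config["name"] in ("llama", "qwen2", "granite")
    model_path = neuron_model_config["neuron_model_path"]
    assert os.path.isdir(model_path)
    # AutoTokenizer.save_pretrained writes a tokenizer_config.json next to the
    # exported neuron artifacts, so the local directory should contain one.
    assert os.path.exists(os.path.join(model_path, "tokenizer_config.json"))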
@@ -40,19 +40,15 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "gpt2": " The sun was set",
-            "llama": "George Orwell, 1984",
-            "mistral": "The sky was",
-            "qwen2": " A young woman with",
+            "llama": " In the stillness of the morning",
+            "qwen2": " The air was so still",
             "granite": "1984, George Orwell",
         }[config_name]
         assert expected_text in output.text
     else:
         print(output.text)
         expected_text = {
-            "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going',
-            "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story",
-            "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.",
+            "llama": " The world was holding its breath as the world's top scientists and engineers gathered at the secret underground facility",
             "qwen2": " I was sitting in my room, staring at the ceiling, when the door opened and in came a",
             "granite": "\n\nThis opening line from George Orwell's dystopian novel \"198",
         }[config_name]
@@ -46,17 +46,13 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert len(generations) == batch_size
     if do_sample:
         expectations = {
-            "gpt2": [383, " The"],
-            "llama": [10058, " George"],
-            "mistral": [450, " The"],
-            "qwen2": [362, " A"],
+            "llama": [763, " In"],
+            "qwen2": [576, " The"],
             "granite": [308, " ("],
         }[config_name]
     else:
         expectations = {
-            "gpt2": [198, "\n"],
-            "llama": [10058, " George"],
-            "mistral": [13, "\n"],
+            "llama": [578, " The"],
             "qwen2": [358, " I"],
             "granite": [203, "\n"],
         }[config_name]
@@ -91,9 +87,7 @@ def test_prefill_truncate(neuron_model_config):
     # Even if the input text is identical for all requests, the first generated token might
     # be different because of the truncation
     expectations = {
-        "gpt2": [" He", " He", "\n", " He"],
-        "llama": [" —", " The", " He", " He"],
-        "mistral": [" He", "\n", " He", " He"],
+        "llama": [" He", " The", " He", " He"],
         "qwen2": [" He", " The", " He", " He"],
         "granite": ["\n", "\n", " I", " He"],
     }[config_name]
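The comment kept in this hunk ("the first generated token might be different because of the truncation") can be illustrated with a toy, word-level stand-in for token truncation. The prompt below echoes the 1984 opening that several expectations reference, but it is an assumption rather than the exact test input:

# Toy illustration only: real truncation operates on tokens, not words.
prompt = "It was a bright cold day in April, and the clocks were striking thirteen."
words = prompt.split()
for truncate in (4, 8, len(words)):
    # Each request keeps only the last `truncate` words, so the model sees a
    # different suffix and may start its continuation with a different token.
    visible = " ".join(words[-truncate:])
    print(f"truncate={truncate}: ...{visible}")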