text-generation-inference/backends/neuron/tests/fixtures/model.py

import copy
import logging
import subprocess
import sys
from tempfile import TemporaryDirectory

import os
import pytest
from transformers import AutoTokenizer


from optimum.neuron.cache import synchronize_hub_cache


# Route log records of level INFO and above to stdout, prefixing each message
# with its timestamp, level and emitting location (file, function, line).
logging.basicConfig(
    stream=sys.stdout,
    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
    level=logging.INFO,
)
# Logger shared by the helpers and fixtures of this module (named after the file path).
logger = logging.getLogger(__file__)


# Hub cache repository id: passed to synchronize_hub_cache and exported through the
# CUSTOM_CACHE_REPO environment variable by the neuron_model_config fixture.
OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"


# Every entry of this mapping becomes one parameter of the neuron_model_config fixture.
# All models currently share the same export parameters; the comprehension builds a
# fresh "export_kwargs" dictionary per entry so that no two configurations alias
# the same object.
MODEL_CONFIGURATIONS = {
    config_name: {
        "model_id": hub_model_id,
        "export_kwargs": {
            "batch_size": 4,
            "sequence_length": 4096,
            "num_cores": 2,
            "auto_cast_type": "bf16",
        },
    }
    for config_name, hub_model_id in (
        ("llama", "unsloth/Llama-3.2-1B-Instruct"),
        ("qwen2", "Qwen/Qwen2.5-0.5B"),
        ("granite", "ibm-granite/granite-3.1-2b-instruct"),
    )
}


def export_model(model_id, export_kwargs, neuron_model_path):
    """Export a model for neuron by running ``optimum-cli export neuron``.

    Args:
        model_id: model identifier, passed to the ``-m`` option of the command.
        export_kwargs: mapping of option names to values; each item is appended
            to the command line as ``--<name> <value>`` (values are stringified).
        neuron_model_path: output location, passed as the last positional argument.

    Raises:
        ValueError: if the command exits with a non-zero status. The underlying
            ``subprocess.CalledProcessError`` is attached as the explicit cause.
    """
    export_command = [
        "optimum-cli",
        "export",
        "neuron",
        "-m",
        model_id,
        "--task",
        "text-generation",
    ]
    for kwarg, value in export_kwargs.items():
        export_command.extend((f"--{kwarg}", str(value)))
    export_command.append(neuron_model_path)
    logger.info(f"Exporting {model_id} with {export_kwargs}")
    try:
        subprocess.run(export_command, check=True)
    except subprocess.CalledProcessError as e:
        # Chain explicitly so the failing command and its return code stay attached
        # to the error as its direct cause.
        raise ValueError(f"Failed to export model: {e}") from e


@pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
def neuron_model_config(request):
    """Export a neuron model once per configuration and expose it to the tests

    The fixture is parametrized over the names in MODEL_CONFIGURATIONS. For the
    selected configuration it exports the model into a temporary directory,
    synchronizes the hub cache, saves the tokenizer next to the exported model and
    yields a dictionary containing:
    - the original model id ("model_id"),
    - the export parameters ("export_kwargs"),
    - the neuron model local path ("neuron_model_path"),
    - the configuration name ("name").

    The temporary directory stays alive while the fixture is suspended on its
    yield and is removed when the fixture is torn down.

    """
    config_name = request.param
    # Work on a private copy so the module-level configuration is never mutated
    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[config_name])
    model_id = model_config["model_id"]
    with TemporaryDirectory() as export_dir:
        export_model(model_id, model_config["export_kwargs"], export_dir)
        synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
        # Chain the calls so that no reference to the tokenizer outlives this line
        AutoTokenizer.from_pretrained(model_id).save_pretrained(export_dir)
        # Dynamic entries: where the model was exported, and under which configuration
        # name, so that tests can adapt their expectations
        model_config.update(neuron_model_path=export_dir, name=config_name)
        logger.info(f"{config_name} ready for testing ...")
        os.environ["CUSTOM_CACHE_REPO"] = OPTIMUM_CACHE_REPO_ID
        # Yielding from inside the with-block (rather than returning) keeps the
        # temporary directory alive until the fixture is finalized.
        yield model_config
        logger.info(f"Done with {config_name}")


@pytest.fixture(scope="module")
def neuron_model_path(neuron_model_config):
    """Expose only the local path stored in the neuron model configuration"""
    exported_path = neuron_model_config["neuron_model_path"]
    yield exported_path