diff --git a/backends/neuron/Makefile b/backends/neuron/Makefile
index ec65ff40..06674971 100644
--- a/backends/neuron/Makefile
+++ b/backends/neuron/Makefile
@@ -33,7 +33,3 @@ install_server:
 test_server: install_server
 	python -m pip install -r ${mkfile_dir}/tests/requirements.txt
 	python -m pytest -sv ${mkfile_dir}/tests/server
-
-test_integration: image
-	python -m pip install -r ${mkfile_dir}/tests/requirements.txt
-	python -m pytest -sv ${mkfile_dir}/tests/integration
diff --git a/backends/neuron/tests/conftest.py b/backends/neuron/tests/conftest.py
index f0fc72ab..1dd20c8c 100644
--- a/backends/neuron/tests/conftest.py
+++ b/backends/neuron/tests/conftest.py
@@ -1 +1 @@
-pytest_plugins = ["fixtures.service", "fixtures.model"]
+pytest_plugins = ["fixtures.model"]
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 2d3ae8a2..529c614f 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -1,3 +1,4 @@
+pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 import requests
diff --git a/integration-tests/fixtures/neuron/model.py b/integration-tests/fixtures/neuron/model.py
new file mode 100644
index 00000000..6fa63ce8
--- /dev/null
+++ b/integration-tests/fixtures/neuron/model.py
@@ -0,0 +1,129 @@
+import copy
+import logging
+import subprocess
+import sys
+from tempfile import TemporaryDirectory
+
+import huggingface_hub
+import pytest
+from transformers import AutoTokenizer
+
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.utils import synchronize_hub_cache
+from optimum.neuron.version import __sdk_version__ as sdk_version
+from optimum.neuron.version import __version__ as version
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
+
+OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
+
+# All model configurations below will be added to the neuron_model_config fixture
+MODEL_CONFIGURATIONS = {
+    "gpt2": {
+        "model_id": "gpt2",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "llama": {
+        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "mistral": {
+        "model_id": "optimum/mistral-1.1b-testing",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+    },
+    "qwen2": {
+        "model_id": "Qwen/Qwen2.5-0.5B",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "granite": {
+        "model_id": "ibm-granite/granite-3.1-2b-instruct",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+    },
+}
+
+
+def get_hub_neuron_model_id(config_name: str):
+    return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+
+
+def export_model(model_id, export_kwargs, neuron_model_path):
+    export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"]
+    for kwarg, value in export_kwargs.items():
+        export_command.append(f"--{kwarg}")
+        export_command.append(str(value))
+    export_command.append(neuron_model_path)
+    logger.info(f"Exporting {model_id} with {export_kwargs}")
+    try:
+        subprocess.run(export_command, check=True)
+    except subprocess.CalledProcessError as e:
+        raise ValueError(f"Failed to export model: {e}")
+
+
+@pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
+def neuron_model_config(request):
+    """Expose a pre-trained neuron model.
+
+    The fixture first makes sure the following model artifacts are present on the hub:
+    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<name>,
+    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
+    If not, it will export the model and push it to the hub.
+
+    It then fetches the model locally and returns a dictionary containing:
+    - a configuration name,
+    - the original model id,
+    - the export parameters,
+    - the neuron model id,
+    - the neuron model local path.
+
+    For each exposed model, the local directory is maintained for the duration of the
+    test session and cleaned up afterwards.
+    The hub model artifacts are never cleaned up and persist across sessions.
+    They must be cleaned up manually when the optimum-neuron version changes.
+
+    """
+    config_name = request.param
+    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
+    model_id = model_config["model_id"]
+    export_kwargs = model_config["export_kwargs"]
+    neuron_model_id = get_hub_neuron_model_id(config_name)
+    with TemporaryDirectory() as neuron_model_path:
+        hub = huggingface_hub.HfApi()
+        if hub.repo_exists(neuron_model_id):
+            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
+            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        else:
+            export_model(model_id, export_kwargs, neuron_model_path)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            tokenizer.save_pretrained(neuron_model_path)
+            del tokenizer
+            # Create the test model on the hub
+            hub.create_repo(neuron_model_id, private=True)
+            hub.upload_folder(
+                folder_path=neuron_model_path,
+                repo_id=neuron_model_id,
+                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
+            )
+            # Make sure it is cached
+            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        # Add dynamic parameters to the model configuration
+        model_config["neuron_model_path"] = neuron_model_path
+        model_config["neuron_model_id"] = neuron_model_id
+        # Also add model configuration name to allow tests to adapt their expectations
+        model_config["name"] = config_name
+        # Yield instead of returning to keep a reference to the temporary directory.
+        # It will go out of scope and be released only once all tests needing the fixture
+        # have been completed.
+        logger.info(f"{config_name} ready for testing ...")
+        yield model_config
+    logger.info(f"Done with {config_name}")
+
+
+@pytest.fixture(scope="module")
+def neuron_model_path(neuron_model_config):
+    yield neuron_model_config["neuron_model_path"]
diff --git a/backends/neuron/tests/fixtures/service.py b/integration-tests/fixtures/neuron/service.py
similarity index 91%
rename from backends/neuron/tests/fixtures/service.py
rename to integration-tests/fixtures/neuron/service.py
index 85b0adc5..927a35af 100644
--- a/backends/neuron/tests/fixtures/service.py
+++ b/integration-tests/fixtures/neuron/service.py
@@ -18,9 +18,20 @@ from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
 
 OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
-DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "text-generation-inference:latest-neuron")
 HF_TOKEN = huggingface_hub.get_token()
 
+
+def get_tgi_docker_image():
+    docker_image = os.getenv("DOCKER_IMAGE", None)
+    if docker_image is None:
+        client = docker.from_env()
+        images = client.images.list(filters={"reference": "text-generation-inference"})
+        if not images:
+            raise ValueError("No text-generation-inference image found on this host to run tests.")
+        docker_image = images[0].tags[0]
+    return docker_image
+
+
 logging.basicConfig(
     level=logging.INFO,
     format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
@@ -83,7 +94,7 @@ def event_loop():
 
 
 @pytest.fixture(scope="module")
-def launcher(event_loop):
+def neuron_launcher(event_loop):
     """Utility fixture to expose a TGI service.
 
     The fixture uses a single event loop for each module, but it can create multiple
@@ -130,15 +141,16 @@ def launcher(event_loop):
             if var in os.environ:
                 env[var] = os.environ[var]
 
+        base_image = get_tgi_docker_image()
         if os.path.isdir(model_name_or_path):
            # Create a sub-image containing the model to workaround docker dind issues preventing
            # to share a volume from the container running tests
-            docker_tag = f"{container_name}-img"
+            test_image = f"{container_name}-img"
             logger.info(
                 "Building image on the flight derivated from %s, tagged with %s",
-                DOCKER_IMAGE,
-                docker_tag,
+                base_image,
+                test_image,
             )
             with tempfile.TemporaryDirectory() as context_dir:
                 # Copy model directory to build context
@@ -147,17 +159,17 @@ def launcher(event_loop):
                 # Create Dockerfile
                 container_model_id = f"/data/{model_name_or_path}"
                 docker_content = f"""
-                FROM {DOCKER_IMAGE}
+                FROM {base_image}
                 COPY model {container_model_id}
                 """
                 with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
                     f.write(docker_content.encode("utf-8"))
                     f.flush()
-                image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=docker_tag)
+                image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image)
                 logger.info("Successfully built image %s", image.id)
                 logger.debug("Build logs %s", logs)
         else:
-            docker_tag = DOCKER_IMAGE
+            test_image = base_image
             image = None
             container_model_id = model_name_or_path
@@ -167,7 +179,7 @@ def launcher(event_loop):
             args.append("--trust-remote-code")
 
         container = client.containers.run(
-            docker_tag,
+            test_image,
             command=args,
             name=container_name,
             environment=env,
@@ -210,7 +222,7 @@ def launcher(event_loop):
 
 
 @pytest.fixture(scope="module")
-def generate_load():
+def neuron_generate_load():
     """A utility fixture to launch multiple asynchronous TGI requests in parallel
 
     Args:
diff --git a/backends/neuron/tests/integration/test_generate.py b/integration-tests/neuron/integration/test_generate.py
similarity index 93%
rename from backends/neuron/tests/integration/test_generate.py
rename to integration-tests/neuron/integration/test_generate.py
index db716be5..c6bdcccf 100644
--- a/backends/neuron/tests/integration/test_generate.py
+++ b/integration-tests/neuron/integration/test_generate.py
@@ -3,10 +3,10 @@ import pytest
 
 
 @pytest.fixture
-async def tgi_service(launcher, neuron_model_config):
+async def tgi_service(neuron_launcher, neuron_model_config):
     model_name_or_path = neuron_model_config["neuron_model_path"]
     service_name = neuron_model_config["name"]
-    with launcher(service_name, model_name_or_path) as tgi_service:
+    with neuron_launcher(service_name, model_name_or_path) as tgi_service:
         await tgi_service.health(600)
         yield tgi_service
 
@@ -71,9 +71,9 @@ async def test_model_single_request(tgi_service):
 
 
 @pytest.mark.asyncio
-async def test_model_multiple_requests(tgi_service, generate_load):
+async def test_model_multiple_requests(tgi_service, neuron_generate_load):
     num_requests = 4
-    responses = await generate_load(
+    responses = await neuron_generate_load(
         tgi_service.client,
         "What is Deep Learning?",
         max_new_tokens=17,
diff --git a/backends/neuron/tests/integration/test_implicit_env.py b/integration-tests/neuron/integration/test_implicit_env.py
similarity index 94%
rename from backends/neuron/tests/integration/test_implicit_env.py
rename to integration-tests/neuron/integration/test_implicit_env.py
index fa88ab67..df29b910 100644
--- a/backends/neuron/tests/integration/test_implicit_env.py
+++ b/integration-tests/neuron/integration/test_implicit_env.py
@@ -5,7 +5,7 @@ from huggingface_hub.errors import ValidationError
 
 
 @pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])
-async def tgi_service(request, launcher, neuron_model_config):
+async def tgi_service(request, neuron_launcher, neuron_model_config):
     """Expose a TGI service corresponding to a model configuration
 
     For each model configuration, the service will be started using the following
@@ -31,7 +31,7 @@ async def tgi_service(request, launcher, neuron_model_config):
     else:
         model_name_or_path = neuron_model_config["neuron_model_path"]
     service_name = neuron_model_config["name"]
-    with launcher(service_name, model_name_or_path) as tgi_service:
+    with neuron_launcher(service_name, model_name_or_path) as tgi_service:
         await tgi_service.health(600)
         yield tgi_service
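Note: with the fixtures registered through `pytest_plugins` in `integration-tests/conftest.py`, new neuron tests under `integration-tests/neuron/` consume the renamed `neuron_launcher`, `neuron_generate_load` and `neuron_model_config` fixtures directly, and the removed `test_integration` Makefile target is no longer needed. The sketch below is a minimal illustration of that pattern, based only on the fixture usage visible in `test_generate.py` above; the prompt string, token count and test name are placeholders, not values from this PR.

```python
import pytest


@pytest.fixture
async def tgi_service(neuron_launcher, neuron_model_config):
    # Start a TGI container for the exported neuron model and wait until it is healthy.
    with neuron_launcher(neuron_model_config["name"], neuron_model_config["neuron_model_path"]) as service:
        await service.health(600)
        yield service


@pytest.mark.asyncio
async def test_single_greedy_request(tgi_service):
    # Illustrative greedy request through the service's AsyncInferenceClient wrapper.
    response = await tgi_service.client.text_generation(
        "What is Deep Learning?", max_new_tokens=17, details=True, decoder_input_details=True
    )
    assert response.details.generated_tokens == 17
```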