diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index a87191c2..14c69a2b 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -129,9 +129,9 @@ jobs:
             export label_extension="-gaudi"
             export docker_volume="/mnt/cache"
             export docker_devices=""
-            export runs_on="ubuntu-latest"
+            export runs_on="itac-bm-emr-gaudi3-dell-8gaudi"
             export platform=""
-            export extra_pytest=""
+            export extra_pytest="--gaudi"
             export target=""
             esac
           echo $dockerfile
diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile
index e135f16e..40d17f61 100644
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@@ -50,11 +50,14 @@ local-dev-install: install-dependencies
 
 # In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
 run-integration-tests:
-	pip install -U pip uv
-	uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
-	uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
+
+run-integration-tests-with-all-models:
+	DOCKER_VOLUME=${root_dir}/data \
+	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi --gaudi-all-models
 
 # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
 capture-expected-outputs-for-integration-tests:
diff --git a/backends/gaudi/README.md b/backends/gaudi/README.md
index ba890f0b..7713040f 100644
--- a/backends/gaudi/README.md
+++ b/backends/gaudi/README.md
@@ -99,16 +99,26 @@ curl 127.0.0.1:8080/generate \
 
 ### Integration tests
 
+Install the dependencies:
+```bash
+pip install -r integration-tests/requirements.txt
+```
+
 To run the integration tests, you need to first build the image:
 ```bash
 make -C backends/gaudi image
 ```
 
-Then run the following command to run the integration tests:
+Then run the following command to run the integration tests (CI tests):
 ```bash
 make -C backends/gaudi run-integration-tests
 ```
 
+To run the integration tests with all models, you can run the following command:
+```bash
+make -C backends/gaudi run-integration-tests-with-all-models
+```
+
 To capture the expected outputs for the integration tests, you can run the following command:
 ```bash
 make -C backends/gaudi capture-expected-outputs-for-integration-tests
diff --git a/backends/gaudi/server/integration-tests/pytest.ini b/backends/gaudi/server/integration-tests/pytest.ini
deleted file mode 100644
index 2f4c80e3..00000000
--- a/backends/gaudi/server/integration-tests/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-asyncio_mode = auto
diff --git a/backends/gaudi/server/integration-tests/requirements.txt b/backends/gaudi/server/integration-tests/requirements.txt
deleted file mode 100644
index b67d2d8c..00000000
--- a/backends/gaudi/server/integration-tests/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-pytest >= 8.3.5
-pytest-asyncio >= 0.26.0
-docker >= 7.1.0
-Levenshtein >= 0.27.1
-loguru >= 0.7.3
-aiohttp >= 3.11.14
-text-generation
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index f7852441..9cc33416 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -1,4 +1,8 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
+pytest_plugins = [
"fixtures.neuron.service", + "fixtures.neuron.export_models", + "fixtures.gaudi.service", +] # ruff: noqa: E402 from _pytest.fixtures import SubRequest from huggingface_hub.inference._generated.types.chat_completion import ( @@ -68,6 +72,15 @@ def pytest_addoption(parser): parser.addoption( "--neuron", action="store_true", default=False, help="run neuron tests" ) + parser.addoption( + "--gaudi", action="store_true", default=False, help="run gaudi tests" + ) + parser.addoption( + "--gaudi-all-models", + action="store_true", + default=False, + help="Run tests for all models instead of just the default subset", + ) def pytest_configure(config): @@ -84,6 +97,22 @@ def pytest_collection_modifyitems(config, items): item.add_marker(pytest.mark.skip(reason="need --release option to run")) selectors.append(skip_release) + + if config.getoption("--gaudi"): + + def skip_not_gaudi(item): + if "gaudi" not in item.keywords: + item.add_marker(pytest.mark.skip(reason="requires --gaudi to run")) + + selectors.append(skip_not_gaudi) + else: + + def skip_gaudi(item): + if "gaudi" in item.keywords: + item.add_marker(pytest.mark.skip(reason="requires --gaudi to run")) + + selectors.append(skip_gaudi) + if config.getoption("--neuron"): def skip_not_neuron(item): @@ -100,6 +129,7 @@ def pytest_collection_modifyitems(config, items): item.add_marker(pytest.mark.skip(reason="requires --neuron to run")) selectors.append(skip_neuron) + for item in items: for selector in selectors: selector(item) diff --git a/backends/gaudi/server/integration-tests/conftest.py b/integration-tests/fixtures/gaudi/service.py similarity index 82% rename from backends/gaudi/server/integration-tests/conftest.py rename to integration-tests/fixtures/gaudi/service.py index c7daf70e..f4f43691 100644 --- a/backends/gaudi/server/integration-tests/conftest.py +++ b/integration-tests/fixtures/gaudi/service.py @@ -14,15 +14,21 @@ import docker import pytest from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound -from loguru import logger -from test_model import TEST_CONFIGS -from text_generation import AsyncClient -from text_generation.types import Response +import logging +from huggingface_hub import AsyncInferenceClient, TextGenerationOutput +import huggingface_hub + +logging.basicConfig( + level=logging.INFO, + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", + stream=sys.stdout, +) +logger = logging.getLogger(__file__) # Use the latest image from the local docker build DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi") DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None) -HF_TOKEN = os.getenv("HF_TOKEN", None) +HF_TOKEN = huggingface_hub.get_token() assert ( HF_TOKEN is not None @@ -48,12 +54,6 @@ HABANA_RUN_ARGS = { "cap_add": ["sys_nice"], } -logger.add( - sys.stderr, - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", - level="INFO", -) - def stream_container_logs(container, test_name): """Stream container logs in a separate thread.""" @@ -69,9 +69,15 @@ def stream_container_logs(container, test_name): logger.error(f"Error streaming container logs: {str(e)}") +class TestClient(AsyncInferenceClient): + def __init__(self, service_name: str, base_url: str): + super().__init__(model=base_url) + self.service_name = service_name + + class LauncherHandle: - def __init__(self, port: int): - self.client = AsyncClient(f"http://localhost:{port}", timeout=3600) + def __init__(self, service_name: str, port: int): + 
+        self.client = TestClient(service_name, f"http://localhost:{port}")
 
     def _inner_health(self):
         raise NotImplementedError
@@ -87,7 +93,7 @@ class LauncherHandle:
                 raise RuntimeError("Launcher crashed")
 
             try:
-                await self.client.generate("test")
+                await self.client.text_generation("test", max_new_tokens=1)
                 elapsed = time.time() - start_time
                 logger.info(f"Health check passed after {elapsed:.1f}s")
                 return
@@ -111,7 +117,8 @@ class LauncherHandle:
 
 class ContainerLauncherHandle(LauncherHandle):
     def __init__(self, docker_client, container_name, port: int):
-        super(ContainerLauncherHandle, self).__init__(port)
+        service_name = container_name  # Use container name as service name
+        super(ContainerLauncherHandle, self).__init__(service_name, port)
         self.docker_client = docker_client
         self.container_name = container_name
 
@@ -132,7 +139,8 @@ class ContainerLauncherHandle(LauncherHandle):
 
 class ProcessLauncherHandle(LauncherHandle):
     def __init__(self, process, port: int):
-        super(ProcessLauncherHandle, self).__init__(port)
+        service_name = "process"  # Use generic name for process launcher
+        super(ProcessLauncherHandle, self).__init__(service_name, port)
         self.process = process
 
     def _inner_health(self) -> bool:
@@ -151,11 +159,13 @@ def data_volume():
 
 
 @pytest.fixture(scope="module")
-def launcher(data_volume):
+def gaudi_launcher():
     @contextlib.contextmanager
     def docker_launcher(
         model_id: str,
         test_name: str,
+        tgi_args: List[str] = None,
+        env_config: dict = None,
     ):
         logger.info(
             f"Starting docker launcher for model {model_id} and test {test_name}"
@@ -183,32 +193,40 @@ def launcher(data_volume):
                 )
                 container.stop()
                 container.wait()
+                container.remove()
+                logger.info(f"Removed existing container {container_name}")
             except NotFound:
                 pass
             except Exception as e:
                 logger.error(f"Error handling existing container: {str(e)}")
 
-        model_name = next(
-            name for name, cfg in TEST_CONFIGS.items() if cfg["model_id"] == model_id
-        )
-
-        tgi_args = TEST_CONFIGS[model_name]["args"].copy()
+        if tgi_args is None:
+            tgi_args = []
+        else:
+            tgi_args = tgi_args.copy()
 
         env = BASE_ENV.copy()
 
         # Add model_id to env
         env["MODEL_ID"] = model_id
 
-        # Add env config that is definied in the fixture parameter
-        if "env_config" in TEST_CONFIGS[model_name]:
-            env.update(TEST_CONFIGS[model_name]["env_config"].copy())
+        # Add env config that is defined in the fixture parameter
+        if env_config is not None:
+            env.update(env_config.copy())
 
-        volumes = [f"{DOCKER_VOLUME}:/data"]
+        volumes = []
+        if DOCKER_VOLUME:
+            volumes = [f"{DOCKER_VOLUME}:/data"]
         logger.debug(f"Using volume {volumes}")
 
         try:
+            logger.debug(f"Using command {tgi_args}")
             logger.info(f"Creating container with name {container_name}")
+            logger.debug(f"Using environment {env}")
+            logger.debug(f"Using volumes {volumes}")
+            logger.debug(f"HABANA_RUN_ARGS {HABANA_RUN_ARGS}")
+
             # Log equivalent docker run command for debugging, this is not actually executed
 
             container = client.containers.run(
                 DOCKER_IMAGE,
@@ -271,15 +289,16 @@ def launcher(data_volume):
 
 
 @pytest.fixture(scope="module")
-def generate_load():
+def gaudi_generate_load():
     async def generate_load_inner(
-        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
-    ) -> List[Response]:
+        client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
+    ) -> List[TextGenerationOutput]:
         try:
             futures = [
-                client.generate(
+                client.text_generation(
                     prompt,
                     max_new_tokens=max_new_tokens,
+                    details=True,
                     decoder_input_details=True,
                 )
                 for _ in range(n)
diff --git a/backends/gaudi/server/integration-tests/capture_expected_outputs.py b/integration-tests/gaudi/capture_expected_outputs.py
similarity index 98%
rename from backends/gaudi/server/integration-tests/capture_expected_outputs.py
rename to integration-tests/gaudi/capture_expected_outputs.py
index 051b9d69..5a5fd179 100644
--- a/backends/gaudi/server/integration-tests/capture_expected_outputs.py
+++ b/integration-tests/gaudi/capture_expected_outputs.py
@@ -3,7 +3,7 @@ import os
 from typing import Dict, Any, Generator
 
 import pytest
-from test_model import TEST_CONFIGS
+from test_gaudi_generate import TEST_CONFIGS
 
 UNKNOWN_CONFIGS = {
     name: config
diff --git a/backends/gaudi/server/integration-tests/test_model.py b/integration-tests/gaudi/test_gaudi_generate.py
similarity index 81%
rename from backends/gaudi/server/integration-tests/test_model.py
rename to integration-tests/gaudi/test_gaudi_generate.py
index 40b27164..2b8b0c76 100644
--- a/backends/gaudi/server/integration-tests/test_model.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -1,10 +1,16 @@
-from typing import Any, Dict
-
-from text_generation import AsyncClient
+from typing import Any, Dict, Generator
+from _pytest.fixtures import SubRequest
+from huggingface_hub import AsyncInferenceClient
 import pytest
-from Levenshtein import distance as levenshtein_distance
 
-# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers", "gaudi_all_models: mark test to run with all models"
+    )
+
+
+# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
 TEST_CONFIGS = {
     "meta-llama/Llama-3.1-8B-Instruct-shared": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -25,6 +31,7 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
+        "run_by_default": True,
     },
     "meta-llama/Llama-3.1-8B-Instruct": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -42,6 +49,7 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
+        "run_by_default": True,
     },
     "meta-llama/Llama-2-7b-chat-hf": {
         "model_id": "meta-llama/Llama-2-7b-chat-hf",
@@ -181,72 +189,98 @@ TEST_CONFIGS = {
     },
 }
 
-print(f"Testing {len(TEST_CONFIGS)} models")
+
+def pytest_generate_tests(metafunc):
+    if "test_config" in metafunc.fixturenames:
+        if metafunc.config.getoption("--gaudi-all-models"):
+            models = list(TEST_CONFIGS.keys())
+        else:
+            models = [
+                name
+                for name, config in TEST_CONFIGS.items()
+                if config.get("run_by_default", False)
+            ]
+        print(f"Testing {len(models)} models")
+        metafunc.parametrize("test_config", models, indirect=True)
 
 
-@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
-def test_config(request) -> Dict[str, Any]:
+@pytest.fixture(scope="module")
+def test_config(request: SubRequest) -> Dict[str, Any]:
     """Fixture that provides model configurations for testing."""
-    test_config = TEST_CONFIGS[request.param]
-    test_config["test_name"] = request.param
+    model_name = request.param
+    test_config = TEST_CONFIGS[model_name]
+    test_config["test_name"] = model_name
     return test_config
 
 
 @pytest.fixture(scope="module")
-def model_id(test_config):
+def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]:
     yield test_config["model_id"]
 
 
 @pytest.fixture(scope="module")
-def test_name(test_config):
+def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]:
     yield test_config["test_name"]
 
 
 @pytest.fixture(scope="module")
-def expected_outputs(test_config):
+def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, str]:
     return {
         "greedy": test_config["expected_greedy_output"],
-        # "sampling": model_config["expected_sampling_output"],
         "batch": test_config["expected_batch_output"],
     }
 
 
 @pytest.fixture(scope="module")
-def input(test_config):
+def input(test_config: Dict[str, Any]) -> str:
    return test_config["input"]
 
 
 @pytest.fixture(scope="module")
-def tgi_service(launcher, model_id, test_name):
-    with launcher(model_id, test_name) as tgi_service:
+def tgi_service(
+    gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]
+):
+    with gaudi_launcher(
+        model_id,
+        test_name,
+        tgi_args=test_config.get("args", []),
+        env_config=test_config.get("env_config", {}),
+    ) as tgi_service:
         yield tgi_service
 
 
 @pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
+async def tgi_client(tgi_service) -> AsyncInferenceClient:
     await tgi_service.health(1000)
     return tgi_service.client
 
 
 @pytest.mark.asyncio
+@pytest.mark.all_models
 async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
+    tgi_client: AsyncInferenceClient, expected_outputs: Dict[str, str], input: str
 ):
     # Bounded greedy decoding without input
-    response = await tgi_client.generate(
+    response = await tgi_client.text_generation(
         input,
         max_new_tokens=32,
+        details=True,
+        decoder_input_details=True,
     )
 
     assert response.details.generated_tokens == 32
     assert response.generated_text == expected_outputs["greedy"]
 
 
 @pytest.mark.asyncio
+@pytest.mark.all_models
 async def test_model_multiple_requests(
-    tgi_client, generate_load, expected_outputs, input
+    tgi_client: AsyncInferenceClient,
+    gaudi_generate_load,
+    expected_outputs: Dict[str, str],
+    input: str,
 ):
     num_requests = 4
-    responses = await generate_load(
+    responses = await gaudi_generate_load(
         tgi_client,
         input,
         max_new_tokens=32,
@@ -257,6 +291,4 @@ async def test_model_multiple_requests(
     expected = expected_outputs["batch"]
     for r in responses:
         assert r.details.generated_tokens == 32
-        # Compute the similarity with the expectation using the levenshtein distance
-        # We should not have more than two substitutions or additions
-        assert levenshtein_distance(r.generated_text, expected) < 3
+        assert r.generated_text == expected
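
Note for adding models later: the fixtures above read a fixed set of keys from each `TEST_CONFIGS` entry (`model_id`, `input`, `expected_greedy_output`, `expected_batch_output`, `args`, plus the optional `env_config` and `run_by_default`). The sketch below shows the expected shape of one entry; the model name, prompt, launcher flags, environment variable, and expected strings are all hypothetical placeholders, not values taken from this PR.

```python
# Hypothetical shape of one TEST_CONFIGS entry in
# integration-tests/gaudi/test_gaudi_generate.py. Every value here is a
# placeholder; real expected outputs come from
# `make -C backends/gaudi capture-expected-outputs-for-integration-tests`.
EXAMPLE_ENTRY = {
    "example-org/Example-Model": {  # display name, used as the test id
        "model_id": "example-org/Example-Model",  # hypothetical HF repo id
        "input": "What is Deep Learning?",  # placeholder prompt
        "expected_greedy_output": "<captured greedy output>",
        "expected_batch_output": "<captured batch output>",
        # Launcher flags, forwarded by the tgi_service fixture as gaudi_launcher(tgi_args=...)
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
        ],
        # Optional container environment, forwarded as gaudi_launcher(env_config=...)
        "env_config": {"EXAMPLE_ENV_VAR": "1"},
        # Only entries with run_by_default=True are parametrized by default;
        # the rest need --gaudi-all-models.
        "run_by_default": False,
    },
}
```

An entry like this would be merged into the `TEST_CONFIGS` dict in `test_gaudi_generate.py`; `pytest_generate_tests` then leaves it out of the default CI subset and only parametrizes it when `--gaudi-all-models` is passed (the `run-integration-tests-with-all-models` target), unless `run_by_default` is set to `True`.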