Baptiste Colle 2025-06-23 12:27:39 +00:00 committed by GitHub
commit 737b1d8369
9 changed files with 157 additions and 72 deletions

View File

@@ -129,9 +129,9 @@ jobs:
           export label_extension="-gaudi"
           export docker_volume="/mnt/cache"
           export docker_devices=""
-          export runs_on="ubuntu-latest"
+          export runs_on="itac-bm-emr-gaudi3-dell-8gaudi"
           export platform=""
-          export extra_pytest=""
+          export extra_pytest="--gaudi"
           export target=""
           esac
           echo $dockerfile

View File

@@ -50,11 +50,14 @@ local-dev-install: install-dependencies
 # In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
 run-integration-tests:
-	pip install -U pip uv
-	uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
-	uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
+
+run-integration-tests-with-all-models:
+	DOCKER_VOLUME=${root_dir}/data \
+	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi --gaudi-all-models
 
 # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
 capture-expected-outputs-for-integration-tests:

View File

@@ -99,16 +99,26 @@ curl 127.0.0.1:8080/generate \
 ### Integration tests
 
+Install the dependencies:
+```bash
+pip install -r integration-tests/requirements.txt
+```
+
 To run the integration tests, you need to first build the image:
 ```bash
 make -C backends/gaudi image
 ```
 
-Then run the following command to run the integration tests:
+Then run the following command to run the integration tests (CI tests):
 ```bash
 make -C backends/gaudi run-integration-tests
 ```
 
+To run the integration tests with all models, you can run the following command:
+```bash
+make -C backends/gaudi run-integration-tests-with-all-models
+```
+
 To capture the expected outputs for the integration tests, you can run the following command:
 ```bash
 make -C backends/gaudi capture-expected-outputs-for-integration-tests

View File

@@ -1,2 +0,0 @@
-[pytest]
-asyncio_mode = auto

View File

@@ -1,7 +0,0 @@
-pytest >= 8.3.5
-pytest-asyncio >= 0.26.0
-docker >= 7.1.0
-Levenshtein >= 0.27.1
-loguru >= 0.7.3
-aiohttp >= 3.11.14
-text-generation

View File

@@ -1,4 +1,8 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
+pytest_plugins = [
+    "fixtures.neuron.service",
+    "fixtures.neuron.export_models",
+    "fixtures.gaudi.service",
+]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 from huggingface_hub.inference._generated.types.chat_completion import (
@@ -68,6 +72,15 @@ def pytest_addoption(parser):
     parser.addoption(
         "--neuron", action="store_true", default=False, help="run neuron tests"
     )
+    parser.addoption(
+        "--gaudi", action="store_true", default=False, help="run gaudi tests"
+    )
+    parser.addoption(
+        "--gaudi-all-models",
+        action="store_true",
+        default=False,
+        help="Run tests for all models instead of just the default subset",
+    )
 
 
 def pytest_configure(config):
@@ -84,6 +97,22 @@ def pytest_collection_modifyitems(config, items):
                 item.add_marker(pytest.mark.skip(reason="need --release option to run"))
 
         selectors.append(skip_release)
+    if config.getoption("--gaudi"):
+
+        def skip_not_gaudi(item):
+            if "gaudi" not in item.keywords:
+                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+        selectors.append(skip_not_gaudi)
+    else:
+
+        def skip_gaudi(item):
+            if "gaudi" in item.keywords:
+                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+        selectors.append(skip_gaudi)
     if config.getoption("--neuron"):
 
         def skip_not_neuron(item):
@@ -100,6 +129,7 @@ def pytest_collection_modifyitems(config, items):
                 item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
 
         selectors.append(skip_neuron)
+
     for item in items:
         for selector in selectors:
            selector(item)
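For orientation, here is a minimal sketch, not part of this commit, of how a test module would pick up the new gate: the selectors above key off a `gaudi` entry in `item.keywords`, so a module-level `pytestmark` is enough to make tests run only under `--gaudi`. The fixture name and prompt below are assumptions for illustration.

```python
# Sketch only: opting a test module into the --gaudi gate added above.
import pytest

# pytest adds "gaudi" to every item's keywords in this module, so these tests
# are collected with `pytest integration-tests --gaudi` and skipped otherwise.
pytestmark = pytest.mark.gaudi


@pytest.mark.asyncio
async def test_smoke(tgi_client):
    # tgi_client is assumed to be provided by the gaudi service fixtures
    response = await tgi_client.text_generation(
        "What is Deep Learning?", max_new_tokens=4, details=True
    )
    assert response.details.generated_tokens == 4
```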

View File

@@ -14,15 +14,21 @@ import docker
 import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
-from loguru import logger
-from test_model import TEST_CONFIGS
-from text_generation import AsyncClient
-from text_generation.types import Response
+import logging
+from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
+import huggingface_hub
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
 
 # Use the latest image from the local docker build
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
-HF_TOKEN = os.getenv("HF_TOKEN", None)
+HF_TOKEN = huggingface_hub.get_token()
 
 assert (
     HF_TOKEN is not None
@@ -48,12 +54,6 @@ HABANA_RUN_ARGS = {
     "cap_add": ["sys_nice"],
 }
 
-logger.add(
-    sys.stderr,
-    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
-    level="INFO",
-)
-
 
 def stream_container_logs(container, test_name):
     """Stream container logs in a separate thread."""
@@ -69,9 +69,15 @@ def stream_container_logs(container, test_name):
         logger.error(f"Error streaming container logs: {str(e)}")
 
 
+class TestClient(AsyncInferenceClient):
+    def __init__(self, service_name: str, base_url: str):
+        super().__init__(model=base_url)
+        self.service_name = service_name
+
+
 class LauncherHandle:
-    def __init__(self, port: int):
-        self.client = AsyncClient(f"http://localhost:{port}", timeout=3600)
+    def __init__(self, service_name: str, port: int):
+        self.client = TestClient(service_name, f"http://localhost:{port}")
 
     def _inner_health(self):
         raise NotImplementedError
@@ -87,7 +93,7 @@ class LauncherHandle:
                 raise RuntimeError("Launcher crashed")
 
             try:
-                await self.client.generate("test")
+                await self.client.text_generation("test", max_new_tokens=1)
                 elapsed = time.time() - start_time
                 logger.info(f"Health check passed after {elapsed:.1f}s")
                 return
@@ -111,7 +117,8 @@
 class ContainerLauncherHandle(LauncherHandle):
     def __init__(self, docker_client, container_name, port: int):
-        super(ContainerLauncherHandle, self).__init__(port)
+        service_name = container_name  # Use container name as service name
+        super(ContainerLauncherHandle, self).__init__(service_name, port)
         self.docker_client = docker_client
         self.container_name = container_name
@@ -132,7 +139,8 @@
 class ProcessLauncherHandle(LauncherHandle):
     def __init__(self, process, port: int):
-        super(ProcessLauncherHandle, self).__init__(port)
+        service_name = "process"  # Use generic name for process launcher
+        super(ProcessLauncherHandle, self).__init__(service_name, port)
         self.process = process
 
     def _inner_health(self) -> bool:
@@ -151,11 +159,13 @@ def data_volume():
 @pytest.fixture(scope="module")
-def launcher(data_volume):
+def gaudi_launcher():
     @contextlib.contextmanager
     def docker_launcher(
         model_id: str,
         test_name: str,
+        tgi_args: List[str] = None,
+        env_config: dict = None,
     ):
         logger.info(
             f"Starting docker launcher for model {model_id} and test {test_name}"
@@ -183,32 +193,40 @@
             )
             container.stop()
             container.wait()
+            container.remove()
+            logger.info(f"Removed existing container {container_name}")
         except NotFound:
             pass
         except Exception as e:
             logger.error(f"Error handling existing container: {str(e)}")
 
-        model_name = next(
-            name for name, cfg in TEST_CONFIGS.items() if cfg["model_id"] == model_id
-        )
-        tgi_args = TEST_CONFIGS[model_name]["args"].copy()
+        if tgi_args is None:
+            tgi_args = []
+        else:
+            tgi_args = tgi_args.copy()
 
         env = BASE_ENV.copy()
 
         # Add model_id to env
         env["MODEL_ID"] = model_id
 
-        # Add env config that is definied in the fixture parameter
-        if "env_config" in TEST_CONFIGS[model_name]:
-            env.update(TEST_CONFIGS[model_name]["env_config"].copy())
+        # Add env config that is defined in the fixture parameter
+        if env_config is not None:
+            env.update(env_config.copy())
 
-        volumes = [f"{DOCKER_VOLUME}:/data"]
+        volumes = []
+        if DOCKER_VOLUME:
+            volumes = [f"{DOCKER_VOLUME}:/data"]
         logger.debug(f"Using volume {volumes}")
 
         try:
+            logger.debug(f"Using command {tgi_args}")
             logger.info(f"Creating container with name {container_name}")
+            logger.debug(f"Using environment {env}")
+            logger.debug(f"Using volumes {volumes}")
+            logger.debug(f"HABANA_RUN_ARGS {HABANA_RUN_ARGS}")
 
             # Log equivalent docker run command for debugging, this is not actually executed
             container = client.containers.run(
                 DOCKER_IMAGE,
@@ -271,15 +289,16 @@ def launcher(data_volume):
 @pytest.fixture(scope="module")
-def generate_load():
+def gaudi_generate_load():
     async def generate_load_inner(
-        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
-    ) -> List[Response]:
+        client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
+    ) -> List[TextGenerationOutput]:
         try:
             futures = [
-                client.generate(
+                client.text_generation(
                     prompt,
                     max_new_tokens=max_new_tokens,
+                    details=True,
                     decoder_input_details=True,
                 )
                 for _ in range(n)
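As a side note on the client swap above, here is a small standalone sketch of the `huggingface_hub.AsyncInferenceClient.text_generation` pattern the fixtures now rely on; the base URL is an assumption, and `details=True` is what exposes the `details.generated_tokens` field used in the assertions.

```python
# Standalone sketch (endpoint URL assumed): the call pattern used by the fixtures.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main():
    client = AsyncInferenceClient(model="http://localhost:8080")  # assumed local TGI endpoint
    # Without details=True the call returns a plain string; with it, a
    # TextGenerationOutput carrying generated_text plus per-request details.
    out = await client.text_generation("test", max_new_tokens=4, details=True)
    print(out.generated_text, out.details.generated_tokens)


asyncio.run(main())
```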

View File

@@ -3,7 +3,7 @@ import os
 from typing import Dict, Any, Generator
 
 import pytest
-from test_model import TEST_CONFIGS
+from test_gaudi_generate import TEST_CONFIGS
 
 UNKNOWN_CONFIGS = {
     name: config

View File

@@ -1,10 +1,16 @@
-from typing import Any, Dict
+from typing import Any, Dict, Generator
 
-from text_generation import AsyncClient
+from _pytest.fixtures import SubRequest
+from huggingface_hub import AsyncInferenceClient
 import pytest
-from Levenshtein import distance as levenshtein_distance
 
-# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers", "gaudi_all_models: mark test to run with all models"
+    )
+
+
+# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
 TEST_CONFIGS = {
     "meta-llama/Llama-3.1-8B-Instruct-shared": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -25,6 +31,7 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
+        "run_by_default": True,
     },
     "meta-llama/Llama-3.1-8B-Instruct": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -42,6 +49,7 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
+        "run_by_default": True,
     },
     "meta-llama/Llama-2-7b-chat-hf": {
        "model_id": "meta-llama/Llama-2-7b-chat-hf",
@@ -181,72 +189,98 @@ TEST_CONFIGS = {
     },
 }
 
-print(f"Testing {len(TEST_CONFIGS)} models")
+
+def pytest_generate_tests(metafunc):
+    if "test_config" in metafunc.fixturenames:
+        if metafunc.config.getoption("--gaudi-all-models"):
+            models = list(TEST_CONFIGS.keys())
+        else:
+            models = [
+                name
+                for name, config in TEST_CONFIGS.items()
+                if config.get("run_by_default", False)
+            ]
+        print(f"Testing {len(models)} models")
+        metafunc.parametrize("test_config", models, indirect=True)
 
 
-@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
-def test_config(request) -> Dict[str, Any]:
+@pytest.fixture(scope="module")
+def test_config(request: SubRequest) -> Dict[str, Any]:
     """Fixture that provides model configurations for testing."""
-    test_config = TEST_CONFIGS[request.param]
-    test_config["test_name"] = request.param
+    model_name = request.param
+    test_config = TEST_CONFIGS[model_name]
+    test_config["test_name"] = model_name
     return test_config
 
 
 @pytest.fixture(scope="module")
-def model_id(test_config):
+def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]:
     yield test_config["model_id"]
 
 
 @pytest.fixture(scope="module")
-def test_name(test_config):
+def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]:
     yield test_config["test_name"]
 
 
 @pytest.fixture(scope="module")
-def expected_outputs(test_config):
+def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, str]:
     return {
         "greedy": test_config["expected_greedy_output"],
-        # "sampling": model_config["expected_sampling_output"],
         "batch": test_config["expected_batch_output"],
     }
 
 
 @pytest.fixture(scope="module")
-def input(test_config):
+def input(test_config: Dict[str, Any]) -> str:
     return test_config["input"]
 
 
 @pytest.fixture(scope="module")
-def tgi_service(launcher, model_id, test_name):
-    with launcher(model_id, test_name) as tgi_service:
+def tgi_service(
+    gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]
+):
+    with gaudi_launcher(
+        model_id,
+        test_name,
+        tgi_args=test_config.get("args", []),
+        env_config=test_config.get("env_config", {}),
+    ) as tgi_service:
         yield tgi_service
 
 
 @pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
+async def tgi_client(tgi_service) -> AsyncInferenceClient:
     await tgi_service.health(1000)
     return tgi_service.client
 
 
 @pytest.mark.asyncio
+@pytest.mark.all_models
 async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
+    tgi_client: AsyncInferenceClient, expected_outputs: Dict[str, str], input: str
 ):
     # Bounded greedy decoding without input
-    response = await tgi_client.generate(
+    response = await tgi_client.text_generation(
         input,
         max_new_tokens=32,
+        details=True,
         decoder_input_details=True,
     )
 
     assert response.details.generated_tokens == 32
     assert response.generated_text == expected_outputs["greedy"]
 
 
 @pytest.mark.asyncio
+@pytest.mark.all_models
 async def test_model_multiple_requests(
-    tgi_client, generate_load, expected_outputs, input
+    tgi_client: AsyncInferenceClient,
+    gaudi_generate_load,
+    expected_outputs: Dict[str, str],
+    input: str,
 ):
     num_requests = 4
-    responses = await generate_load(
+    responses = await gaudi_generate_load(
         tgi_client,
         input,
         max_new_tokens=32,
@@ -257,6 +291,4 @@ async def test_model_multiple_requests(
     expected = expected_outputs["batch"]
     for r in responses:
         assert r.details.generated_tokens == 32
-        # Compute the similarity with the expectation using the levenshtein distance
-        # We should not have more than two substitutions or additions
-        assert levenshtein_distance(r.generated_text, expected) < 3
+        assert r.generated_text == expected
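To make the selection rule concrete, here is a self-contained sketch (config names invented) of what `pytest_generate_tests` effectively computes when `--gaudi-all-models` is not passed:

```python
# Illustrative only: mirrors the run_by_default filtering in pytest_generate_tests.
TEST_CONFIGS = {
    "model-a": {"model_id": "org/model-a", "run_by_default": True},  # CI subset
    "model-b": {"model_id": "org/model-b"},  # selected only with --gaudi-all-models
}

gaudi_all_models = False  # i.e. pytest was invoked without --gaudi-all-models
if gaudi_all_models:
    models = list(TEST_CONFIGS.keys())
else:
    models = [
        name for name, cfg in TEST_CONFIGS.items() if cfg.get("run_by_default", False)
    ]

print(f"Testing {len(models)} models: {models}")  # Testing 1 models: ['model-a']
```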