diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index a87191c2..14c69a2b 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -129,9 +129,9 @@ jobs:
export label_extension="-gaudi"
export docker_volume="/mnt/cache"
export docker_devices=""
- export runs_on="ubuntu-latest"
+ export runs_on="itac-bm-emr-gaudi3-dell-8gaudi"
export platform=""
- export extra_pytest=""
+ export extra_pytest="--gaudi"
export target=""
esac
echo $dockerfile
diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile
index e135f16e..40d17f61 100644
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@@ -50,11 +50,14 @@ local-dev-install: install-dependencies
# In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
run-integration-tests:
- pip install -U pip uv
- uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
DOCKER_VOLUME=${root_dir}/data \
HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
- uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
+ pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
+
+run-integration-tests-with-all-models:
+ DOCKER_VOLUME=${root_dir}/data \
+ HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+ pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi --gaudi-all-models
# This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
capture-expected-outputs-for-integration-tests:
diff --git a/backends/gaudi/README.md b/backends/gaudi/README.md
index ba890f0b..7713040f 100644
--- a/backends/gaudi/README.md
+++ b/backends/gaudi/README.md
@@ -99,16 +99,26 @@ curl 127.0.0.1:8080/generate \
### Integration tests
+Install the dependencies:
+```bash
+pip install -r integration-tests/requirements.txt
+```
+
To run the integration tests, you need to first build the image:
```bash
make -C backends/gaudi image
```
-Then run the following command to run the integration tests:
+Then run the following command to run the CI subset of the integration tests:
```bash
make -C backends/gaudi run-integration-tests
```
+To run the integration tests with all models, you can run the following command:
+```bash
+make -C backends/gaudi run-integration-tests-with-all-models
+```
+
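+Both `make` targets are thin wrappers around `pytest`; a roughly equivalent manual invocation, assuming it is run from the repository root, is:
+```bash
+DOCKER_VOLUME=$(pwd)/data \
+HF_TOKEN=$(cat ~/.cache/huggingface/token) \
+pytest --durations=0 -s -vv integration-tests --gaudi  # add --gaudi-all-models to cover every model
+```
+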
To capture the expected outputs for the integration tests, you can run the following command:
```bash
make -C backends/gaudi capture-expected-outputs-for-integration-tests
diff --git a/backends/gaudi/server/integration-tests/pytest.ini b/backends/gaudi/server/integration-tests/pytest.ini
deleted file mode 100644
index 2f4c80e3..00000000
--- a/backends/gaudi/server/integration-tests/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-asyncio_mode = auto
diff --git a/backends/gaudi/server/integration-tests/requirements.txt b/backends/gaudi/server/integration-tests/requirements.txt
deleted file mode 100644
index b67d2d8c..00000000
--- a/backends/gaudi/server/integration-tests/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-pytest >= 8.3.5
-pytest-asyncio >= 0.26.0
-docker >= 7.1.0
-Levenshtein >= 0.27.1
-loguru >= 0.7.3
-aiohttp >= 3.11.14
-text-generation
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index f7852441..9cc33416 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -1,4 +1,8 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
+pytest_plugins = [
+ "fixtures.neuron.service",
+ "fixtures.neuron.export_models",
+ "fixtures.gaudi.service",
+]
# ruff: noqa: E402
from _pytest.fixtures import SubRequest
from huggingface_hub.inference._generated.types.chat_completion import (
@@ -68,6 +72,15 @@ def pytest_addoption(parser):
parser.addoption(
"--neuron", action="store_true", default=False, help="run neuron tests"
)
+ parser.addoption(
+ "--gaudi", action="store_true", default=False, help="run gaudi tests"
+ )
+ parser.addoption(
+ "--gaudi-all-models",
+ action="store_true",
+ default=False,
+ help="Run tests for all models instead of just the default subset",
+ )
def pytest_configure(config):
@@ -84,6 +97,22 @@ def pytest_collection_modifyitems(config, items):
item.add_marker(pytest.mark.skip(reason="need --release option to run"))
selectors.append(skip_release)
+
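+    # Mirror the --neuron handling below: with --gaudi only gaudi tests run; without it, gaudi tests are skipped.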
+ if config.getoption("--gaudi"):
+
+ def skip_not_gaudi(item):
+ if "gaudi" not in item.keywords:
+ item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+ selectors.append(skip_not_gaudi)
+ else:
+
+ def skip_gaudi(item):
+ if "gaudi" in item.keywords:
+ item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+ selectors.append(skip_gaudi)
+
if config.getoption("--neuron"):
def skip_not_neuron(item):
@@ -100,6 +129,7 @@ def pytest_collection_modifyitems(config, items):
item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
selectors.append(skip_neuron)
+
for item in items:
for selector in selectors:
selector(item)
diff --git a/backends/gaudi/server/integration-tests/conftest.py b/integration-tests/fixtures/gaudi/service.py
similarity index 82%
rename from backends/gaudi/server/integration-tests/conftest.py
rename to integration-tests/fixtures/gaudi/service.py
index c7daf70e..f4f43691 100644
--- a/backends/gaudi/server/integration-tests/conftest.py
+++ b/integration-tests/fixtures/gaudi/service.py
@@ -14,15 +14,21 @@ import docker
import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound
-from loguru import logger
-from test_model import TEST_CONFIGS
-from text_generation import AsyncClient
-from text_generation.types import Response
+import logging
+from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
+import huggingface_hub
+
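+# Route test logging to stdout via the standard library (replaces the previous loguru setup).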
+logging.basicConfig(
+ level=logging.INFO,
+ format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
+ stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
# Use the latest image from the local docker build
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
-HF_TOKEN = os.getenv("HF_TOKEN", None)
+HF_TOKEN = huggingface_hub.get_token()
assert (
HF_TOKEN is not None
@@ -48,12 +54,6 @@ HABANA_RUN_ARGS = {
"cap_add": ["sys_nice"],
}
-logger.add(
- sys.stderr,
- format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
- level="INFO",
-)
-
def stream_container_logs(container, test_name):
"""Stream container logs in a separate thread."""
@@ -69,9 +69,15 @@ def stream_container_logs(container, test_name):
logger.error(f"Error streaming container logs: {str(e)}")
+class TestClient(AsyncInferenceClient):
+ def __init__(self, service_name: str, base_url: str):
+ super().__init__(model=base_url)
+ self.service_name = service_name
+
+
class LauncherHandle:
- def __init__(self, port: int):
- self.client = AsyncClient(f"http://localhost:{port}", timeout=3600)
+ def __init__(self, service_name: str, port: int):
+ self.client = TestClient(service_name, f"http://localhost:{port}")
def _inner_health(self):
raise NotImplementedError
@@ -87,7 +93,7 @@ class LauncherHandle:
raise RuntimeError("Launcher crashed")
try:
- await self.client.generate("test")
+ await self.client.text_generation("test", max_new_tokens=1)
elapsed = time.time() - start_time
logger.info(f"Health check passed after {elapsed:.1f}s")
return
@@ -111,7 +117,8 @@ class LauncherHandle:
class ContainerLauncherHandle(LauncherHandle):
def __init__(self, docker_client, container_name, port: int):
- super(ContainerLauncherHandle, self).__init__(port)
+ service_name = container_name # Use container name as service name
+ super(ContainerLauncherHandle, self).__init__(service_name, port)
self.docker_client = docker_client
self.container_name = container_name
@@ -132,7 +139,8 @@ class ContainerLauncherHandle(LauncherHandle):
class ProcessLauncherHandle(LauncherHandle):
def __init__(self, process, port: int):
- super(ProcessLauncherHandle, self).__init__(port)
+ service_name = "process" # Use generic name for process launcher
+ super(ProcessLauncherHandle, self).__init__(service_name, port)
self.process = process
def _inner_health(self) -> bool:
@@ -151,11 +159,13 @@ def data_volume():
@pytest.fixture(scope="module")
-def launcher(data_volume):
+def gaudi_launcher():
@contextlib.contextmanager
def docker_launcher(
model_id: str,
test_name: str,
+ tgi_args: List[str] = None,
+ env_config: dict = None,
):
logger.info(
f"Starting docker launcher for model {model_id} and test {test_name}"
@@ -183,32 +193,40 @@ def launcher(data_volume):
)
container.stop()
container.wait()
+ container.remove()
+ logger.info(f"Removed existing container {container_name}")
except NotFound:
pass
except Exception as e:
logger.error(f"Error handling existing container: {str(e)}")
- model_name = next(
- name for name, cfg in TEST_CONFIGS.items() if cfg["model_id"] == model_id
- )
-
- tgi_args = TEST_CONFIGS[model_name]["args"].copy()
+ if tgi_args is None:
+ tgi_args = []
+ else:
+ tgi_args = tgi_args.copy()
env = BASE_ENV.copy()
# Add model_id to env
env["MODEL_ID"] = model_id
- # Add env config that is definied in the fixture parameter
- if "env_config" in TEST_CONFIGS[model_name]:
- env.update(TEST_CONFIGS[model_name]["env_config"].copy())
+ # Add env config that is defined in the fixture parameter
+ if env_config is not None:
+ env.update(env_config.copy())
- volumes = [f"{DOCKER_VOLUME}:/data"]
+ volumes = []
+ if DOCKER_VOLUME:
+ volumes = [f"{DOCKER_VOLUME}:/data"]
logger.debug(f"Using volume {volumes}")
try:
+ logger.debug(f"Using command {tgi_args}")
logger.info(f"Creating container with name {container_name}")
+ logger.debug(f"Using environment {env}")
+ logger.debug(f"Using volumes {volumes}")
+ logger.debug(f"HABANA_RUN_ARGS {HABANA_RUN_ARGS}")
+
# Log equivalent docker run command for debugging, this is not actually executed
container = client.containers.run(
DOCKER_IMAGE,
@@ -271,15 +289,16 @@ def launcher(data_volume):
@pytest.fixture(scope="module")
-def generate_load():
+def gaudi_generate_load():
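+    # Issue `n` concurrent text_generation requests against one client and return the detailed responses.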
async def generate_load_inner(
- client: AsyncClient, prompt: str, max_new_tokens: int, n: int
- ) -> List[Response]:
+ client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
+ ) -> List[TextGenerationOutput]:
try:
futures = [
- client.generate(
+ client.text_generation(
prompt,
max_new_tokens=max_new_tokens,
+ details=True,
decoder_input_details=True,
)
for _ in range(n)
diff --git a/backends/gaudi/server/integration-tests/capture_expected_outputs.py b/integration-tests/gaudi/capture_expected_outputs.py
similarity index 98%
rename from backends/gaudi/server/integration-tests/capture_expected_outputs.py
rename to integration-tests/gaudi/capture_expected_outputs.py
index 051b9d69..5a5fd179 100644
--- a/backends/gaudi/server/integration-tests/capture_expected_outputs.py
+++ b/integration-tests/gaudi/capture_expected_outputs.py
@@ -3,7 +3,7 @@ import os
from typing import Dict, Any, Generator
import pytest
-from test_model import TEST_CONFIGS
+from test_gaudi_generate import TEST_CONFIGS
UNKNOWN_CONFIGS = {
name: config
diff --git a/backends/gaudi/server/integration-tests/test_model.py b/integration-tests/gaudi/test_gaudi_generate.py
similarity index 81%
rename from backends/gaudi/server/integration-tests/test_model.py
rename to integration-tests/gaudi/test_gaudi_generate.py
index 40b27164..2b8b0c76 100644
--- a/backends/gaudi/server/integration-tests/test_model.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -1,10 +1,16 @@
-from typing import Any, Dict
-
-from text_generation import AsyncClient
+from typing import Any, Dict, Generator
+from _pytest.fixtures import SubRequest
+from huggingface_hub import AsyncInferenceClient
import pytest
-from Levenshtein import distance as levenshtein_distance
-# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
+
+def pytest_configure(config):
+ config.addinivalue_line(
+ "markers", "gaudi_all_models: mark test to run with all models"
+ )
+
+
+# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
TEST_CONFIGS = {
"meta-llama/Llama-3.1-8B-Instruct-shared": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -25,6 +31,7 @@ TEST_CONFIGS = {
"--max-batch-prefill-tokens",
"2048",
],
+ "run_by_default": True,
},
"meta-llama/Llama-3.1-8B-Instruct": {
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -42,6 +49,7 @@ TEST_CONFIGS = {
"--max-batch-prefill-tokens",
"2048",
],
+ "run_by_default": True,
},
"meta-llama/Llama-2-7b-chat-hf": {
"model_id": "meta-llama/Llama-2-7b-chat-hf",
@@ -181,72 +189,98 @@ TEST_CONFIGS = {
},
}
-print(f"Testing {len(TEST_CONFIGS)} models")
+
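+# Parametrize test_config: --gaudi-all-models selects every entry in TEST_CONFIGS, otherwise only the configs flagged run_by_default.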
+def pytest_generate_tests(metafunc):
+ if "test_config" in metafunc.fixturenames:
+ if metafunc.config.getoption("--gaudi-all-models"):
+ models = list(TEST_CONFIGS.keys())
+ else:
+ models = [
+ name
+ for name, config in TEST_CONFIGS.items()
+ if config.get("run_by_default", False)
+ ]
+ print(f"Testing {len(models)} models")
+ metafunc.parametrize("test_config", models, indirect=True)
-@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
-def test_config(request) -> Dict[str, Any]:
+@pytest.fixture(scope="module")
+def test_config(request: SubRequest) -> Dict[str, Any]:
"""Fixture that provides model configurations for testing."""
- test_config = TEST_CONFIGS[request.param]
- test_config["test_name"] = request.param
+ model_name = request.param
+ test_config = TEST_CONFIGS[model_name]
+ test_config["test_name"] = model_name
return test_config
@pytest.fixture(scope="module")
-def model_id(test_config):
+def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]:
yield test_config["model_id"]
@pytest.fixture(scope="module")
-def test_name(test_config):
+def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]:
yield test_config["test_name"]
@pytest.fixture(scope="module")
-def expected_outputs(test_config):
+def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, str]:
return {
"greedy": test_config["expected_greedy_output"],
- # "sampling": model_config["expected_sampling_output"],
"batch": test_config["expected_batch_output"],
}
@pytest.fixture(scope="module")
-def input(test_config):
+def input(test_config: Dict[str, Any]) -> str:
return test_config["input"]
@pytest.fixture(scope="module")
-def tgi_service(launcher, model_id, test_name):
- with launcher(model_id, test_name) as tgi_service:
+def tgi_service(
+ gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]
+):
+ with gaudi_launcher(
+ model_id,
+ test_name,
+ tgi_args=test_config.get("args", []),
+ env_config=test_config.get("env_config", {}),
+ ) as tgi_service:
yield tgi_service
@pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
+async def tgi_client(tgi_service) -> AsyncInferenceClient:
await tgi_service.health(1000)
return tgi_service.client
@pytest.mark.asyncio
+@pytest.mark.gaudi_all_models
async def test_model_single_request(
- tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
+ tgi_client: AsyncInferenceClient, expected_outputs: Dict[str, str], input: str
):
# Bounded greedy decoding without input
- response = await tgi_client.generate(
+ response = await tgi_client.text_generation(
input,
max_new_tokens=32,
+ details=True,
+ decoder_input_details=True,
)
assert response.details.generated_tokens == 32
assert response.generated_text == expected_outputs["greedy"]
@pytest.mark.asyncio
+@pytest.mark.gaudi_all_models
async def test_model_multiple_requests(
- tgi_client, generate_load, expected_outputs, input
+ tgi_client: AsyncInferenceClient,
+ gaudi_generate_load,
+ expected_outputs: Dict[str, str],
+ input: str,
):
num_requests = 4
- responses = await generate_load(
+ responses = await gaudi_generate_load(
tgi_client,
input,
max_new_tokens=32,
@@ -257,6 +291,4 @@ async def test_model_multiple_requests(
expected = expected_outputs["batch"]
for r in responses:
assert r.details.generated_tokens == 32
- # Compute the similarity with the expectation using the levenshtein distance
- # We should not have more than two substitutions or additions
- assert levenshtein_distance(r.generated_text, expected) < 3
+ assert r.generated_text == expected