Baptiste Colle 2025-06-23 12:27:39 +00:00 committed by GitHub
commit 737b1d8369
9 changed files with 157 additions and 72 deletions

View File

@@ -129,9 +129,9 @@ jobs:
           export label_extension="-gaudi"
           export docker_volume="/mnt/cache"
           export docker_devices=""
-          export runs_on="ubuntu-latest"
+          export runs_on="itac-bm-emr-gaudi3-dell-8gaudi"
           export platform=""
-          export extra_pytest=""
+          export extra_pytest="--gaudi"
           export target=""
           esac
           echo $dockerfile

View File

@@ -50,11 +50,14 @@ local-dev-install: install-dependencies
 # In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
 run-integration-tests:
-	pip install -U pip uv
-	uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
-	uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
+
+run-integration-tests-with-all-models:
+	DOCKER_VOLUME=${root_dir}/data \
+	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi --gaudi-all-models
 
 # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
 capture-expected-outputs-for-integration-tests:

View File

@@ -99,16 +99,26 @@ curl 127.0.0.1:8080/generate \
 ### Integration tests
 
+Install the dependencies:
+```bash
+pip install -r integration-tests/requirements.txt
+```
+
 To run the integration tests, you need to first build the image:
 ```bash
 make -C backends/gaudi image
 ```
 
-Then run the following command to run the integration tests:
+Then run the following command to run the integration tests (CI tests):
 ```bash
 make -C backends/gaudi run-integration-tests
 ```
 
+To run the integration tests with all models, you can run the following command:
+```bash
+make -C backends/gaudi run-integration-tests-with-all-models
+```
+
 To capture the expected outputs for the integration tests, you can run the following command:
 ```bash
 make -C backends/gaudi capture-expected-outputs-for-integration-tests

View File

@@ -1,2 +0,0 @@
-[pytest]
-asyncio_mode = auto

View File

@@ -1,7 +0,0 @@
-pytest >= 8.3.5
-pytest-asyncio >= 0.26.0
-docker >= 7.1.0
-Levenshtein >= 0.27.1
-loguru >= 0.7.3
-aiohttp >= 3.11.14
-text-generation

View File

@@ -1,4 +1,8 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
+pytest_plugins = [
+    "fixtures.neuron.service",
+    "fixtures.neuron.export_models",
+    "fixtures.gaudi.service",
+]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 from huggingface_hub.inference._generated.types.chat_completion import (
@@ -68,6 +72,15 @@ def pytest_addoption(parser):
     parser.addoption(
         "--neuron", action="store_true", default=False, help="run neuron tests"
     )
+    parser.addoption(
+        "--gaudi", action="store_true", default=False, help="run gaudi tests"
+    )
+    parser.addoption(
+        "--gaudi-all-models",
+        action="store_true",
+        default=False,
+        help="Run tests for all models instead of just the default subset",
+    )
 
 
 def pytest_configure(config):
@@ -84,6 +97,22 @@ def pytest_collection_modifyitems(config, items):
                 item.add_marker(pytest.mark.skip(reason="need --release option to run"))
 
         selectors.append(skip_release)
+    if config.getoption("--gaudi"):
+
+        def skip_not_gaudi(item):
+            if "gaudi" not in item.keywords:
+                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+        selectors.append(skip_not_gaudi)
+    else:
+
+        def skip_gaudi(item):
+            if "gaudi" in item.keywords:
+                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+        selectors.append(skip_gaudi)
     if config.getoption("--neuron"):
 
         def skip_not_neuron(item):
@@ -100,6 +129,7 @@ def pytest_collection_modifyitems(config, items):
                 item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
 
         selectors.append(skip_neuron)
+
     for item in items:
         for selector in selectors:
            selector(item)
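For orientation, here is a minimal sketch, not part of this commit, of how a test module would pick up the new gate: the selectors above key off a `gaudi` entry in `item.keywords`, so a module-level `pytestmark` is enough to make tests run only under `--gaudi`. The fixture name and prompt below are assumptions for illustration.

```python
# Sketch only: opting a test module into the --gaudi gate added above.
import pytest

# pytest adds "gaudi" to every item's keywords in this module, so these tests
# are collected with `pytest integration-tests --gaudi` and skipped otherwise.
pytestmark = pytest.mark.gaudi


@pytest.mark.asyncio
async def test_smoke(tgi_client):
    # tgi_client is assumed to be provided by the gaudi service fixtures
    response = await tgi_client.text_generation(
        "What is Deep Learning?", max_new_tokens=4, details=True
    )
    assert response.details.generated_tokens == 4
```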

View File

@@ -14,15 +14,21 @@ import docker
 import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
-from loguru import logger
-from test_model import TEST_CONFIGS
-from text_generation import AsyncClient
-from text_generation.types import Response
+import logging
+from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
+import huggingface_hub
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
 
 # Use the latest image from the local docker build
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
-HF_TOKEN = os.getenv("HF_TOKEN", None)
+HF_TOKEN = huggingface_hub.get_token()
 
 assert (
     HF_TOKEN is not None
@@ -48,12 +54,6 @@ HABANA_RUN_ARGS = {
     "cap_add": ["sys_nice"],
 }
 
-logger.add(
-    sys.stderr,
-    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
-    level="INFO",
-)
-
 
 def stream_container_logs(container, test_name):
     """Stream container logs in a separate thread."""
@@ -69,9 +69,15 @@ def stream_container_logs(container, test_name):
         logger.error(f"Error streaming container logs: {str(e)}")
 
 
+class TestClient(AsyncInferenceClient):
+    def __init__(self, service_name: str, base_url: str):
+        super().__init__(model=base_url)
+        self.service_name = service_name
+
+
 class LauncherHandle:
-    def __init__(self, port: int):
-        self.client = AsyncClient(f"http://localhost:{port}", timeout=3600)
+    def __init__(self, service_name: str, port: int):
+        self.client = TestClient(service_name, f"http://localhost:{port}")
 
     def _inner_health(self):
         raise NotImplementedError
@@ -87,7 +93,7 @@ class LauncherHandle:
                 raise RuntimeError("Launcher crashed")
 
             try:
-                await self.client.generate("test")
+                await self.client.text_generation("test", max_new_tokens=1)
                 elapsed = time.time() - start_time
                 logger.info(f"Health check passed after {elapsed:.1f}s")
                 return
@@ -111,7 +117,8 @@
 class ContainerLauncherHandle(LauncherHandle):
     def __init__(self, docker_client, container_name, port: int):
-        super(ContainerLauncherHandle, self).__init__(port)
+        service_name = container_name  # Use container name as service name
+        super(ContainerLauncherHandle, self).__init__(service_name, port)
         self.docker_client = docker_client
         self.container_name = container_name
@@ -132,7 +139,8 @@
 class ProcessLauncherHandle(LauncherHandle):
     def __init__(self, process, port: int):
-        super(ProcessLauncherHandle, self).__init__(port)
+        service_name = "process"  # Use generic name for process launcher
+        super(ProcessLauncherHandle, self).__init__(service_name, port)
         self.process = process
 
     def _inner_health(self) -> bool:
@@ -151,11 +159,13 @@ def data_volume():
 @pytest.fixture(scope="module")
-def launcher(data_volume):
+def gaudi_launcher():
     @contextlib.contextmanager
     def docker_launcher(
         model_id: str,
         test_name: str,
+        tgi_args: List[str] = None,
+        env_config: dict = None,
     ):
         logger.info(
             f"Starting docker launcher for model {model_id} and test {test_name}"
@@ -183,32 +193,40 @@
             )
             container.stop()
             container.wait()
+            container.remove()
+            logger.info(f"Removed existing container {container_name}")
         except NotFound:
             pass
         except Exception as e:
             logger.error(f"Error handling existing container: {str(e)}")
 
-        model_name = next(
-            name for name, cfg in TEST_CONFIGS.items() if cfg["model_id"] == model_id
-        )
-        tgi_args = TEST_CONFIGS[model_name]["args"].copy()
+        if tgi_args is None:
+            tgi_args = []
+        else:
+            tgi_args = tgi_args.copy()
 
         env = BASE_ENV.copy()
 
         # Add model_id to env
         env["MODEL_ID"] = model_id
 
-        # Add env config that is definied in the fixture parameter
-        if "env_config" in TEST_CONFIGS[model_name]:
-            env.update(TEST_CONFIGS[model_name]["env_config"].copy())
+        # Add env config that is defined in the fixture parameter
+        if env_config is not None:
+            env.update(env_config.copy())
 
-        volumes = [f"{DOCKER_VOLUME}:/data"]
+        volumes = []
+        if DOCKER_VOLUME:
+            volumes = [f"{DOCKER_VOLUME}:/data"]
         logger.debug(f"Using volume {volumes}")
 
         try:
+            logger.debug(f"Using command {tgi_args}")
             logger.info(f"Creating container with name {container_name}")
+            logger.debug(f"Using environment {env}")
+            logger.debug(f"Using volumes {volumes}")
+            logger.debug(f"HABANA_RUN_ARGS {HABANA_RUN_ARGS}")
 
             # Log equivalent docker run command for debugging, this is not actually executed
             container = client.containers.run(
                 DOCKER_IMAGE,
@@ -271,15 +289,16 @@ def launcher(data_volume):
 @pytest.fixture(scope="module")
-def generate_load():
+def gaudi_generate_load():
     async def generate_load_inner(
-        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
-    ) -> List[Response]:
+        client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
+    ) -> List[TextGenerationOutput]:
         try:
             futures = [
-                client.generate(
+                client.text_generation(
                     prompt,
                     max_new_tokens=max_new_tokens,
+                    details=True,
                     decoder_input_details=True,
                 )
                 for _ in range(n)
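As a side note on the client swap above, here is a small standalone sketch of the `huggingface_hub.AsyncInferenceClient.text_generation` pattern the fixtures now rely on; the base URL is an assumption, and `details=True` is what exposes the `details.generated_tokens` field used in the assertions.

```python
# Standalone sketch (endpoint URL assumed): the call pattern used by the fixtures.
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main():
    client = AsyncInferenceClient(model="http://localhost:8080")  # assumed local TGI endpoint
    # Without details=True the call returns a plain string; with it, a
    # TextGenerationOutput carrying generated_text plus per-request details.
    out = await client.text_generation("test", max_new_tokens=4, details=True)
    print(out.generated_text, out.details.generated_tokens)


asyncio.run(main())
```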

View File

@@ -3,7 +3,7 @@ import os
 from typing import Dict, Any, Generator
 
 import pytest
-from test_model import TEST_CONFIGS
+from test_gaudi_generate import TEST_CONFIGS
 
 UNKNOWN_CONFIGS = {
     name: config

View File

@@ -1,10 +1,16 @@
-from typing import Any, Dict
+from typing import Any, Dict, Generator
 
-from text_generation import AsyncClient
+from _pytest.fixtures import SubRequest
+from huggingface_hub import AsyncInferenceClient
 import pytest
-from Levenshtein import distance as levenshtein_distance
 
-# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers", "gaudi_all_models: mark test to run with all models"
+    )
+
+
+# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
 TEST_CONFIGS = {
     "meta-llama/Llama-3.1-8B-Instruct-shared": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -25,6 +31,7 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
+        "run_by_default": True,
     },
     "meta-llama/Llama-3.1-8B-Instruct": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
@@ -42,6 +49,7 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
+        "run_by_default": True,
     },
     "meta-llama/Llama-2-7b-chat-hf": {
        "model_id": "meta-llama/Llama-2-7b-chat-hf",
@@ -181,72 +189,98 @@ TEST_CONFIGS = {
     },
 }
 
-print(f"Testing {len(TEST_CONFIGS)} models")
+
+def pytest_generate_tests(metafunc):
+    if "test_config" in metafunc.fixturenames:
+        if metafunc.config.getoption("--gaudi-all-models"):
+            models = list(TEST_CONFIGS.keys())
+        else:
+            models = [
+                name
+                for name, config in TEST_CONFIGS.items()
+                if config.get("run_by_default", False)
+            ]
+        print(f"Testing {len(models)} models")
+        metafunc.parametrize("test_config", models, indirect=True)
 
 
-@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
-def test_config(request) -> Dict[str, Any]:
+@pytest.fixture(scope="module")
+def test_config(request: SubRequest) -> Dict[str, Any]:
     """Fixture that provides model configurations for testing."""
-    test_config = TEST_CONFIGS[request.param]
-    test_config["test_name"] = request.param
+    model_name = request.param
+    test_config = TEST_CONFIGS[model_name]
+    test_config["test_name"] = model_name
     return test_config
 
 
 @pytest.fixture(scope="module")
-def model_id(test_config):
+def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]:
     yield test_config["model_id"]
 
 
 @pytest.fixture(scope="module")
-def test_name(test_config):
+def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]:
     yield test_config["test_name"]
 
 
 @pytest.fixture(scope="module")
-def expected_outputs(test_config):
+def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, str]:
     return {
         "greedy": test_config["expected_greedy_output"],
-        # "sampling": model_config["expected_sampling_output"],
         "batch": test_config["expected_batch_output"],
     }
 
 
 @pytest.fixture(scope="module")
-def input(test_config):
+def input(test_config: Dict[str, Any]) -> str:
     return test_config["input"]
 
 
 @pytest.fixture(scope="module")
-def tgi_service(launcher, model_id, test_name):
-    with launcher(model_id, test_name) as tgi_service:
+def tgi_service(
+    gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]
+):
+    with gaudi_launcher(
+        model_id,
+        test_name,
+        tgi_args=test_config.get("args", []),
+        env_config=test_config.get("env_config", {}),
+    ) as tgi_service:
         yield tgi_service
 
 
 @pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
+async def tgi_client(tgi_service) -> AsyncInferenceClient:
     await tgi_service.health(1000)
     return tgi_service.client
 
 
 @pytest.mark.asyncio
+@pytest.mark.all_models
 async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
+    tgi_client: AsyncInferenceClient, expected_outputs: Dict[str, str], input: str
 ):
     # Bounded greedy decoding without input
-    response = await tgi_client.generate(
+    response = await tgi_client.text_generation(
         input,
         max_new_tokens=32,
+        details=True,
         decoder_input_details=True,
     )
 
     assert response.details.generated_tokens == 32
     assert response.generated_text == expected_outputs["greedy"]
 
 
 @pytest.mark.asyncio
+@pytest.mark.all_models
 async def test_model_multiple_requests(
-    tgi_client, generate_load, expected_outputs, input
+    tgi_client: AsyncInferenceClient,
+    gaudi_generate_load,
+    expected_outputs: Dict[str, str],
+    input: str,
 ):
     num_requests = 4
-    responses = await generate_load(
+    responses = await gaudi_generate_load(
         tgi_client,
         input,
         max_new_tokens=32,
@@ -257,6 +291,4 @@ async def test_model_multiple_requests(
     expected = expected_outputs["batch"]
     for r in responses:
         assert r.details.generated_tokens == 32
-        # Compute the similarity with the expectation using the levenshtein distance
-        # We should not have more than two substitutions or additions
-        assert levenshtein_distance(r.generated_text, expected) < 3
+        assert r.generated_text == expected
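To make the selection rule concrete, here is a self-contained sketch (config names invented) of what `pytest_generate_tests` effectively computes when `--gaudi-all-models` is not passed:

```python
# Illustrative only: mirrors the run_by_default filtering in pytest_generate_tests.
TEST_CONFIGS = {
    "model-a": {"model_id": "org/model-a", "run_by_default": True},  # CI subset
    "model-b": {"model_id": "org/model-b"},  # selected only with --gaudi-all-models
}

gaudi_all_models = False  # i.e. pytest was invoked without --gaudi-all-models
if gaudi_all_models:
    models = list(TEST_CONFIGS.keys())
else:
    models = [
        name for name, cfg in TEST_CONFIGS.items() if cfg.get("run_by_default", False)
    ]

print(f"Testing {len(models)} models: {models}")  # Testing 1 models: ['model-a']
```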