From 0295bf243f38341185b597e7c613d6283f0d7c5d Mon Sep 17 00:00:00 2001
From: baptiste
Date: Mon, 23 Jun 2025 12:10:14 +0000
Subject: [PATCH] fix broken test

---
 backends/gaudi/Makefile                     |   2 -
 backends/gaudi/README.md                    |   5 +
 integration-tests/fixtures/gaudi/service.py |  49 ++--
 .../gaudi/test_gaudi_generate.py            |  22 +-
 integration-tests/gaudi/test_model.py       | 259 ------------------
 5 files changed, 52 insertions(+), 285 deletions(-)
 delete mode 100644 integration-tests/gaudi/test_model.py

diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile
index 2eb5506f..40d17f61 100644
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@@ -50,8 +50,6 @@ local-dev-install: install-dependencies
 
 # In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
 run-integration-tests:
-	pip install -U pip uv
-	uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
 	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
diff --git a/backends/gaudi/README.md b/backends/gaudi/README.md
index fa68c0a9..7713040f 100644
--- a/backends/gaudi/README.md
+++ b/backends/gaudi/README.md
@@ -99,6 +99,11 @@ curl 127.0.0.1:8080/generate \
 
 ### Integration tests
 
+Install the dependencies:
+```bash
+pip install -r integration-tests/requirements.txt
+```
+
 To run the integration tests, you need to first build the image:
 ```bash
 make -C backends/gaudi image
diff --git a/integration-tests/fixtures/gaudi/service.py b/integration-tests/fixtures/gaudi/service.py
index b6942dbe..5c7d729b 100644
--- a/integration-tests/fixtures/gaudi/service.py
+++ b/integration-tests/fixtures/gaudi/service.py
@@ -16,8 +16,7 @@ from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
 import logging
 from gaudi.test_gaudi_generate import TEST_CONFIGS
-from text_generation import AsyncClient
-from text_generation.types import Response
+from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
 import huggingface_hub
 
 logging.basicConfig(
@@ -71,9 +70,15 @@ def stream_container_logs(container, test_name):
         logger.error(f"Error streaming container logs: {str(e)}")
 
 
+class TestClient(AsyncInferenceClient):
+    def __init__(self, service_name: str, base_url: str):
+        super().__init__(model=base_url)
+        self.service_name = service_name
+
+
 class LauncherHandle:
-    def __init__(self, port: int):
-        self.client = AsyncClient(f"http://localhost:{port}", timeout=3600)
+    def __init__(self, service_name: str, port: int):
+        self.client = TestClient(service_name, f"http://localhost:{port}")
 
     def _inner_health(self):
         raise NotImplementedError
@@ -89,7 +94,7 @@ class LauncherHandle:
                 raise RuntimeError("Launcher crashed")
 
             try:
-                await self.client.generate("test")
+                await self.client.text_generation("test", max_new_tokens=1)
                 elapsed = time.time() - start_time
                 logger.info(f"Health check passed after {elapsed:.1f}s")
                 return
@@ -113,7 +118,8 @@ class LauncherHandle:
 
 class ContainerLauncherHandle(LauncherHandle):
     def __init__(self, docker_client, container_name, port: int):
-        super(ContainerLauncherHandle, self).__init__(port)
+        service_name = container_name  # Use container name as service name
+        super(ContainerLauncherHandle, self).__init__(service_name, port)
         self.docker_client = docker_client
         self.container_name = container_name
@@ -134,7 +140,8 @@ class ProcessLauncherHandle(LauncherHandle):
     def __init__(self, process, port: int):
-        super(ProcessLauncherHandle, self).__init__(port)
+        service_name = "process"  # Use generic name for process launcher
+        super(ProcessLauncherHandle, self).__init__(service_name, port)
         self.process = process
 
     def _inner_health(self) -> bool:
@@ -153,11 +160,13 @@ def data_volume():
 
 
 @pytest.fixture(scope="module")
-def gaudi_launcher(event_loop):
+def gaudi_launcher():
     @contextlib.contextmanager
     def docker_launcher(
         model_id: str,
         test_name: str,
+        tgi_args: List[str] = None,
+        env_config: dict = None
     ):
         logger.info(
             f"Starting docker launcher for model {model_id} and test {test_name}"
         )
@@ -185,23 +194,30 @@ def gaudi_launcher(event_loop):
             )
             container.stop()
             container.wait()
+            container.remove()
+            logger.info(f"Removed existing container {container_name}")
         except NotFound:
             pass
         except Exception as e:
             logger.error(f"Error handling existing container: {str(e)}")
 
-        tgi_args = TEST_CONFIGS[test_name]["args"].copy()
+        if tgi_args is None:
+            tgi_args = []
+        else:
+            tgi_args = tgi_args.copy()
 
         env = BASE_ENV.copy()
 
         # Add model_id to env
         env["MODEL_ID"] = model_id
 
-        # Add env config that is definied in the fixture parameter
-        if "env_config" in TEST_CONFIGS[test_name]:
-            env.update(TEST_CONFIGS[test_name]["env_config"].copy())
+        # Add env config that is defined in the fixture parameter
+        if env_config is not None:
+            env.update(env_config.copy())
 
-        volumes = [f"{DOCKER_VOLUME}:/data"]
+        volumes = []
+        if DOCKER_VOLUME:
+            volumes = [f"{DOCKER_VOLUME}:/data"]
         logger.debug(f"Using volume {volumes}")
 
         try:
@@ -276,13 +292,14 @@ def gaudi_launcher(event_loop):
 @pytest.fixture(scope="module")
 def gaudi_generate_load():
     async def generate_load_inner(
-        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
-    ) -> List[Response]:
+        client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
+    ) -> List[TextGenerationOutput]:
         try:
             futures = [
-                client.generate(
+                client.text_generation(
                     prompt,
                     max_new_tokens=max_new_tokens,
+                    details=True,
                     decoder_input_details=True,
                 )
                 for _ in range(n)
diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index c2d768d0..f5d71ab7 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -1,7 +1,6 @@
 from typing import Any, Dict, Generator
 from _pytest.fixtures import SubRequest
-
-from text_generation import AsyncClient
+from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
 import pytest
@@ -238,13 +237,18 @@ def input(test_config: Dict[str, Any]) -> str:
 
 
 @pytest.fixture(scope="module")
-def tgi_service(gaudi_launcher, model_id: str, test_name: str):
-    with gaudi_launcher(model_id, test_name) as tgi_service:
+def tgi_service(gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]):
+    with gaudi_launcher(
+        model_id,
+        test_name,
+        tgi_args=test_config.get("args", []),
+        env_config=test_config.get("env_config", {})
+    ) as tgi_service:
         yield tgi_service
 
 
 @pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
+async def tgi_client(tgi_service) -> AsyncInferenceClient:
     await tgi_service.health(1000)
     return tgi_service.client
@@ -252,12 +256,14 @@ async def tgi_client(tgi_service) -> AsyncInferenceClient:
 @pytest.mark.asyncio
 @pytest.mark.all_models
 async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, str], input: str
+    tgi_client: AsyncInferenceClient, expected_outputs: Dict[str, str], input: str
 ):
     # Bounded greedy decoding without input
-    response = await tgi_client.generate(
+    response = await tgi_client.text_generation(
         input,
         max_new_tokens=32,
+        details=True,
+        decoder_input_details=True,
     )
     assert response.details.generated_tokens == 32
     assert response.generated_text == expected_outputs["greedy"]
@@ -266,7 +272,7 @@ async def test_model_single_request(
 @pytest.mark.asyncio
 @pytest.mark.all_models
 async def test_model_multiple_requests(
-    tgi_client: AsyncClient,
+    tgi_client: AsyncInferenceClient,
     gaudi_generate_load,
     expected_outputs: Dict[str, str],
     input: str,
diff --git a/integration-tests/gaudi/test_model.py b/integration-tests/gaudi/test_model.py
deleted file mode 100644
index 407bccc2..00000000
--- a/integration-tests/gaudi/test_model.py
+++ /dev/null
@@ -1,259 +0,0 @@
-from typing import Any, Dict
-
-from text_generation import AsyncClient
-import pytest
-
-# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
-TEST_CONFIGS = {
-    "meta-llama/Llama-3.1-8B-Instruct-shared": {
-        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "args": [
-            "--sharded",
-            "true",
-            "--num-shard",
-            "8",
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "8",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "meta-llama/Llama-3.1-8B-Instruct": {
-        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "env_config": {},
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "meta-llama/Llama-2-7b-chat-hf": {
-        "model_id": "meta-llama/Llama-2-7b-chat-hf",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "mistralai/Mistral-7B-Instruct-v0.3": {
-        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "bigcode/starcoder2-3b": {
-        "model_id": "bigcode/starcoder2-3b",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "google/gemma-7b-it": {
-        "model_id": "google/gemma-7b-it",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "Qwen/Qwen2-0.5B-Instruct": {
-        "model_id": "Qwen/Qwen2-0.5B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "tiiuae/falcon-7b-instruct": {
-        "model_id": "tiiuae/falcon-7b-instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "microsoft/phi-1_5": {
-        "model_id": "microsoft/phi-1_5",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "openai-community/gpt2": {
-        "model_id": "openai-community/gpt2",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "EleutherAI/gpt-j-6b": {
-        "model_id": "EleutherAI/gpt-j-6b",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-}
-
-print(f"Testing {len(TEST_CONFIGS)} models")
-
-
-@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
-def test_config(request) -> Dict[str, Any]:
-    """Fixture that provides model configurations for testing."""
-    test_config = TEST_CONFIGS[request.param]
-    test_config["test_name"] = request.param
-    return test_config
-
-
-@pytest.fixture(scope="module")
-def model_id(test_config):
-    yield test_config["model_id"]
-
-
-@pytest.fixture(scope="module")
-def test_name(test_config):
-    yield test_config["test_name"]
-
-
-@pytest.fixture(scope="module")
-def expected_outputs(test_config):
-    return {
-        "greedy": test_config["expected_greedy_output"],
-        # "sampling": model_config["expected_sampling_output"],
-        "batch": test_config["expected_batch_output"],
-    }
-
-
-@pytest.fixture(scope="module")
-def input(test_config):
-    return test_config["input"]
-
-
-@pytest.fixture(scope="module")
-def tgi_service(launcher, model_id, test_name):
-    with launcher(model_id, test_name) as tgi_service:
-        yield tgi_service
-
-
-@pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
-    await tgi_service.health(1000)
-    return tgi_service.client
-
-
-@pytest.mark.asyncio
-async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
-):
-    # Bounded greedy decoding without input
-    response = await tgi_client.generate(
-        input,
-        max_new_tokens=32,
-    )
-    assert response.details.generated_tokens == 32
-    assert response.generated_text == expected_outputs["greedy"]
-
-
-@pytest.mark.asyncio
-async def test_model_multiple_requests(
-    tgi_client, generate_load, expected_outputs, input
-):
-    num_requests = 4
-    responses = await generate_load(
-        tgi_client,
-        input,
-        max_new_tokens=32,
-        n=num_requests,
-    )
-
-    assert len(responses) == 4
-    expected = expected_outputs["batch"]
-    for r in responses:
-        assert r.details.generated_tokens == 32
-        assert r.generated_text == expected
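Note for reviewers: the patch replaces `text_generation.AsyncClient` with `huggingface_hub.AsyncInferenceClient` throughout the fixtures and tests. Below is a minimal sketch of the new call pattern for anyone updating other Gaudi tests. The endpoint URL is illustrative; `details=True` is what makes `response.details.generated_tokens` available, since `text_generation` otherwise returns a bare string.

```python
import asyncio

from huggingface_hub import AsyncInferenceClient


async def main():
    # Point the client at a running TGI endpoint (URL is illustrative).
    client = AsyncInferenceClient(model="http://localhost:8080")

    # details=True returns a TextGenerationOutput with token-level metadata
    # instead of a plain string; decoder_input_details additionally reports
    # the prompt tokens, mirroring what the updated tests request.
    response = await client.text_generation(
        "What is Deep Learning?",
        max_new_tokens=32,
        details=True,
        decoder_input_details=True,
    )
    print(response.generated_text)
    print(response.details.generated_tokens)


asyncio.run(main())
```

The `TestClient` subclass introduced in service.py only tags the client with a `service_name` attribute, so the sketch above covers the full client surface the tests exercise.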