diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 99f29d7eb..1fbe6698f 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -129,9 +129,9 @@ jobs: export label_extension="-gaudi" export docker_volume="/mnt/cache" export docker_devices="" - export runs_on="ubuntu-latest" + export runs_on="aws-dl1-24xlarge" export platform="" - export extra_pytest="" + export extra_pytest="--gaudi" export target="" esac echo $dockerfile diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile index f760f4d6e..cf739cf57 100644 --- a/backends/gaudi/Makefile +++ b/backends/gaudi/Makefile @@ -50,10 +50,9 @@ local-dev-install: install-dependencies # In order to run the integration tests, you need to first build the image (make -C backends/gaudi image) run-integration-tests: - uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt DOCKER_VOLUME=${root_dir}/data \ HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \ - uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests + pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests capture-expected-outputs-for-integration-tests: diff --git a/backends/gaudi/server/integration-tests/pytest.ini b/backends/gaudi/server/integration-tests/pytest.ini deleted file mode 100644 index 2f4c80e30..000000000 --- a/backends/gaudi/server/integration-tests/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -asyncio_mode = auto diff --git a/backends/gaudi/server/integration-tests/requirements.txt b/backends/gaudi/server/integration-tests/requirements.txt deleted file mode 100644 index b67d2d8cc..000000000 --- a/backends/gaudi/server/integration-tests/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -pytest >= 8.3.5 -pytest-asyncio >= 0.26.0 -docker >= 7.1.0 -Levenshtein >= 0.27.1 -loguru >= 0.7.3 -aiohttp >= 3.11.14 -text-generation diff --git a/backends/gaudi/server/integration-tests/test_model.py b/backends/gaudi/server/integration-tests/test_model.py deleted file mode 100644 index cb2bf6a9f..000000000 --- a/backends/gaudi/server/integration-tests/test_model.py +++ /dev/null @@ -1,276 +0,0 @@ -from typing import Any, Dict - -from text_generation import AsyncClient -import pytest -from Levenshtein import distance as levenshtein_distance - -# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures -TEST_CONFIGS = { - "meta-llama/Llama-3.1-8B-Instruct-shared": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "input": "What is Deep Learning?", - "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use", - "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use", - "args": [ - "--sharded", - "true", - "--num-shard", - "8", - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "8", - "--max-batch-prefill-tokens", - "2048", - ], - }, - "meta-llama/Llama-3.1-8B-Instruct": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "input": "What is Deep Learning?", - "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. 
It is a type of", - "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of", - "env_config": {}, - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - "--max-batch-prefill-tokens", - "2048", - ], - }, - "meta-llama/Llama-2-7b-chat-hf": { - "model_id": "meta-llama/Llama-2-7b-chat-hf", - "input": "What is Deep Learning?", - "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific", - "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - "--max-batch-prefill-tokens", - "2048", - ], - }, - "mistralai/Mistral-7B-Instruct-v0.3": { - "model_id": "mistralai/Mistral-7B-Instruct-v0.3", - "input": "What is Deep Learning?", - "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured", - "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - "--max-batch-prefill-tokens", - "2048", - ], - }, - "bigcode/starcoder2-3b": { - "model_id": "bigcode/starcoder2-3b", - "input": "What is Deep Learning?", - "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that", - "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - "--max-batch-prefill-tokens", - "2048", - ], - }, - "google/gemma-7b-it": { - "model_id": "google/gemma-7b-it", - "input": "What is Deep Learning?", - "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of", - "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - "--max-batch-prefill-tokens", - "2048", - ], - }, - "Qwen/Qwen2-0.5B-Instruct": { - "model_id": "Qwen/Qwen2-0.5B-Instruct", - "input": "What is Deep Learning?", - "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. 
It is a type of machine learning that is used to train models", - "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - "--max-batch-prefill-tokens", - "2048", - ], - }, - "tiiuae/falcon-7b-instruct": { - "model_id": "tiiuae/falcon-7b-instruct", - "input": "What is Deep Learning?", - "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a", - "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - ], - }, - "microsoft/phi-1_5": { - "model_id": "microsoft/phi-1_5", - "input": "What is Deep Learning?", - "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large", - "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - ], - }, - "openai-community/gpt2": { - "model_id": "openai-community/gpt2", - "input": "What is Deep Learning?", - "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a", - "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - ], - }, - "facebook/opt-125m": { - "model_id": "facebook/opt-125m", - "input": "What is Deep Learning?", - "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout", - "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - ], - }, - "EleutherAI/gpt-j-6b": { - "model_id": "EleutherAI/gpt-j-6b", - "input": "What is Deep Learning?", - "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by", - "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. 
Neural networks are a type of artificial intelligence that is inspired by", - "args": [ - "--max-input-tokens", - "512", - "--max-total-tokens", - "1024", - "--max-batch-size", - "4", - ], - }, -} - -print(f"Testing {len(TEST_CONFIGS)} models") - - -@pytest.fixture(scope="module", params=TEST_CONFIGS.keys()) -def test_config(request) -> Dict[str, Any]: - """Fixture that provides model configurations for testing.""" - test_config = TEST_CONFIGS[request.param] - test_config["test_name"] = request.param - return test_config - - -@pytest.fixture(scope="module") -def model_id(test_config): - yield test_config["model_id"] - - -@pytest.fixture(scope="module") -def test_name(test_config): - yield test_config["test_name"] - - -@pytest.fixture(scope="module") -def expected_outputs(test_config): - return { - "greedy": test_config["expected_greedy_output"], - # "sampling": model_config["expected_sampling_output"], - "batch": test_config["expected_batch_output"], - } - - -@pytest.fixture(scope="module") -def input(test_config): - return test_config["input"] - - -@pytest.fixture(scope="module") -def tgi_service(launcher, model_id, test_name): - with launcher(model_id, test_name) as tgi_service: - yield tgi_service - - -@pytest.fixture(scope="module") -async def tgi_client(tgi_service) -> AsyncClient: - await tgi_service.health(1000) - return tgi_service.client - - -@pytest.mark.asyncio -async def test_model_single_request( - tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str -): - # Bounded greedy decoding without input - response = await tgi_client.generate( - input, - max_new_tokens=32, - ) - assert response.details.generated_tokens == 32 - assert response.generated_text == expected_outputs["greedy"] - - -@pytest.mark.asyncio -async def test_model_multiple_requests( - tgi_client, generate_load, expected_outputs, input -): - num_requests = 4 - responses = await generate_load( - tgi_client, - input, - max_new_tokens=32, - n=num_requests, - ) - - assert len(responses) == 4 - expected = expected_outputs["batch"] - for r in responses: - assert r.details.generated_tokens == 32 - # Compute the similarity with the expectation using the levenshtein distance - # We should not have more than two substitutions or additions - assert levenshtein_distance(r.generated_text, expected) < 3 diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index f78524414..594ffd495 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -1,4 +1,8 @@ -pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"] +pytest_plugins = [ + "fixtures.neuron.service", + "fixtures.neuron.export_models", + "fixtures.gaudi.service", +] # ruff: noqa: E402 from _pytest.fixtures import SubRequest from huggingface_hub.inference._generated.types.chat_completion import ( @@ -47,7 +51,6 @@ from text_generation.types import ( ChatComplete, ChatCompletionChunk, ChatCompletionComplete, - Completion, Details, Grammar, InputToken, @@ -68,6 +71,9 @@ def pytest_addoption(parser): parser.addoption( "--neuron", action="store_true", default=False, help="run neuron tests" ) + parser.addoption( + "--gaudi", action="store_true", default=False, help="run gaudi tests" + ) def pytest_configure(config): @@ -84,6 +90,22 @@ def pytest_collection_modifyitems(config, items): item.add_marker(pytest.mark.skip(reason="need --release option to run")) selectors.append(skip_release) + + if config.getoption("--gaudi"): + + def skip_not_gaudi(item): + if "gaudi" not in item.keywords: + 
item.add_marker(pytest.mark.skip(reason="requires --gaudi to run")) + + selectors.append(skip_not_gaudi) + else: + + def skip_gaudi(item): + if "gaudi" in item.keywords: + item.add_marker(pytest.mark.skip(reason="requires --gaudi to run")) + + selectors.append(skip_gaudi) + if config.getoption("--neuron"): def skip_not_neuron(item): @@ -100,6 +122,7 @@ def pytest_collection_modifyitems(config, items): item.add_marker(pytest.mark.skip(reason="requires --neuron to run")) selectors.append(skip_neuron) + for item in items: for selector in selectors: selector(item) @@ -131,7 +154,6 @@ class ResponseComparator(JSONSnapshotExtension): or isinstance(data, ChatComplete) or isinstance(data, ChatCompletionChunk) or isinstance(data, ChatCompletionComplete) - or isinstance(data, Completion) or isinstance(data, OAIChatCompletionChunk) or isinstance(data, OAICompletion) ): @@ -188,8 +210,6 @@ class ResponseComparator(JSONSnapshotExtension): if isinstance(choices, List) and len(choices) >= 1: if "delta" in choices[0]: return ChatCompletionChunk(**data) - if "text" in choices[0]: - return Completion(**data) return ChatComplete(**data) else: return Response(**data) @@ -282,9 +302,6 @@ class ResponseComparator(JSONSnapshotExtension): ) ) - def eq_completion(response: Completion, other: Completion) -> bool: - return response.choices[0].text == other.choices[0].text - def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool: return ( response.choices[0].message.content == other.choices[0].message.content @@ -329,11 +346,6 @@ class ResponseComparator(JSONSnapshotExtension): if len(serialized_data) == 0: return len(snapshot_data) == len(serialized_data) - if isinstance(serialized_data[0], Completion): - return len(snapshot_data) == len(serialized_data) and all( - [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)] - ) - if isinstance(serialized_data[0], ChatComplete): return len(snapshot_data) == len(serialized_data) and all( [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)] diff --git a/backends/gaudi/server/integration-tests/conftest.py b/integration-tests/fixtures/gaudi/service.py similarity index 97% rename from backends/gaudi/server/integration-tests/conftest.py rename to integration-tests/fixtures/gaudi/service.py index c7daf70e0..44c7f9993 100644 --- a/backends/gaudi/server/integration-tests/conftest.py +++ b/integration-tests/fixtures/gaudi/service.py @@ -14,15 +14,23 @@ import docker import pytest from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound -from loguru import logger -from test_model import TEST_CONFIGS +import logging +from gaudi.test_gaudi_generate import TEST_CONFIGS from text_generation import AsyncClient from text_generation.types import Response +import huggingface_hub + +logging.basicConfig( + level=logging.INFO, + format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", + stream=sys.stdout, +) +logger = logging.getLogger(__file__) # Use the latest image from the local docker build DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi") DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None) -HF_TOKEN = os.getenv("HF_TOKEN", None) +HF_TOKEN = huggingface_hub.get_token() assert ( HF_TOKEN is not None @@ -48,12 +56,6 @@ HABANA_RUN_ARGS = { "cap_add": ["sys_nice"], } -logger.add( - sys.stderr, - format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}", - level="INFO", -) - def stream_container_logs(container, 
test_name): """Stream container logs in a separate thread.""" @@ -151,7 +153,7 @@ def data_volume(): @pytest.fixture(scope="module") -def launcher(data_volume): +def gaudi_launcher(event_loop): @contextlib.contextmanager def docker_launcher( model_id: str, @@ -271,7 +273,7 @@ def launcher(data_volume): @pytest.fixture(scope="module") -def generate_load(): +def gaudi_generate_load(): async def generate_load_inner( client: AsyncClient, prompt: str, max_new_tokens: int, n: int ) -> List[Response]: diff --git a/backends/gaudi/server/integration-tests/capture_expected_outputs.py b/integration-tests/gaudi/capture_expected_outputs.py similarity index 98% rename from backends/gaudi/server/integration-tests/capture_expected_outputs.py rename to integration-tests/gaudi/capture_expected_outputs.py index 051b9d698..6a5d4a685 100644 --- a/backends/gaudi/server/integration-tests/capture_expected_outputs.py +++ b/integration-tests/gaudi/capture_expected_outputs.py @@ -3,7 +3,7 @@ import os from typing import Dict, Any, Generator import pytest -from test_model import TEST_CONFIGS +from test_generate import TEST_CONFIGS UNKNOWN_CONFIGS = { name: config diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py new file mode 100644 index 000000000..184cbf156 --- /dev/null +++ b/integration-tests/gaudi/test_gaudi_generate.py @@ -0,0 +1,273 @@ +from typing import Any, Dict + +from text_generation import AsyncClient +import pytest + +# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures. +TEST_CONFIGS = { + # "meta-llama/Llama-3.1-8B-Instruct-shared": { + # "model_id": "meta-llama/Llama-3.1-8B-Instruct", + # "input": "What is Deep Learning?", + # "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of", + # "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of", + # "args": [ + # "--sharded", + # "true", + # "--num-shard", + # "8", + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "8", + # "--max-batch-prefill-tokens", + # "2048", + # ], + # }, + "meta-llama/Llama-3.1-8B-Instruct": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "input": "What is Deep Learning?", + "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of", + "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. 
It is a type of", + "env_config": {}, + "args": [ + "--max-input-tokens", + "512", + "--max-total-tokens", + "1024", + "--max-batch-size", + "4", + "--max-batch-prefill-tokens", + "2048", + ], + }, + # "meta-llama/Llama-2-7b-chat-hf": { + # "model_id": "meta-llama/Llama-2-7b-chat-hf", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific", + # "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # "--max-batch-prefill-tokens", + # "2048", + # ], + # }, + # "mistralai/Mistral-7B-Instruct-v0.3": { + # "model_id": "mistralai/Mistral-7B-Instruct-v0.3", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured", + # "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # "--max-batch-prefill-tokens", + # "2048", + # ], + # }, + # "bigcode/starcoder2-3b": { + # "model_id": "bigcode/starcoder2-3b", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that", + # "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # "--max-batch-prefill-tokens", + # "2048", + # ], + # }, + # "google/gemma-7b-it": { + # "model_id": "google/gemma-7b-it", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of", + # "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # "--max-batch-prefill-tokens", + # "2048", + # ], + # }, + # "Qwen/Qwen2-0.5B-Instruct": { + # "model_id": "Qwen/Qwen2-0.5B-Instruct", + # "input": "What is Deep Learning?", + # "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models", + # "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. 
It is a type of machine learning that is used to train models", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # "--max-batch-prefill-tokens", + # "2048", + # ], + # }, + # "tiiuae/falcon-7b-instruct": { + # "model_id": "tiiuae/falcon-7b-instruct", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a", + # "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # ], + # }, + # "microsoft/phi-1_5": { + # "model_id": "microsoft/phi-1_5", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large", + # "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # ], + # }, + # "openai-community/gpt2": { + # "model_id": "openai-community/gpt2", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a", + # "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # ], + # }, + # "facebook/opt-125m": { + # "model_id": "facebook/opt-125m", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout", + # "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # ], + # }, + # "EleutherAI/gpt-j-6b": { + # "model_id": "EleutherAI/gpt-j-6b", + # "input": "What is Deep Learning?", + # "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by", + # "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. 
Neural networks are a type of artificial intelligence that is inspired by", + # "args": [ + # "--max-input-tokens", + # "512", + # "--max-total-tokens", + # "1024", + # "--max-batch-size", + # "4", + # ], + # }, +} + +print(f"Testing {len(TEST_CONFIGS)} models") + + +@pytest.fixture(scope="module", params=TEST_CONFIGS.keys()) +def test_config(request) -> Dict[str, Any]: + """Fixture that provides model configurations for testing.""" + test_config = TEST_CONFIGS[request.param] + test_config["test_name"] = request.param + return test_config + + +@pytest.fixture(scope="module") +def model_id(test_config): + yield test_config["model_id"] + + +@pytest.fixture(scope="module") +def test_name(test_config): + yield test_config["test_name"] + + +@pytest.fixture(scope="module") +def expected_outputs(test_config): + return { + "greedy": test_config["expected_greedy_output"], + # "sampling": model_config["expected_sampling_output"], + "batch": test_config["expected_batch_output"], + } + + +@pytest.fixture(scope="module") +def input(test_config): + return test_config["input"] + + +@pytest.fixture(scope="module") +def tgi_service(gaudi_launcher, model_id, test_name): + with gaudi_launcher(model_id, test_name) as tgi_service: + yield tgi_service + + +@pytest.fixture(scope="module") +async def tgi_client(tgi_service) -> AsyncClient: + await tgi_service.health(1000) + return tgi_service.client + + +@pytest.mark.asyncio +async def test_model_single_request( + tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str +): + # Bounded greedy decoding without input + response = await tgi_client.generate( + input, + max_new_tokens=32, + ) + assert response.details.generated_tokens == 32 + assert response.generated_text == expected_outputs["greedy"] + + +@pytest.mark.asyncio +async def test_model_multiple_requests( + tgi_client, gaudi_generate_load, expected_outputs, input +): + num_requests = 4 + responses = await gaudi_generate_load( + tgi_client, + input, + max_new_tokens=32, + n=num_requests, + ) + + assert len(responses) == 4 + expected = expected_outputs["batch"] + for r in responses: + assert r.details.generated_tokens == 32 + assert r.generated_text == expected
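
Note: with this patch the Gaudi suite is driven by the shared integration-tests/ harness and selected via the new --gaudi pytest option, instead of the removed backends/gaudi/server/integration-tests runner. A minimal local invocation, sketched from the updated Makefile target above (the data volume path and cached Hugging Face token location are the defaults the fixtures expect, not new requirements introduced here):

    # build the Gaudi image once, as before (see the Makefile comment)
    make -C backends/gaudi image

    # run the relocated tests; this mirrors `make -C backends/gaudi run-integration-tests`
    DOCKER_VOLUME=$PWD/data \
    HF_TOKEN=$(cat ~/.cache/huggingface/token) \
    pytest --durations=0 -s -vv integration-tests --gaudi

In CI the same selection happens through the build workflow change, which switches the Gaudi job to the aws-dl1-24xlarge runner and passes --gaudi via extra_pytest.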