Merge dd187d2350 into 4645678ff0

2025-10-20 04:15:23 +00:00 · 2025-04-15 13:23:06 +02:00 · 2025-04-15 13:23:06 +02:00 · 26bc3c3097
commit 26bc3c3097
parent 4645678ff0 dd187d2350
9 changed files with 315 additions and 314 deletions
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@ -129,9 +129,9 @@ jobs:
                export label_extension="-gaudi"
                export docker_volume="/mnt/cache"
                export docker_devices=""
-                export runs_on="ubuntu-latest"
+                export runs_on="aws-dl1-24xlarge"
                export platform=""
-                export extra_pytest=""
+                export extra_pytest="--gaudi"
                export target=""
          esac
          echo $dockerfile
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@ -50,10 +50,9 @@ local-dev-install: install-dependencies

 # In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
 run-integration-tests:
-	uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
-	uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
+    pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi

 # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
 capture-expected-outputs-for-integration-tests:
--- a/backends/gaudi/server/integration-tests/pytest.ini
+++ b/backends/gaudi/server/integration-tests/pytest.ini
@ -1,2 +0,0 @@
-[pytest]
-asyncio_mode = auto
--- a/backends/gaudi/server/integration-tests/requirements.txt
+++ b/backends/gaudi/server/integration-tests/requirements.txt
@ -1,7 +0,0 @@
-pytest >= 8.3.5
-pytest-asyncio >= 0.26.0
-docker >= 7.1.0
-Levenshtein >= 0.27.1
-loguru >= 0.7.3
-aiohttp >= 3.11.14
-text-generation
--- a/backends/gaudi/server/integration-tests/test_model.py
+++ b/backends/gaudi/server/integration-tests/test_model.py
@ -1,276 +0,0 @@
-from typing import Any, Dict
-
-from text_generation import AsyncClient
-import pytest
-from Levenshtein import distance as levenshtein_distance
-
-# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
-TEST_CONFIGS = {
-    "meta-llama/Llama-3.1-8B-Instruct-shared": {
-        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
-        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
-        "args": [
-            "--sharded",
-            "true",
-            "--num-shard",
-            "8",
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "8",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "meta-llama/Llama-3.1-8B-Instruct": {
-        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "env_config": {},
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "meta-llama/Llama-2-7b-chat-hf": {
-        "model_id": "meta-llama/Llama-2-7b-chat-hf",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "mistralai/Mistral-7B-Instruct-v0.3": {
-        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "bigcode/starcoder2-3b": {
-        "model_id": "bigcode/starcoder2-3b",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "google/gemma-7b-it": {
-        "model_id": "google/gemma-7b-it",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "Qwen/Qwen2-0.5B-Instruct": {
-        "model_id": "Qwen/Qwen2-0.5B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "tiiuae/falcon-7b-instruct": {
-        "model_id": "tiiuae/falcon-7b-instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "microsoft/phi-1_5": {
-        "model_id": "microsoft/phi-1_5",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "openai-community/gpt2": {
-        "model_id": "openai-community/gpt2",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "facebook/opt-125m": {
-        "model_id": "facebook/opt-125m",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
-        "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "EleutherAI/gpt-j-6b": {
-        "model_id": "EleutherAI/gpt-j-6b",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-}
-
-print(f"Testing {len(TEST_CONFIGS)} models")
-
-
-@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
-def test_config(request) -> Dict[str, Any]:
-    """Fixture that provides model configurations for testing."""
-    test_config = TEST_CONFIGS[request.param]
-    test_config["test_name"] = request.param
-    return test_config
-
-
-@pytest.fixture(scope="module")
-def model_id(test_config):
-    yield test_config["model_id"]
-
-
-@pytest.fixture(scope="module")
-def test_name(test_config):
-    yield test_config["test_name"]
-
-
-@pytest.fixture(scope="module")
-def expected_outputs(test_config):
-    return {
-        "greedy": test_config["expected_greedy_output"],
-        # "sampling": model_config["expected_sampling_output"],
-        "batch": test_config["expected_batch_output"],
-    }
-
-
-@pytest.fixture(scope="module")
-def input(test_config):
-    return test_config["input"]
-
-
-@pytest.fixture(scope="module")
-def tgi_service(launcher, model_id, test_name):
-    with launcher(model_id, test_name) as tgi_service:
-        yield tgi_service
-
-
-@pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
-    await tgi_service.health(1000)
-    return tgi_service.client
-
-
-@pytest.mark.asyncio
-async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
-):
-    # Bounded greedy decoding without input
-    response = await tgi_client.generate(
-        input,
-        max_new_tokens=32,
-    )
-    assert response.details.generated_tokens == 32
-    assert response.generated_text == expected_outputs["greedy"]
-
-
-@pytest.mark.asyncio
-async def test_model_multiple_requests(
-    tgi_client, generate_load, expected_outputs, input
-):
-    num_requests = 4
-    responses = await generate_load(
-        tgi_client,
-        input,
-        max_new_tokens=32,
-        n=num_requests,
-    )
-
-    assert len(responses) == 4
-    expected = expected_outputs["batch"]
-    for r in responses:
-        assert r.details.generated_tokens == 32
-        # Compute the similarity with the expectation using the levenshtein distance
-        # We should not have more than two substitutions or additions
-        assert levenshtein_distance(r.generated_text, expected) < 3
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@ -1,4 +1,8 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
+pytest_plugins = [
+    "fixtures.neuron.service",
+    "fixtures.neuron.export_models",
+    "fixtures.gaudi.service",
+]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 from huggingface_hub.inference._generated.types.chat_completion import (
@ -47,7 +51,6 @@ from text_generation.types import (
    ChatComplete,
    ChatCompletionChunk,
    ChatCompletionComplete,
-    Completion,
    Details,
    Grammar,
    InputToken,
@ -68,6 +71,9 @@ def pytest_addoption(parser):
    parser.addoption(
        "--neuron", action="store_true", default=False, help="run neuron tests"
    )
+    parser.addoption(
+        "--gaudi", action="store_true", default=False, help="run gaudi tests"
+    )


 def pytest_configure(config):
@ -84,6 +90,22 @@ def pytest_collection_modifyitems(config, items):
                item.add_marker(pytest.mark.skip(reason="need --release option to run"))

        selectors.append(skip_release)
+
+    if config.getoption("--gaudi"):
+
+        def skip_not_gaudi(item):
+            if "gaudi" not in item.keywords:  # --gaudi selects gaudi tests only
+                item.add_marker(pytest.mark.skip(reason="incompatible with --gaudi"))
+
+        selectors.append(skip_not_gaudi)
+    else:
+
+        def skip_gaudi(item):
+            if "gaudi" in item.keywords:  # gaudi tests are opt-in via --gaudi
+                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+        selectors.append(skip_gaudi)
+
    if config.getoption("--neuron"):

        def skip_not_neuron(item):
@ -100,6 +122,7 @@ def pytest_collection_modifyitems(config, items):
                item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))

        selectors.append(skip_neuron)
+
    for item in items:
        for selector in selectors:
            selector(item)
@ -131,7 +154,6 @@ class ResponseComparator(JSONSnapshotExtension):
            or isinstance(data, ChatComplete)
            or isinstance(data, ChatCompletionChunk)
            or isinstance(data, ChatCompletionComplete)
-            or isinstance(data, Completion)
            or isinstance(data, OAIChatCompletionChunk)
            or isinstance(data, OAICompletion)
        ):
@ -188,8 +210,6 @@ class ResponseComparator(JSONSnapshotExtension):
                    if isinstance(choices, List) and len(choices) >= 1:
                        if "delta" in choices[0]:
                            return ChatCompletionChunk(**data)
-                        if "text" in choices[0]:
-                            return Completion(**data)
                    return ChatComplete(**data)
                else:
                    return Response(**data)
@ -282,9 +302,6 @@ class ResponseComparator(JSONSnapshotExtension):
                )
            )

-        def eq_completion(response: Completion, other: Completion) -> bool:
-            return response.choices[0].text == other.choices[0].text
-
        def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
            return (
                response.choices[0].message.content == other.choices[0].message.content
@ -329,11 +346,6 @@ class ResponseComparator(JSONSnapshotExtension):
        if len(serialized_data) == 0:
            return len(snapshot_data) == len(serialized_data)

-        if isinstance(serialized_data[0], Completion):
-            return len(snapshot_data) == len(serialized_data) and all(
-                [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
-            )
-
        if isinstance(serialized_data[0], ChatComplete):
            return len(snapshot_data) == len(serialized_data) and all(
                [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
--- a/backends/gaudi/server/integration-tests/conftest.py
+++ b/backends/gaudi/server/integration-tests/conftest.py
@ -14,15 +14,23 @@ import docker
 import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
-from loguru import logger
-from test_model import TEST_CONFIGS
+import logging
+from gaudi.test_gaudi_generate import TEST_CONFIGS
 from text_generation import AsyncClient
 from text_generation.types import Response
+import huggingface_hub
+
+logging.basicConfig(
+    level=logging.INFO,  # stdlib logging: the format below uses %-style fields, not loguru "{...}" markup
+    format="%(asctime)s | %(levelname)-8s | %(name)s:%(funcName)s:%(lineno)d - %(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)

 # Use the latest image from the local docker build
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
-HF_TOKEN = os.getenv("HF_TOKEN", None)
+HF_TOKEN = huggingface_hub.get_token()

 assert (
    HF_TOKEN is not None
@ -48,12 +56,6 @@ HABANA_RUN_ARGS = {
    "cap_add": ["sys_nice"],
 }

-logger.add(
-    sys.stderr,
-    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
-    level="INFO",
-)
-

 def stream_container_logs(container, test_name):
    """Stream container logs in a separate thread."""
@ -151,7 +153,7 @@ def data_volume():


@pytest.fixture(scope="module")
-def launcher(data_volume):
+def gaudi_launcher(event_loop):
    @contextlib.contextmanager
    def docker_launcher(
        model_id: str,
@ -271,7 +273,7 @@ def launcher(data_volume):


@pytest.fixture(scope="module")
-def generate_load():
+def gaudi_generate_load():
    async def generate_load_inner(
        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
    ) -> List[Response]:
--- a/backends/gaudi/server/integration-tests/capture_expected_outputs.py
+++ b/backends/gaudi/server/integration-tests/capture_expected_outputs.py
@ -3,7 +3,7 @@ import os
 from typing import Dict, Any, Generator

 import pytest
-from test_model import TEST_CONFIGS
+from test_generate import TEST_CONFIGS

 UNKNOWN_CONFIGS = {
    name: config
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@ -0,0 +1,273 @@
+from typing import Any, Dict
+
+from text_generation import AsyncClient
+import pytest
+
+# The "args" values in TEST_CONFIGS are not tuned for speed; they only verify that inference works across the different model architectures.
+TEST_CONFIGS = {
+    # "meta-llama/Llama-3.1-8B-Instruct-shared": {
+    #     "model_id": "meta-llama/Llama-3.1-8B-Instruct",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+    #     "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+    #     "args": [
+    #         "--sharded",
+    #         "true",
+    #         "--num-shard",
+    #         "8",
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "8",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    "meta-llama/Llama-3.1-8B-Instruct": {
+        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+        "env_config": {},
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    # "meta-llama/Llama-2-7b-chat-hf": {
+    #     "model_id": "meta-llama/Llama-2-7b-chat-hf",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
+    #     "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "mistralai/Mistral-7B-Instruct-v0.3": {
+    #     "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
+    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "bigcode/starcoder2-3b": {
+    #     "model_id": "bigcode/starcoder2-3b",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
+    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "google/gemma-7b-it": {
+    #     "model_id": "google/gemma-7b-it",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
+    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "Qwen/Qwen2-0.5B-Instruct": {
+    #     "model_id": "Qwen/Qwen2-0.5B-Instruct",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
+    #     "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "tiiuae/falcon-7b-instruct": {
+    #     "model_id": "tiiuae/falcon-7b-instruct",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
+    #     "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+    # "microsoft/phi-1_5": {
+    #     "model_id": "microsoft/phi-1_5",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
+    #     "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+    # "openai-community/gpt2": {
+    #     "model_id": "openai-community/gpt2",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
+    #     "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+    # "facebook/opt-125m": {
+    #     "model_id": "facebook/opt-125m",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
+    #     "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+    # "EleutherAI/gpt-j-6b": {
+    #     "model_id": "EleutherAI/gpt-j-6b",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
+    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+}
+
+print(f"Testing {len(TEST_CONFIGS)} models")
+
+
+@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
+def test_config(request) -> Dict[str, Any]:
+    """Fixture that provides one model configuration per parametrized test name."""
+    # Build a new dict so the shared TEST_CONFIGS entry is not mutated in place.
+    test_config = {**TEST_CONFIGS[request.param], "test_name": request.param}
+    return test_config
+
+
+@pytest.fixture(scope="module")
+def model_id(test_config):
+    return test_config["model_id"]  # plain value fixture: nothing to tear down
+
+
+@pytest.fixture(scope="module")
+def test_name(test_config):
+    return test_config["test_name"]  # the TEST_CONFIGS key added by test_config
+
+
+@pytest.fixture(scope="module")
+def expected_outputs(test_config):
+    # Only greedy and batch expectations are exposed; "sampling" is currently disabled.
+    greedy = test_config["expected_greedy_output"]
+    batch = test_config["expected_batch_output"]
+    outputs = {"greedy": greedy, "batch": batch}
+    return outputs
+
+
+@pytest.fixture(scope="module")
+def input(test_config):
+    return test_config["input"]  # NOTE: name shadows builtin input(); tests request this fixture by name
+
+
+@pytest.fixture(scope="module")
+def tgi_service(gaudi_launcher, model_id, test_name):
+    with gaudi_launcher(model_id, test_name) as service:  # kept open for the whole module
+        yield service
+
+
+@pytest.fixture(scope="module")
+async def tgi_client(tgi_service) -> AsyncClient:
+    await tgi_service.health(1000)  # wait for the service; 1000 is presumably a timeout/retry budget — confirm
+    return tgi_service.client
+
+
+@pytest.mark.asyncio
+async def test_model_single_request(
+    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
+):
+    # Bounded greedy decoding of the configured prompt (no sampling parameters are passed)
+    response = await tgi_client.generate(
+        input,
+        max_new_tokens=32,
+    )
+    assert response.details.generated_tokens == 32
+    assert response.generated_text == expected_outputs["greedy"]
+
+
+@pytest.mark.asyncio
+async def test_model_multiple_requests(
+    tgi_client, gaudi_generate_load, expected_outputs, input
+):
+    num_requests = 4
+    responses = await gaudi_generate_load(
+        tgi_client,
+        input,
+        max_new_tokens=32,
+        n=num_requests,
+    )
+    # One response per request; each must match the batch expectation exactly.
+    assert len(responses) == num_requests
+    expected = expected_outputs["batch"]
+    for r in responses:
+        assert r.details.generated_tokens == 32
+        assert r.generated_text == expected