From 4e40467c6d6979dff2927cc853a10ad692972824 Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Thu, 10 Apr 2025 07:46:59 +0000
Subject: [PATCH 01/19] wip(test): adding test to ci

---
 .github/workflows/build.yaml                  |  4 +--
 backends/gaudi/Makefile                       |  2 +-
 .../gaudi/server/integration-tests/pytest.ini |  2 --
 .../server/integration-tests/requirements.txt |  7 -----
 integration-tests/conftest.py                 | 28 +++++++++++--------
 .../fixtures/gaudi/service.py                 | 17 +++++------
 .../gaudi}/capture_expected_outputs.py        |  2 +-
 .../gaudi}/test_model.py                      |  5 +---
 8 files changed, 30 insertions(+), 37 deletions(-)
 delete mode 100644 backends/gaudi/server/integration-tests/pytest.ini
 delete mode 100644 backends/gaudi/server/integration-tests/requirements.txt
 rename backends/gaudi/server/integration-tests/conftest.py => integration-tests/fixtures/gaudi/service.py (98%)
 rename {backends/gaudi/server/integration-tests => integration-tests/gaudi}/capture_expected_outputs.py (98%)
 rename {backends/gaudi/server/integration-tests => integration-tests/gaudi}/test_model.py (97%)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index a87191c2..59fd66ce 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -129,9 +129,9 @@ jobs:
                 export label_extension="-gaudi"
                 export docker_volume="/mnt/cache"
                 export docker_devices=""
-                export runs_on="ubuntu-latest"
+                export runs_on="aws-dl1-24xlarge"
                 export platform=""
-                export extra_pytest=""
+                export extra_pytest="--gaudi"
                 export target=""
           esac
           echo $dockerfile
diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile
index e135f16e..7652a7d2 100644
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@@ -54,7 +54,7 @@ run-integration-tests:
 	uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
-	uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
+    pytest --durations=0 -s -vv integration-tests --gaudi
 
 # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
 capture-expected-outputs-for-integration-tests:
diff --git a/backends/gaudi/server/integration-tests/pytest.ini b/backends/gaudi/server/integration-tests/pytest.ini
deleted file mode 100644
index 2f4c80e3..00000000
--- a/backends/gaudi/server/integration-tests/pytest.ini
+++ /dev/null
@@ -1,2 +0,0 @@
-[pytest]
-asyncio_mode = auto
diff --git a/backends/gaudi/server/integration-tests/requirements.txt b/backends/gaudi/server/integration-tests/requirements.txt
deleted file mode 100644
index b67d2d8c..00000000
--- a/backends/gaudi/server/integration-tests/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-pytest >= 8.3.5
-pytest-asyncio >= 0.26.0
-docker >= 7.1.0
-Levenshtein >= 0.27.1
-loguru >= 0.7.3
-aiohttp >= 3.11.14
-text-generation
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index f7852441..b9466ae3 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -47,7 +47,6 @@ from text_generation.types import (
     ChatComplete,
     ChatCompletionChunk,
     ChatCompletionComplete,
-    Completion,
     Details,
     Grammar,
     InputToken,
@@ -68,6 +67,9 @@ def pytest_addoption(parser):
     parser.addoption(
         "--neuron", action="store_true", default=False, help="run neuron tests"
     )
+    parser.addoption(
+        "--gaudi", action="store_true", default=False, help="run gaudi tests"
+    )
 
 
 def pytest_configure(config):
@@ -84,6 +86,14 @@ def pytest_collection_modifyitems(config, items):
                 item.add_marker(pytest.mark.skip(reason="need --release option to run"))
 
         selectors.append(skip_release)
+
+    if config.getoption("--gaudi"):
+
+        def skip_not_gaudi(item):
+            if "gaudi" not in item.keywords:
+                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+        selectors.append(skip_not_gaudi)
     if config.getoption("--neuron"):
 
         def skip_not_neuron(item):
@@ -99,7 +109,12 @@ def pytest_collection_modifyitems(config, items):
             if "neuron" in item.keywords:
                 item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
 
+        def skip_gaudi(item):
+            if "gaudi" in item.keywords:
+                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
         selectors.append(skip_neuron)
+        selectors.append(skip_gaudi)
     for item in items:
         for selector in selectors:
             selector(item)
@@ -131,7 +146,6 @@ class ResponseComparator(JSONSnapshotExtension):
             or isinstance(data, ChatComplete)
             or isinstance(data, ChatCompletionChunk)
             or isinstance(data, ChatCompletionComplete)
-            or isinstance(data, Completion)
             or isinstance(data, OAIChatCompletionChunk)
             or isinstance(data, OAICompletion)
         ):
@@ -188,8 +202,6 @@ class ResponseComparator(JSONSnapshotExtension):
                     if isinstance(choices, List) and len(choices) >= 1:
                         if "delta" in choices[0]:
                             return ChatCompletionChunk(**data)
-                        if "text" in choices[0]:
-                            return Completion(**data)
                     return ChatComplete(**data)
                 else:
                     return Response(**data)
@@ -282,9 +294,6 @@ class ResponseComparator(JSONSnapshotExtension):
                 )
             )
 
-        def eq_completion(response: Completion, other: Completion) -> bool:
-            return response.choices[0].text == other.choices[0].text
-
         def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
             return (
                 response.choices[0].message.content == other.choices[0].message.content
@@ -329,11 +338,6 @@ class ResponseComparator(JSONSnapshotExtension):
         if len(serialized_data) == 0:
             return len(snapshot_data) == len(serialized_data)
 
-        if isinstance(serialized_data[0], Completion):
-            return len(snapshot_data) == len(serialized_data) and all(
-                [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
-            )
-
         if isinstance(serialized_data[0], ChatComplete):
             return len(snapshot_data) == len(serialized_data) and all(
                 [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
diff --git a/backends/gaudi/server/integration-tests/conftest.py b/integration-tests/fixtures/gaudi/service.py
similarity index 98%
rename from backends/gaudi/server/integration-tests/conftest.py
rename to integration-tests/fixtures/gaudi/service.py
index c7daf70e..6b39a1e6 100644
--- a/backends/gaudi/server/integration-tests/conftest.py
+++ b/integration-tests/fixtures/gaudi/service.py
@@ -14,11 +14,18 @@ import docker
 import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
-from loguru import logger
-from test_model import TEST_CONFIGS
+import logging
+from gaudi.test_generate import TEST_CONFIGS
 from text_generation import AsyncClient
 from text_generation.types import Response
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)-8s | %(name)s:%(funcName)s:%(lineno)d - %(message)s",  # stdlib %-style LogRecord fields; loguru "<green>{time}" markup is not understood by logging
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
+
 # Use the latest image from the local docker build
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
@@ -48,12 +55,6 @@ HABANA_RUN_ARGS = {
     "cap_add": ["sys_nice"],
 }
 
-logger.add(
-    sys.stderr,
-    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
-    level="INFO",
-)
-
 
 def stream_container_logs(container, test_name):
     """Stream container logs in a separate thread."""
diff --git a/backends/gaudi/server/integration-tests/capture_expected_outputs.py b/integration-tests/gaudi/capture_expected_outputs.py
similarity index 98%
rename from backends/gaudi/server/integration-tests/capture_expected_outputs.py
rename to integration-tests/gaudi/capture_expected_outputs.py
index 051b9d69..6a5d4a68 100644
--- a/backends/gaudi/server/integration-tests/capture_expected_outputs.py
+++ b/integration-tests/gaudi/capture_expected_outputs.py
@@ -3,7 +3,7 @@ import os
 from typing import Dict, Any, Generator
 
 import pytest
-from test_model import TEST_CONFIGS
+from test_generate import TEST_CONFIGS
 
 UNKNOWN_CONFIGS = {
     name: config
diff --git a/backends/gaudi/server/integration-tests/test_model.py b/integration-tests/gaudi/test_model.py
similarity index 97%
rename from backends/gaudi/server/integration-tests/test_model.py
rename to integration-tests/gaudi/test_model.py
index 40b27164..407bccc2 100644
--- a/backends/gaudi/server/integration-tests/test_model.py
+++ b/integration-tests/gaudi/test_model.py
@@ -2,7 +2,6 @@ from typing import Any, Dict
 
 from text_generation import AsyncClient
 import pytest
-from Levenshtein import distance as levenshtein_distance
 
 # The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
 TEST_CONFIGS = {
@@ -257,6 +256,4 @@ async def test_model_multiple_requests(
     expected = expected_outputs["batch"]
     for r in responses:
         assert r.details.generated_tokens == 32
-        # Compute the similarity with the expectation using the levenshtein distance
-        # We should not have more than two substitutions or additions
-        assert levenshtein_distance(r.generated_text, expected) < 3
+        assert r.generated_text == expected

From b4917f67e49f97bdf36c1a6dc5c4a362973b73ef Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Thu, 10 Apr 2025 07:52:20 +0000
Subject: [PATCH 02/19] wip: able to launch gaudi tests

---
 integration-tests/conftest.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index b9466ae3..84d24637 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -94,6 +94,14 @@ def pytest_collection_modifyitems(config, items):
                 item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
 
         selectors.append(skip_not_gaudi)
+    else:
+
+        def skip_gaudi(item):
+            if "gaudi" in item.keywords:
+                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
+
+        selectors.append(skip_gaudi)
+
     if config.getoption("--neuron"):
 
         def skip_not_neuron(item):
@@ -109,12 +117,8 @@ def pytest_collection_modifyitems(config, items):
             if "neuron" in item.keywords:
                 item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
 
-        def skip_gaudi(item):
-            if "gaudi" in item.keywords:
-                item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
-
         selectors.append(skip_neuron)
-        selectors.append(skip_gaudi)
+
     for item in items:
         for selector in selectors:
             selector(item)

From 7779d0c786e3056a29af08c83f4056460dd90120 Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Thu, 10 Apr 2025 08:32:28 +0000
Subject: [PATCH 03/19] feat(ci): llama3 test working

---
 backends/gaudi/Makefile                     | 2 +-
 integration-tests/conftest.py               | 6 +++++-
 integration-tests/fixtures/gaudi/service.py | 9 +++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile
index 7652a7d2..3ece5a7e 100644
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@@ -54,7 +54,7 @@ run-integration-tests:
 	uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
-    pytest --durations=0 -s -vv integration-tests --gaudi
+    pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
 
 # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
 capture-expected-outputs-for-integration-tests:
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 84d24637..594ffd49 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -1,4 +1,8 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
+pytest_plugins = [
+    "fixtures.neuron.service",
+    "fixtures.neuron.export_models",
+    "fixtures.gaudi.service",
+]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 from huggingface_hub.inference._generated.types.chat_completion import (
diff --git a/integration-tests/fixtures/gaudi/service.py b/integration-tests/fixtures/gaudi/service.py
index 6b39a1e6..44c7f999 100644
--- a/integration-tests/fixtures/gaudi/service.py
+++ b/integration-tests/fixtures/gaudi/service.py
@@ -15,9 +15,10 @@ import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
 import logging
-from gaudi.test_generate import TEST_CONFIGS
+from gaudi.test_gaudi_generate import TEST_CONFIGS
 from text_generation import AsyncClient
 from text_generation.types import Response
+import huggingface_hub
 
 logging.basicConfig(
     level=logging.INFO,
@@ -29,7 +30,7 @@ logger = logging.getLogger(__file__)
 # Use the latest image from the local docker build
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
-HF_TOKEN = os.getenv("HF_TOKEN", None)
+HF_TOKEN = huggingface_hub.get_token()
 
 assert (
     HF_TOKEN is not None
@@ -152,7 +153,7 @@ def data_volume():
 
 
 @pytest.fixture(scope="module")
-def launcher(data_volume):
+def gaudi_launcher(event_loop):
     @contextlib.contextmanager
     def docker_launcher(
         model_id: str,
@@ -272,7 +273,7 @@ def launcher(data_volume):
 
 
 @pytest.fixture(scope="module")
-def generate_load():
+def gaudi_generate_load():
     async def generate_load_inner(
         client: AsyncClient, prompt: str, max_new_tokens: int, n: int
     ) -> List[Response]:

From 781dd203e96379b55d825073a824c655bb43d0ce Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Thu, 10 Apr 2025 08:32:37 +0000
Subject: [PATCH 04/19] feat(ci): llama3 test working

---
 .../gaudi/test_gaudi_generate.py              | 273 ++++++++++++++++++
 1 file changed, 273 insertions(+)
 create mode 100644 integration-tests/gaudi/test_gaudi_generate.py

diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
new file mode 100644
index 00000000..423ac17f
--- /dev/null
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -0,0 +1,273 @@
+from typing import Any, Dict
+
+from text_generation import AsyncClient
+import pytest
+
+# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
+TEST_CONFIGS = {
+    "meta-llama/Llama-3.1-8B-Instruct-shared": {
+        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
+        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
+        "args": [
+            "--sharded",
+            "true",
+            "--num-shard",
+            "8",
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "8",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "meta-llama/Llama-3.1-8B-Instruct": {
+        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+        "env_config": {},
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "meta-llama/Llama-2-7b-chat-hf": {
+        "model_id": "meta-llama/Llama-2-7b-chat-hf",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
+        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "mistralai/Mistral-7B-Instruct-v0.3": {
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
+        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "bigcode/starcoder2-3b": {
+        "model_id": "bigcode/starcoder2-3b",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
+        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "google/gemma-7b-it": {
+        "model_id": "google/gemma-7b-it",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
+        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "Qwen/Qwen2-0.5B-Instruct": {
+        "model_id": "Qwen/Qwen2-0.5B-Instruct",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
+        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "tiiuae/falcon-7b-instruct": {
+        "model_id": "tiiuae/falcon-7b-instruct",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
+        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
+    },
+    "microsoft/phi-1_5": {
+        "model_id": "microsoft/phi-1_5",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
+        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
+    },
+    "openai-community/gpt2": {
+        "model_id": "openai-community/gpt2",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
+        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
+    },
+    "facebook/opt-125m": {
+        "model_id": "facebook/opt-125m",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
+        "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
+    },
+    "EleutherAI/gpt-j-6b": {
+        "model_id": "EleutherAI/gpt-j-6b",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
+        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
+    },
+}
+
+print(f"Testing {len(TEST_CONFIGS)} models")
+
+
+@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
+def test_config(request) -> Dict[str, Any]:
+    """Fixture that provides model configurations for testing."""
+    test_config = TEST_CONFIGS[request.param]
+    test_config["test_name"] = request.param
+    return test_config
+
+
+@pytest.fixture(scope="module")
+def model_id(test_config):
+    yield test_config["model_id"]
+
+
+@pytest.fixture(scope="module")
+def test_name(test_config):
+    yield test_config["test_name"]
+
+
+@pytest.fixture(scope="module")
+def expected_outputs(test_config):
+    return {
+        "greedy": test_config["expected_greedy_output"],
+        # "sampling": model_config["expected_sampling_output"],
+        "batch": test_config["expected_batch_output"],
+    }
+
+
+@pytest.fixture(scope="module")
+def input(test_config):
+    return test_config["input"]
+
+
+@pytest.fixture(scope="module")
+def tgi_service(gaudi_launcher, model_id, test_name):
+    with gaudi_launcher(model_id, test_name) as tgi_service:
+        yield tgi_service
+
+
+@pytest.fixture(scope="module")
+async def tgi_client(tgi_service) -> AsyncClient:
+    await tgi_service.health(1000)
+    return tgi_service.client
+
+
+@pytest.mark.asyncio
+async def test_model_single_request(
+    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
+):
+    # Bounded greedy decoding of the configured input prompt
+    response = await tgi_client.generate(
+        input,
+        max_new_tokens=32,
+    )
+    assert response.details.generated_tokens == 32
+    assert response.generated_text == expected_outputs["greedy"]
+
+
+@pytest.mark.asyncio
+async def test_model_multiple_requests(
+    tgi_client, gaudi_generate_load, expected_outputs, input
+):
+    num_requests = 4
+    responses = await gaudi_generate_load(
+        tgi_client,
+        input,
+        max_new_tokens=32,
+        n=num_requests,
+    )
+
+    assert len(responses) == 4
+    expected = expected_outputs["batch"]
+    for r in responses:
+        assert r.details.generated_tokens == 32
+        assert r.generated_text == expected

From 8568f910a7caf5437b0496ead59ee6442fa145d0 Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Thu, 10 Apr 2025 09:03:49 +0000
Subject: [PATCH 05/19] fix llama failing test

---
 integration-tests/gaudi/test_gaudi_generate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index 423ac17f..d2adf2b2 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -8,8 +8,8 @@ TEST_CONFIGS = {
     "meta-llama/Llama-3.1-8B-Instruct-shared": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
         "input": "What is Deep Learning?",
-        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
-        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
+        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
         "args": [
             "--sharded",
             "true",

From 76d155e660f40993275f067b02ba28443d98005a Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Thu, 10 Apr 2025 11:47:40 +0000
Subject: [PATCH 06/19] wip(ci): rerun ci to debug

---
 integration-tests/gaudi/test_gaudi_generate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index d2adf2b2..4ed37dad 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -3,7 +3,7 @@ from typing import Any, Dict
 from text_generation import AsyncClient
 import pytest
 
-# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
+# The "args" values in TEST_CONFIGS are not optimized for speed; they only check that inference works for the different model architectures.
 TEST_CONFIGS = {
     "meta-llama/Llama-3.1-8B-Instruct-shared": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",

From 1bd2ad9635e423553e26d03179181fc3a6d5cd9f Mon Sep 17 00:00:00 2001
From: Pauline Bailly-Masson <155966238+paulinebm@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:16:14 +0200
Subject: [PATCH 07/19] Update tests.yaml

---
 .github/workflows/tests.yaml | 136 +++++++++++++++++++----------------
 1 file changed, 74 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 3e431c86..128952d6 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -1,67 +1,79 @@
-name: Server Tests
-
+---
+name: Ci-test
 on:
-  pull_request:
-    paths:
-      - ".github/workflows/tests.yaml"
-      - "server/**"
-      - "proto/**"
-      - "router/**"
-      - "launcher/**"
-      - "backends/**"
-      - "Cargo.lock"
-      - "rust-toolchain.toml"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
+  workflow_dispatch:
 
 jobs:
-  run_tests:
+  Ci-test:
     runs-on:
-      group: aws-highmemory-32-plus-priv
+        group: aws-dl1-24xlarge
     steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        id: python
-        with:
-          python-version: 3.11
-      - uses: dtolnay/rust-toolchain@1.85.0
-        with:
-          components: rustfmt, clippy
-      - name: Install Protoc
-        uses: arduino/setup-protoc@v1
-      - name: Clean unused files
-        run: |
-          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
-          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
-      - name: Install
-        run: |
-          sudo apt update
-          sudo apt install python3.11-dev -y
-          pip install -U pip uv
-          uv venv
-          source ./.venv/bin/activate
-          make install-cpu
-      - name: Download locked kernels
-        run: |
-          source ./.venv/bin/activate
-          kernels download server
-      - name: Run server tests
-        run: |
-          source ./.venv/bin/activate
-          uv pip install pytest
-          export HF_TOKEN=${{ secrets.HF_TOKEN }}
-          pytest -s -vv server/tests
-      - name: Pre-commit checks
-        run: |
-          pip install pre-commit
-          pre-commit install
-          pre-commit run --all-files
-      - name: Run Rust tests
-        run: |
-          cargo test
-      - name: Run Rust tests with google feature
-        run: |
-          cargo test --features google
+    - run: |
+        echo "🎉 This job uses runner scale set runners!"
+# name: Server Tests
+
+# on:
+#   pull_request:
+#     paths:
+#       - ".github/workflows/tests.yaml"
+#       - "server/**"
+#       - "proto/**"
+#       - "router/**"
+#       - "launcher/**"
+#       - "backends/**"
+#       - "Cargo.lock"
+#       - "rust-toolchain.toml"
+
+# concurrency:
+#   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+#   cancel-in-progress: true
+
+# jobs:
+#   run_tests:
+#     runs-on:
+#       group: aws-highmemory-32-plus-priv
+#     steps:
+#       - uses: actions/checkout@v4
+#       - name: Set up Python
+#         uses: actions/setup-python@v4
+#         id: python
+#         with:
+#           python-version: 3.11
+#       - uses: dtolnay/rust-toolchain@1.85.0
+#         with:
+#           components: rustfmt, clippy
+#       - name: Install Protoc
+#         uses: arduino/setup-protoc@v1
+#       - name: Clean unused files
+#         run: |
+#           sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
+#           sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
+#       - name: Install
+#         run: |
+#           sudo apt update
+#           sudo apt install python3.11-dev -y
+#           pip install -U pip uv
+#           uv venv
+#           source ./.venv/bin/activate
+#           make install-cpu
+#       - name: Download locked kernels
+#         run: |
+#           source ./.venv/bin/activate
+#           kernels download server
+#       - name: Run server tests
+#         run: |
+#           source ./.venv/bin/activate
+#           uv pip install pytest
+#           export HF_TOKEN=${{ secrets.HF_TOKEN }}
+#           pytest -s -vv server/tests
+#       - name: Pre-commit checks
+#         run: |
+#           pip install pre-commit
+#           pre-commit install
+#           pre-commit run --all-files
+#       - name: Run Rust tests
+#         run: |
+#           cargo test
+#       - name: Run Rust tests with google feature
+#         run: |
+#           cargo test --features google

From 2c2cfc09c5b7024e06ce788ba4d74021be17d242 Mon Sep 17 00:00:00 2001
From: Pauline Bailly-Masson <155966238+paulinebm@users.noreply.github.com>
Date: Thu, 10 Apr 2025 15:23:17 +0200
Subject: [PATCH 08/19] Update tests.yaml

---
 .github/workflows/tests.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 128952d6..70b2a536 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -1,7 +1,7 @@
 ---
 name: Ci-test
 on:
-  workflow_dispatch:
+  push:
 
 jobs:
   Ci-test:

From 4b5e812fe12b961b48da103326b89650ea39fde4 Mon Sep 17 00:00:00 2001
From: Baptiste Colle <collebaptiste@gmail.com>
Date: Thu, 10 Apr 2025 16:08:06 +0200
Subject: [PATCH 09/19] wip(ci): debug the ci

---
 .github/workflows/tests.yaml | 136 ++++++++++++++++-------------------
 1 file changed, 62 insertions(+), 74 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 70b2a536..3e431c86 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -1,79 +1,67 @@
----
-name: Ci-test
+name: Server Tests
+
 on:
-  push:
+  pull_request:
+    paths:
+      - ".github/workflows/tests.yaml"
+      - "server/**"
+      - "proto/**"
+      - "router/**"
+      - "launcher/**"
+      - "backends/**"
+      - "Cargo.lock"
+      - "rust-toolchain.toml"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
 
 jobs:
-  Ci-test:
+  run_tests:
     runs-on:
-        group: aws-dl1-24xlarge
+      group: aws-highmemory-32-plus-priv
     steps:
-    - run: |
-        echo "🎉 This job uses runner scale set runners!"
-# name: Server Tests
-
-# on:
-#   pull_request:
-#     paths:
-#       - ".github/workflows/tests.yaml"
-#       - "server/**"
-#       - "proto/**"
-#       - "router/**"
-#       - "launcher/**"
-#       - "backends/**"
-#       - "Cargo.lock"
-#       - "rust-toolchain.toml"
-
-# concurrency:
-#   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-#   cancel-in-progress: true
-
-# jobs:
-#   run_tests:
-#     runs-on:
-#       group: aws-highmemory-32-plus-priv
-#     steps:
-#       - uses: actions/checkout@v4
-#       - name: Set up Python
-#         uses: actions/setup-python@v4
-#         id: python
-#         with:
-#           python-version: 3.11
-#       - uses: dtolnay/rust-toolchain@1.85.0
-#         with:
-#           components: rustfmt, clippy
-#       - name: Install Protoc
-#         uses: arduino/setup-protoc@v1
-#       - name: Clean unused files
-#         run: |
-#           sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
-#           sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
-#       - name: Install
-#         run: |
-#           sudo apt update
-#           sudo apt install python3.11-dev -y
-#           pip install -U pip uv
-#           uv venv
-#           source ./.venv/bin/activate
-#           make install-cpu
-#       - name: Download locked kernels
-#         run: |
-#           source ./.venv/bin/activate
-#           kernels download server
-#       - name: Run server tests
-#         run: |
-#           source ./.venv/bin/activate
-#           uv pip install pytest
-#           export HF_TOKEN=${{ secrets.HF_TOKEN }}
-#           pytest -s -vv server/tests
-#       - name: Pre-commit checks
-#         run: |
-#           pip install pre-commit
-#           pre-commit install
-#           pre-commit run --all-files
-#       - name: Run Rust tests
-#         run: |
-#           cargo test
-#       - name: Run Rust tests with google feature
-#         run: |
-#           cargo test --features google
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        id: python
+        with:
+          python-version: 3.11
+      - uses: dtolnay/rust-toolchain@1.85.0
+        with:
+          components: rustfmt, clippy
+      - name: Install Protoc
+        uses: arduino/setup-protoc@v1
+      - name: Clean unused files
+        run: |
+          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
+          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
+      - name: Install
+        run: |
+          sudo apt update
+          sudo apt install python3.11-dev -y
+          pip install -U pip uv
+          uv venv
+          source ./.venv/bin/activate
+          make install-cpu
+      - name: Download locked kernels
+        run: |
+          source ./.venv/bin/activate
+          kernels download server
+      - name: Run server tests
+        run: |
+          source ./.venv/bin/activate
+          uv pip install pytest
+          export HF_TOKEN=${{ secrets.HF_TOKEN }}
+          pytest -s -vv server/tests
+      - name: Pre-commit checks
+        run: |
+          pip install pre-commit
+          pre-commit install
+          pre-commit run --all-files
+      - name: Run Rust tests
+        run: |
+          cargo test
+      - name: Run Rust tests with google feature
+        run: |
+          cargo test --features google

From a2a5772cd719883dcba61a30730b51e6d23b3d71 Mon Sep 17 00:00:00 2001
From: Baptiste Colle <collebaptiste@gmail.com>
Date: Thu, 10 Apr 2025 17:17:16 +0200
Subject: [PATCH 10/19] wip(ci): debug the ci

---
 .../gaudi/test_gaudi_generate.py              | 340 +++++++++---------
 1 file changed, 170 insertions(+), 170 deletions(-)

diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index 4ed37dad..184cbf15 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -5,26 +5,26 @@ import pytest
 
 # The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
 TEST_CONFIGS = {
-    "meta-llama/Llama-3.1-8B-Instruct-shared": {
-        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "args": [
-            "--sharded",
-            "true",
-            "--num-shard",
-            "8",
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "8",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
+    # "meta-llama/Llama-3.1-8B-Instruct-shared": {
+    #     "model_id": "meta-llama/Llama-3.1-8B-Instruct",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+    #     "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+    #     "args": [
+    #         "--sharded",
+    #         "true",
+    #         "--num-shard",
+    #         "8",
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "8",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
     "meta-llama/Llama-3.1-8B-Instruct": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
         "input": "What is Deep Learning?",
@@ -42,156 +42,156 @@ TEST_CONFIGS = {
             "2048",
         ],
     },
-    "meta-llama/Llama-2-7b-chat-hf": {
-        "model_id": "meta-llama/Llama-2-7b-chat-hf",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "mistralai/Mistral-7B-Instruct-v0.3": {
-        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "bigcode/starcoder2-3b": {
-        "model_id": "bigcode/starcoder2-3b",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "google/gemma-7b-it": {
-        "model_id": "google/gemma-7b-it",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "Qwen/Qwen2-0.5B-Instruct": {
-        "model_id": "Qwen/Qwen2-0.5B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "tiiuae/falcon-7b-instruct": {
-        "model_id": "tiiuae/falcon-7b-instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "microsoft/phi-1_5": {
-        "model_id": "microsoft/phi-1_5",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "openai-community/gpt2": {
-        "model_id": "openai-community/gpt2",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "facebook/opt-125m": {
-        "model_id": "facebook/opt-125m",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
-        "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "EleutherAI/gpt-j-6b": {
-        "model_id": "EleutherAI/gpt-j-6b",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
+    # "meta-llama/Llama-2-7b-chat-hf": {
+    #     "model_id": "meta-llama/Llama-2-7b-chat-hf",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
+    #     "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "mistralai/Mistral-7B-Instruct-v0.3": {
+    #     "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
+    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "bigcode/starcoder2-3b": {
+    #     "model_id": "bigcode/starcoder2-3b",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
+    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "google/gemma-7b-it": {
+    #     "model_id": "google/gemma-7b-it",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
+    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "Qwen/Qwen2-0.5B-Instruct": {
+    #     "model_id": "Qwen/Qwen2-0.5B-Instruct",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
+    #     "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #         "--max-batch-prefill-tokens",
+    #         "2048",
+    #     ],
+    # },
+    # "tiiuae/falcon-7b-instruct": {
+    #     "model_id": "tiiuae/falcon-7b-instruct",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
+    #     "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+    # "microsoft/phi-1_5": {
+    #     "model_id": "microsoft/phi-1_5",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
+    #     "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+    # "openai-community/gpt2": {
+    #     "model_id": "openai-community/gpt2",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
+    #     "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+    # "facebook/opt-125m": {
+    #     "model_id": "facebook/opt-125m",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
+    #     "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
+    # "EleutherAI/gpt-j-6b": {
+    #     "model_id": "EleutherAI/gpt-j-6b",
+    #     "input": "What is Deep Learning?",
+    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
+    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
+    #     "args": [
+    #         "--max-input-tokens",
+    #         "512",
+    #         "--max-total-tokens",
+    #         "1024",
+    #         "--max-batch-size",
+    #         "4",
+    #     ],
+    # },
 }
 
 print(f"Testing {len(TEST_CONFIGS)} models")

From 9c6776375eb7ae52f0c16b42acdbebd3b1200360 Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Tue, 22 Apr 2025 08:15:11 +0000
Subject: [PATCH 11/19] change default behaviour to only run a subset of all
 the models

---
 backends/gaudi/Makefile                       |   5 +
 integration-tests/conftest.py                 |   6 +
 .../gaudi/test_gaudi_generate.py              | 381 +++++++++---------
 3 files changed, 208 insertions(+), 184 deletions(-)

diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile
index 3ece5a7e..2eb5506f 100644
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@@ -56,6 +56,11 @@ run-integration-tests:
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
     pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
 
+run-integration-tests-with-all-models:
+	DOCKER_VOLUME=${root_dir}/data \
+	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
+	pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi --gaudi-all-models
+
 # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
 capture-expected-outputs-for-integration-tests:
 	pip install -U pip uv
diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 594ffd49..534aaaea 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -74,6 +74,12 @@ def pytest_addoption(parser):
     parser.addoption(
         "--gaudi", action="store_true", default=False, help="run gaudi tests"
     )
+    parser.addoption(
+        "--gaudi-all-models",
+        action="store_true",
+        default=False,
+        help="Run tests for all models instead of just the default subset",
+    )
 
 
 def pytest_configure(config):
diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index 184cbf15..c2d768d0 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -1,30 +1,39 @@
-from typing import Any, Dict
+from typing import Any, Dict, Generator
+from _pytest.fixtures import SubRequest
 
 from text_generation import AsyncClient
 import pytest
 
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers", "gaudi_all_models: mark test to run with all models"
+    )
+
+
 # The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
 TEST_CONFIGS = {
-    # "meta-llama/Llama-3.1-8B-Instruct-shared": {
-    #     "model_id": "meta-llama/Llama-3.1-8B-Instruct",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-    #     "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-    #     "args": [
-    #         "--sharded",
-    #         "true",
-    #         "--num-shard",
-    #         "8",
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "8",
-    #         "--max-batch-prefill-tokens",
-    #         "2048",
-    #     ],
-    # },
+    "meta-llama/Llama-3.1-8B-Instruct-shared": {
+        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
+        "args": [
+            "--sharded",
+            "true",
+            "--num-shard",
+            "8",
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "8",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+        "run_by_default": True,
+    },
     "meta-llama/Llama-3.1-8B-Instruct": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",
         "input": "What is Deep Learning?",
@@ -41,196 +50,195 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
+        "run_by_default": True,
+    },
+    "meta-llama/Llama-2-7b-chat-hf": {
+        "model_id": "meta-llama/Llama-2-7b-chat-hf",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
+        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "mistralai/Mistral-7B-Instruct-v0.3": {
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
+        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "bigcode/starcoder2-3b": {
+        "model_id": "bigcode/starcoder2-3b",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
+        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "google/gemma-7b-it": {
+        "model_id": "google/gemma-7b-it",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
+        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "Qwen/Qwen2-0.5B-Instruct": {
+        "model_id": "Qwen/Qwen2-0.5B-Instruct",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
+        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+            "--max-batch-prefill-tokens",
+            "2048",
+        ],
+    },
+    "tiiuae/falcon-7b-instruct": {
+        "model_id": "tiiuae/falcon-7b-instruct",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
+        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
+    },
+    "microsoft/phi-1_5": {
+        "model_id": "microsoft/phi-1_5",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
+        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
+    },
+    "openai-community/gpt2": {
+        "model_id": "openai-community/gpt2",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
+        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
+    },
+    "EleutherAI/gpt-j-6b": {
+        "model_id": "EleutherAI/gpt-j-6b",
+        "input": "What is Deep Learning?",
+        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
+        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
+        "args": [
+            "--max-input-tokens",
+            "512",
+            "--max-total-tokens",
+            "1024",
+            "--max-batch-size",
+            "4",
+        ],
     },
-    # "meta-llama/Llama-2-7b-chat-hf": {
-    #     "model_id": "meta-llama/Llama-2-7b-chat-hf",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-    #     "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #         "--max-batch-prefill-tokens",
-    #         "2048",
-    #     ],
-    # },
-    # "mistralai/Mistral-7B-Instruct-v0.3": {
-    #     "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #         "--max-batch-prefill-tokens",
-    #         "2048",
-    #     ],
-    # },
-    # "bigcode/starcoder2-3b": {
-    #     "model_id": "bigcode/starcoder2-3b",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #         "--max-batch-prefill-tokens",
-    #         "2048",
-    #     ],
-    # },
-    # "google/gemma-7b-it": {
-    #     "model_id": "google/gemma-7b-it",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #         "--max-batch-prefill-tokens",
-    #         "2048",
-    #     ],
-    # },
-    # "Qwen/Qwen2-0.5B-Instruct": {
-    #     "model_id": "Qwen/Qwen2-0.5B-Instruct",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-    #     "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #         "--max-batch-prefill-tokens",
-    #         "2048",
-    #     ],
-    # },
-    # "tiiuae/falcon-7b-instruct": {
-    #     "model_id": "tiiuae/falcon-7b-instruct",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-    #     "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #     ],
-    # },
-    # "microsoft/phi-1_5": {
-    #     "model_id": "microsoft/phi-1_5",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-    #     "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #     ],
-    # },
-    # "openai-community/gpt2": {
-    #     "model_id": "openai-community/gpt2",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-    #     "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #     ],
-    # },
-    # "facebook/opt-125m": {
-    #     "model_id": "facebook/opt-125m",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
-    #     "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #     ],
-    # },
-    # "EleutherAI/gpt-j-6b": {
-    #     "model_id": "EleutherAI/gpt-j-6b",
-    #     "input": "What is Deep Learning?",
-    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-    #     "args": [
-    #         "--max-input-tokens",
-    #         "512",
-    #         "--max-total-tokens",
-    #         "1024",
-    #         "--max-batch-size",
-    #         "4",
-    #     ],
-    # },
 }
 
-print(f"Testing {len(TEST_CONFIGS)} models")
+
+def pytest_generate_tests(metafunc):
+    if "test_config" in metafunc.fixturenames:
+        if metafunc.config.getoption("--gaudi-all-models"):
+            models = list(TEST_CONFIGS.keys())
+        else:
+            models = [
+                name
+                for name, config in TEST_CONFIGS.items()
+                if config.get("run_by_default", False)
+            ]
+        print(f"Testing {len(models)} models")
+        metafunc.parametrize("test_config", models, indirect=True)
 
 
-@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
-def test_config(request) -> Dict[str, Any]:
+@pytest.fixture(scope="module")
+def test_config(request: SubRequest) -> Dict[str, Any]:
     """Fixture that provides model configurations for testing."""
-    test_config = TEST_CONFIGS[request.param]
-    test_config["test_name"] = request.param
+    model_name = request.param
+    test_config = TEST_CONFIGS[model_name]
+    test_config["test_name"] = model_name
     return test_config
 
 
 @pytest.fixture(scope="module")
-def model_id(test_config):
+def model_id(test_config: Dict[str, Any]) -> Generator[str, None, None]:
     yield test_config["model_id"]
 
 
 @pytest.fixture(scope="module")
-def test_name(test_config):
+def test_name(test_config: Dict[str, Any]) -> Generator[str, None, None]:
     yield test_config["test_name"]
 
 
 @pytest.fixture(scope="module")
-def expected_outputs(test_config):
+def expected_outputs(test_config: Dict[str, Any]) -> Dict[str, str]:
     return {
         "greedy": test_config["expected_greedy_output"],
-        # "sampling": model_config["expected_sampling_output"],
         "batch": test_config["expected_batch_output"],
     }
 
 
 @pytest.fixture(scope="module")
-def input(test_config):
+def input(test_config: Dict[str, Any]) -> str:
     return test_config["input"]
 
 
 @pytest.fixture(scope="module")
-def tgi_service(gaudi_launcher, model_id, test_name):
+def tgi_service(gaudi_launcher, model_id: str, test_name: str):
     with gaudi_launcher(model_id, test_name) as tgi_service:
         yield tgi_service
 
@@ -242,8 +250,9 @@ async def tgi_client(tgi_service) -> AsyncClient:
 
 
 @pytest.mark.asyncio
+@pytest.mark.all_models
 async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
+    tgi_client: AsyncClient, expected_outputs: Dict[str, str], input: str
 ):
     # Bounded greedy decoding without input
     response = await tgi_client.generate(
@@ -255,8 +264,12 @@ async def test_model_single_request(
 
 
 @pytest.mark.asyncio
+@pytest.mark.all_models
 async def test_model_multiple_requests(
-    tgi_client, gaudi_generate_load, expected_outputs, input
+    tgi_client: AsyncClient,
+    gaudi_generate_load,
+    expected_outputs: Dict[str, str],
+    input: str,
 ):
     num_requests = 4
     responses = await gaudi_generate_load(

From 59dc8c2699b0e43c33403e6e9d1dfaa0fa4f4680 Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Tue, 22 Apr 2025 08:16:17 +0000
Subject: [PATCH 12/19] change default behaviour to only run a subset of all
 the models

---
 backends/gaudi/README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/backends/gaudi/README.md b/backends/gaudi/README.md
index ba890f0b..fa68c0a9 100644
--- a/backends/gaudi/README.md
+++ b/backends/gaudi/README.md
@@ -104,11 +104,16 @@ To run the integration tests, you need to first build the image:
 make -C backends/gaudi image
 ```
 
-Then run the following command to run the integration tests:
+Then run the following command to run the integration tests (CI tests):
 ```bash
 make -C backends/gaudi run-integration-tests
 ```
 
+To run the integration tests with all models, you can run the following command:
+```bash
+make -C backends/gaudi run-integration-tests-with-all-models
+```
+
 To capture the expected outputs for the integration tests, you can run the following command:
 ```bash
 make -C backends/gaudi capture-expected-outputs-for-integration-tests

From fcf6870d20976336601545e39a35af4c553b2314 Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Tue, 22 Apr 2025 08:43:45 +0000
Subject: [PATCH 13/19] testing

---
 integration-tests/gaudi/test_gaudi_generate.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index c2d768d0..26ba47fe 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -32,7 +32,6 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
-        "run_by_default": True,
     },
     "meta-llama/Llama-3.1-8B-Instruct": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",

From 9c235f4d66bef213537496e11f36cf5d7fc04168 Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Tue, 22 Apr 2025 09:17:44 +0000
Subject: [PATCH 14/19] feat(gaudi/ci): added ci for gaudi device

---
 integration-tests/fixtures/gaudi/service.py       | 15 ++++++++-------
 .../gaudi/capture_expected_outputs.py             |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/integration-tests/fixtures/gaudi/service.py b/integration-tests/fixtures/gaudi/service.py
index 44c7f999..b6942dbe 100644
--- a/integration-tests/fixtures/gaudi/service.py
+++ b/integration-tests/fixtures/gaudi/service.py
@@ -190,11 +190,7 @@ def gaudi_launcher(event_loop):
         except Exception as e:
             logger.error(f"Error handling existing container: {str(e)}")
 
-        model_name = next(
-            name for name, cfg in TEST_CONFIGS.items() if cfg["model_id"] == model_id
-        )
-
-        tgi_args = TEST_CONFIGS[model_name]["args"].copy()
+        tgi_args = TEST_CONFIGS[test_name]["args"].copy()
 
         env = BASE_ENV.copy()
 
@@ -202,15 +198,20 @@ def gaudi_launcher(event_loop):
         env["MODEL_ID"] = model_id
 
         # Add env config that is definied in the fixture parameter
-        if "env_config" in TEST_CONFIGS[model_name]:
-            env.update(TEST_CONFIGS[model_name]["env_config"].copy())
+        if "env_config" in TEST_CONFIGS[test_name]:
+            env.update(TEST_CONFIGS[test_name]["env_config"].copy())
 
         volumes = [f"{DOCKER_VOLUME}:/data"]
         logger.debug(f"Using volume {volumes}")
 
         try:
+            logger.debug(f"Using command {tgi_args}")
             logger.info(f"Creating container with name {container_name}")
 
+            logger.debug(f"Using environment {env}")
+            logger.debug(f"Using volumes {volumes}")
+            logger.debug(f"HABANA_RUN_ARGS {HABANA_RUN_ARGS}")
+
             # Log equivalent docker run command for debugging, this is not actually executed
             container = client.containers.run(
                 DOCKER_IMAGE,
diff --git a/integration-tests/gaudi/capture_expected_outputs.py b/integration-tests/gaudi/capture_expected_outputs.py
index 6a5d4a68..5a5fd179 100644
--- a/integration-tests/gaudi/capture_expected_outputs.py
+++ b/integration-tests/gaudi/capture_expected_outputs.py
@@ -3,7 +3,7 @@ import os
 from typing import Dict, Any, Generator
 
 import pytest
-from test_generate import TEST_CONFIGS
+from test_gaudi_generate import TEST_CONFIGS
 
 UNKNOWN_CONFIGS = {
     name: config

From 8768085c8c085419cb3471e74b2265d434b3d053 Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Wed, 21 May 2025 11:27:11 +0000
Subject: [PATCH 15/19] add new gaudi3 runners

---
 .github/workflows/build.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 59fd66ce..c7ccf764 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -129,7 +129,7 @@ jobs:
                 export label_extension="-gaudi"
                 export docker_volume="/mnt/cache"
                 export docker_devices=""
-                export runs_on="aws-dl1-24xlarge"
+                export runs_on="itac-bm-emr-gaudi3-dell-1gaudi"
                 export platform=""
                 export extra_pytest="--gaudi"
                 export target=""

From 1f03afe94dc2070767aa28d479868408c341890b Mon Sep 17 00:00:00 2001
From: baptiste <baptiste.colle@huggingface.co>
Date: Wed, 21 May 2025 15:28:58 +0000
Subject: [PATCH 16/19] enable multi-card test

---
 .github/workflows/build.yaml                   | 2 +-
 integration-tests/gaudi/test_gaudi_generate.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index c7ccf764..14c69a2b 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -129,7 +129,7 @@ jobs:
                 export label_extension="-gaudi"
                 export docker_volume="/mnt/cache"
                 export docker_devices=""
-                export runs_on="itac-bm-emr-gaudi3-dell-1gaudi"
+                export runs_on="itac-bm-emr-gaudi3-dell-8gaudi"
                 export platform=""
                 export extra_pytest="--gaudi"
                 export target=""
diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index 26ba47fe..c2d768d0 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -32,6 +32,7 @@ TEST_CONFIGS = {
             "--max-batch-prefill-tokens",
             "2048",
         ],
+        "run_by_default": True,
     },
     "meta-llama/Llama-3.1-8B-Instruct": {
         "model_id": "meta-llama/Llama-3.1-8B-Instruct",

From 0295bf243f38341185b597e7c613d6283f0d7c5d Mon Sep 17 00:00:00 2001
From: baptiste <collebaptiste@gmail.com>
Date: Mon, 23 Jun 2025 12:10:14 +0000
Subject: [PATCH 17/19] fix broken test

---
 backends/gaudi/Makefile                       |   2 -
 backends/gaudi/README.md                      |   5 +
 integration-tests/fixtures/gaudi/service.py   |  49 ++--
 .../gaudi/test_gaudi_generate.py              |  22 +-
 integration-tests/gaudi/test_model.py         | 259 ------------------
 5 files changed, 52 insertions(+), 285 deletions(-)
 delete mode 100644 integration-tests/gaudi/test_model.py

diff --git a/backends/gaudi/Makefile b/backends/gaudi/Makefile
index 2eb5506f..40d17f61 100644
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@@ -50,8 +50,6 @@ local-dev-install: install-dependencies
 
 # In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
 run-integration-tests:
-	pip install -U pip uv
-	uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
     pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
diff --git a/backends/gaudi/README.md b/backends/gaudi/README.md
index fa68c0a9..7713040f 100644
--- a/backends/gaudi/README.md
+++ b/backends/gaudi/README.md
@@ -99,6 +99,11 @@ curl 127.0.0.1:8080/generate \
 
 ### Integration tests
 
+Install the dependencies:
+```bash
+pip install -r integration-tests/requirements.txt
+```
+
 To run the integration tests, you need to first build the image:
 ```bash
 make -C backends/gaudi image
diff --git a/integration-tests/fixtures/gaudi/service.py b/integration-tests/fixtures/gaudi/service.py
index b6942dbe..5c7d729b 100644
--- a/integration-tests/fixtures/gaudi/service.py
+++ b/integration-tests/fixtures/gaudi/service.py
@@ -16,8 +16,7 @@ from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
 import logging
 from gaudi.test_gaudi_generate import TEST_CONFIGS
-from text_generation import AsyncClient
-from text_generation.types import Response
+from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
 import huggingface_hub
 
 logging.basicConfig(
@@ -71,9 +70,15 @@ def stream_container_logs(container, test_name):
         logger.error(f"Error streaming container logs: {str(e)}")
 
 
+class TestClient(AsyncInferenceClient):
+    def __init__(self, service_name: str, base_url: str):
+        super().__init__(model=base_url)
+        self.service_name = service_name
+
+
 class LauncherHandle:
-    def __init__(self, port: int):
-        self.client = AsyncClient(f"http://localhost:{port}", timeout=3600)
+    def __init__(self, service_name: str, port: int):
+        self.client = TestClient(service_name, f"http://localhost:{port}")
 
     def _inner_health(self):
         raise NotImplementedError
@@ -89,7 +94,7 @@ class LauncherHandle:
                 raise RuntimeError("Launcher crashed")
 
             try:
-                await self.client.generate("test")
+                await self.client.text_generation("test", max_new_tokens=1)
                 elapsed = time.time() - start_time
                 logger.info(f"Health check passed after {elapsed:.1f}s")
                 return
@@ -113,7 +118,8 @@ class LauncherHandle:
 
 class ContainerLauncherHandle(LauncherHandle):
     def __init__(self, docker_client, container_name, port: int):
-        super(ContainerLauncherHandle, self).__init__(port)
+        service_name = container_name  # Use container name as service name
+        super(ContainerLauncherHandle, self).__init__(service_name, port)
         self.docker_client = docker_client
         self.container_name = container_name
 
@@ -134,7 +140,8 @@ class ContainerLauncherHandle(LauncherHandle):
 
 class ProcessLauncherHandle(LauncherHandle):
     def __init__(self, process, port: int):
-        super(ProcessLauncherHandle, self).__init__(port)
+        service_name = "process"  # Use generic name for process launcher
+        super(ProcessLauncherHandle, self).__init__(service_name, port)
         self.process = process
 
     def _inner_health(self) -> bool:
@@ -153,11 +160,13 @@ def data_volume():
 
 
 @pytest.fixture(scope="module")
-def gaudi_launcher(event_loop):
+def gaudi_launcher():
     @contextlib.contextmanager
     def docker_launcher(
         model_id: str,
         test_name: str,
+        tgi_args: List[str] = None,
+        env_config: dict = None
     ):
         logger.info(
             f"Starting docker launcher for model {model_id} and test {test_name}"
@@ -185,23 +194,30 @@ def gaudi_launcher(event_loop):
             )
             container.stop()
             container.wait()
+            container.remove()
+            logger.info(f"Removed existing container {container_name}")
         except NotFound:
             pass
         except Exception as e:
             logger.error(f"Error handling existing container: {str(e)}")
 
-        tgi_args = TEST_CONFIGS[test_name]["args"].copy()
+        if tgi_args is None:
+            tgi_args = []
+        else:
+            tgi_args = tgi_args.copy()
 
         env = BASE_ENV.copy()
 
         # Add model_id to env
         env["MODEL_ID"] = model_id
 
-        # Add env config that is definied in the fixture parameter
-        if "env_config" in TEST_CONFIGS[test_name]:
-            env.update(TEST_CONFIGS[test_name]["env_config"].copy())
+        # Add env config that is defined in the fixture parameter
+        if env_config is not None:
+            env.update(env_config.copy())
 
-        volumes = [f"{DOCKER_VOLUME}:/data"]
+        volumes = []
+        if DOCKER_VOLUME:
+            volumes = [f"{DOCKER_VOLUME}:/data"]
         logger.debug(f"Using volume {volumes}")
 
         try:
@@ -276,13 +292,14 @@ def gaudi_launcher(event_loop):
 @pytest.fixture(scope="module")
 def gaudi_generate_load():
     async def generate_load_inner(
-        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
-    ) -> List[Response]:
+        client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n: int
+    ) -> List[TextGenerationOutput]:
         try:
             futures = [
-                client.generate(
+                client.text_generation(
                     prompt,
                     max_new_tokens=max_new_tokens,
+                    details=True,
                     decoder_input_details=True,
                 )
                 for _ in range(n)
diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index c2d768d0..f5d71ab7 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -1,7 +1,6 @@
 from typing import Any, Dict, Generator
 from _pytest.fixtures import SubRequest
-
-from text_generation import AsyncClient
+from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
 import pytest
 
 
@@ -238,13 +237,18 @@ def input(test_config: Dict[str, Any]) -> str:
 
 
 @pytest.fixture(scope="module")
-def tgi_service(gaudi_launcher, model_id: str, test_name: str):
-    with gaudi_launcher(model_id, test_name) as tgi_service:
+def tgi_service(gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]):
+    with gaudi_launcher(
+        model_id, 
+        test_name, 
+        tgi_args=test_config.get("args", []),
+        env_config=test_config.get("env_config", {})
+    ) as tgi_service:
         yield tgi_service
 
 
 @pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
+async def tgi_client(tgi_service) -> AsyncInferenceClient:
     await tgi_service.health(1000)
     return tgi_service.client
 
@@ -252,12 +256,14 @@ async def tgi_client(tgi_service) -> AsyncClient:
 @pytest.mark.asyncio
 @pytest.mark.all_models
 async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, str], input: str
+    tgi_client: AsyncInferenceClient, expected_outputs: Dict[str, str], input: str
 ):
     # Bounded greedy decoding without input
-    response = await tgi_client.generate(
+    response = await tgi_client.text_generation(
         input,
         max_new_tokens=32,
+        details=True,
+        decoder_input_details=True,
     )
     assert response.details.generated_tokens == 32
     assert response.generated_text == expected_outputs["greedy"]
@@ -266,7 +272,7 @@ async def test_model_single_request(
 @pytest.mark.asyncio
 @pytest.mark.all_models
 async def test_model_multiple_requests(
-    tgi_client: AsyncClient,
+    tgi_client: AsyncInferenceClient,
     gaudi_generate_load,
     expected_outputs: Dict[str, str],
     input: str,
diff --git a/integration-tests/gaudi/test_model.py b/integration-tests/gaudi/test_model.py
deleted file mode 100644
index 407bccc2..00000000
--- a/integration-tests/gaudi/test_model.py
+++ /dev/null
@@ -1,259 +0,0 @@
-from typing import Any, Dict
-
-from text_generation import AsyncClient
-import pytest
-
-# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
-TEST_CONFIGS = {
-    "meta-llama/Llama-3.1-8B-Instruct-shared": {
-        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "args": [
-            "--sharded",
-            "true",
-            "--num-shard",
-            "8",
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "8",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "meta-llama/Llama-3.1-8B-Instruct": {
-        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
-        "env_config": {},
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "meta-llama/Llama-2-7b-chat-hf": {
-        "model_id": "meta-llama/Llama-2-7b-chat-hf",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "mistralai/Mistral-7B-Instruct-v0.3": {
-        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "bigcode/starcoder2-3b": {
-        "model_id": "bigcode/starcoder2-3b",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "google/gemma-7b-it": {
-        "model_id": "google/gemma-7b-it",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "Qwen/Qwen2-0.5B-Instruct": {
-        "model_id": "Qwen/Qwen2-0.5B-Instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-            "--max-batch-prefill-tokens",
-            "2048",
-        ],
-    },
-    "tiiuae/falcon-7b-instruct": {
-        "model_id": "tiiuae/falcon-7b-instruct",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "microsoft/phi-1_5": {
-        "model_id": "microsoft/phi-1_5",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "openai-community/gpt2": {
-        "model_id": "openai-community/gpt2",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-    "EleutherAI/gpt-j-6b": {
-        "model_id": "EleutherAI/gpt-j-6b",
-        "input": "What is Deep Learning?",
-        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
-        "args": [
-            "--max-input-tokens",
-            "512",
-            "--max-total-tokens",
-            "1024",
-            "--max-batch-size",
-            "4",
-        ],
-    },
-}
-
-print(f"Testing {len(TEST_CONFIGS)} models")
-
-
-@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
-def test_config(request) -> Dict[str, Any]:
-    """Fixture that provides model configurations for testing."""
-    test_config = TEST_CONFIGS[request.param]
-    test_config["test_name"] = request.param
-    return test_config
-
-
-@pytest.fixture(scope="module")
-def model_id(test_config):
-    yield test_config["model_id"]
-
-
-@pytest.fixture(scope="module")
-def test_name(test_config):
-    yield test_config["test_name"]
-
-
-@pytest.fixture(scope="module")
-def expected_outputs(test_config):
-    return {
-        "greedy": test_config["expected_greedy_output"],
-        # "sampling": model_config["expected_sampling_output"],
-        "batch": test_config["expected_batch_output"],
-    }
-
-
-@pytest.fixture(scope="module")
-def input(test_config):
-    return test_config["input"]
-
-
-@pytest.fixture(scope="module")
-def tgi_service(launcher, model_id, test_name):
-    with launcher(model_id, test_name) as tgi_service:
-        yield tgi_service
-
-
-@pytest.fixture(scope="module")
-async def tgi_client(tgi_service) -> AsyncClient:
-    await tgi_service.health(1000)
-    return tgi_service.client
-
-
-@pytest.mark.asyncio
-async def test_model_single_request(
-    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
-):
-    # Bounded greedy decoding without input
-    response = await tgi_client.generate(
-        input,
-        max_new_tokens=32,
-    )
-    assert response.details.generated_tokens == 32
-    assert response.generated_text == expected_outputs["greedy"]
-
-
-@pytest.mark.asyncio
-async def test_model_multiple_requests(
-    tgi_client, generate_load, expected_outputs, input
-):
-    num_requests = 4
-    responses = await generate_load(
-        tgi_client,
-        input,
-        max_new_tokens=32,
-        n=num_requests,
-    )
-
-    assert len(responses) == 4
-    expected = expected_outputs["batch"]
-    for r in responses:
-        assert r.details.generated_tokens == 32
-        assert r.generated_text == expected

From a32025f931f9e27e2595bd11f31fc1ce4dd3cf8e Mon Sep 17 00:00:00 2001
From: baptiste <collebaptiste@gmail.com>
Date: Mon, 23 Jun 2025 12:26:06 +0000
Subject: [PATCH 18/19] fix style: drop imports and reformat gaudi fixtures

---
 integration-tests/fixtures/gaudi/service.py    |  3 +--
 integration-tests/gaudi/test_gaudi_generate.py | 12 +++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/integration-tests/fixtures/gaudi/service.py b/integration-tests/fixtures/gaudi/service.py
index 5c7d729b..f4f43691 100644
--- a/integration-tests/fixtures/gaudi/service.py
+++ b/integration-tests/fixtures/gaudi/service.py
@@ -15,7 +15,6 @@ import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
 import logging
-from gaudi.test_gaudi_generate import TEST_CONFIGS
 from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
 import huggingface_hub
 
@@ -166,7 +165,7 @@ def gaudi_launcher():
         model_id: str,
         test_name: str,
         tgi_args: List[str] = None,
-        env_config: dict = None
+        env_config: dict = None,
     ):
         logger.info(
             f"Starting docker launcher for model {model_id} and test {test_name}"
diff --git a/integration-tests/gaudi/test_gaudi_generate.py b/integration-tests/gaudi/test_gaudi_generate.py
index f5d71ab7..2b8b0c76 100644
--- a/integration-tests/gaudi/test_gaudi_generate.py
+++ b/integration-tests/gaudi/test_gaudi_generate.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, Generator
 from _pytest.fixtures import SubRequest
-from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
+from huggingface_hub import AsyncInferenceClient
 import pytest
 
 
@@ -237,12 +237,14 @@ def input(test_config: Dict[str, Any]) -> str:
 
 
 @pytest.fixture(scope="module")
-def tgi_service(gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]):
+def tgi_service(
+    gaudi_launcher, model_id: str, test_name: str, test_config: Dict[str, Any]
+):
     with gaudi_launcher(
-        model_id, 
-        test_name, 
+        model_id,
+        test_name,
         tgi_args=test_config.get("args", []),
-        env_config=test_config.get("env_config", {})
+        env_config=test_config.get("env_config", {}),
     ) as tgi_service:
         yield tgi_service
 

From ae7f3aeba1ccfa38d9da86bcdb398f71afc99f41 Mon Sep 17 00:00:00 2001
From: baptiste <collebaptiste@gmail.com>
Date: Mon, 23 Jun 2025 12:27:32 +0000
Subject: [PATCH 19/19] update conftest: support Completion in ResponseComparator

---
 integration-tests/conftest.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py
index 534aaaea..9cc33416 100644
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -51,6 +51,7 @@ from text_generation.types import (
     ChatComplete,
     ChatCompletionChunk,
     ChatCompletionComplete,
+    Completion,
     Details,
     Grammar,
     InputToken,
@@ -160,6 +161,7 @@ class ResponseComparator(JSONSnapshotExtension):
             or isinstance(data, ChatComplete)
             or isinstance(data, ChatCompletionChunk)
             or isinstance(data, ChatCompletionComplete)
+            or isinstance(data, Completion)
             or isinstance(data, OAIChatCompletionChunk)
             or isinstance(data, OAICompletion)
         ):
@@ -216,6 +218,8 @@ class ResponseComparator(JSONSnapshotExtension):
                     if isinstance(choices, List) and len(choices) >= 1:
                         if "delta" in choices[0]:
                             return ChatCompletionChunk(**data)
+                        if "text" in choices[0]:
+                            return Completion(**data)
                     return ChatComplete(**data)
                 else:
                     return Response(**data)
@@ -308,6 +312,9 @@ class ResponseComparator(JSONSnapshotExtension):
                 )
             )
 
+        def eq_completion(response: Completion, other: Completion) -> bool:
+            return response.choices[0].text == other.choices[0].text
+
         def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
             return (
                 response.choices[0].message.content == other.choices[0].message.content
@@ -352,6 +359,11 @@ class ResponseComparator(JSONSnapshotExtension):
         if len(serialized_data) == 0:
             return len(snapshot_data) == len(serialized_data)
 
+        if isinstance(serialized_data[0], Completion):
+            return len(snapshot_data) == len(serialized_data) and all(
+                [eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
+            )
+
         if isinstance(serialized_data[0], ChatComplete):
             return len(snapshot_data) == len(serialized_data) and all(
                 [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]