feat(ci): llama3 test working

2025-07-01 21:40:16 +00:00 · 2025-04-10 08:32:28 +00:00 · 2025-04-10 08:32:28 +00:00 · e024f1dd22
commit e024f1dd22
parent 23fe77f059
4 changed files with 11 additions and 279 deletions
--- a/backends/gaudi/Makefile
+++ b/backends/gaudi/Makefile
@ -52,7 +52,7 @@ local-dev-install: install-dependencies
 run-integration-tests:
 	DOCKER_VOLUME=${root_dir}/data \
 	HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
-    pytest --durations=0 -s -vv integration-tests --gaudi
+    pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
 # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
 capture-expected-outputs-for-integration-tests:
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@ -1,4 +1,8 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
+pytest_plugins = [
    "fixtures.neuron.service",
    "fixtures.neuron.export_models",
    "fixtures.gaudi.service",
 ]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 from huggingface_hub.inference._generated.types.chat_completion import (
--- a/integration-tests/fixtures/gaudi/service.py
+++ b/integration-tests/fixtures/gaudi/service.py
@ -15,9 +15,10 @@ import pytest
 from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
 from docker.errors import NotFound
 import logging
-from gaudi.test_generate import TEST_CONFIGS
+from gaudi.test_gaudi_generate import TEST_CONFIGS
 from text_generation import AsyncClient
 from text_generation.types import Response
 import huggingface_hub
 logging.basicConfig(
    level=logging.INFO,
@ -29,7 +30,7 @@ logger = logging.getLogger(__file__)
 # Use the latest image from the local docker build
 DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
 DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
-HF_TOKEN = os.getenv("HF_TOKEN", None)
+HF_TOKEN = huggingface_hub.get_token()
 assert (
    HF_TOKEN is not None
@ -152,7 +153,7 @@ def data_volume():
@pytest.fixture(scope="module")
-def launcher(data_volume):
+def gaudi_launcher(event_loop):
    @contextlib.contextmanager
    def docker_launcher(
        model_id: str,
@ -272,7 +273,7 @@ def launcher(data_volume):
@pytest.fixture(scope="module")
-def generate_load():
+def gaudi_generate_load():
    async def generate_load_inner(
        client: AsyncClient, prompt: str, max_new_tokens: int, n: int
    ) -> List[Response]:
--- a/integration-tests/gaudi/test_model.py
+++ b/integration-tests/gaudi/test_model.py
@ -1,273 +0,0 @@
 from typing import Any, Dict
 from text_generation import AsyncClient
 import pytest
 # Per-model data for the Gaudi integration tests in this module.
 # The "args" config is not optimized for speed; it only checks that inference
 # works for the different model architectures.
 #
 # Each key is used as the test name (the test_config fixture below is
 # parametrized over these keys). Each value holds:
 #   "model_id"               - exposed by the model_id fixture
 #   "input"                  - prompt sent to the service by the tests
 #   "expected_greedy_output" - text compared against a single generate() call
 #   "expected_batch_output"  - text compared against every concurrent response
 #   "args"                   - flat list of flag/value strings; presumably
 #                              forwarded to the launcher - confirm in the
 #                              launcher fixture
 #   "env_config"             - only present on one entry (as an empty dict);
 #                              its consumer is not visible here
 TEST_CONFIGS = {
    # NOTE(review): the "-shared" suffix may be a typo for "-sharded" (this is
    # the only entry passing "--sharded true") - confirm before renaming, since
    # the key doubles as the test name.
    "meta-llama/Llama-3.1-8B-Instruct-shared": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
        "args": [
            "--sharded",
            "true",
            "--num-shard",
            "8",
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "8",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "meta-llama/Llama-3.1-8B-Instruct": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "env_config": {},
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "meta-llama/Llama-2-7b-chat-hf": {
        "model_id": "meta-llama/Llama-2-7b-chat-hf",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "mistralai/Mistral-7B-Instruct-v0.3": {
        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "bigcode/starcoder2-3b": {
        "model_id": "bigcode/starcoder2-3b",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "google/gemma-7b-it": {
        "model_id": "google/gemma-7b-it",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "Qwen/Qwen2-0.5B-Instruct": {
        "model_id": "Qwen/Qwen2-0.5B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "tiiuae/falcon-7b-instruct": {
        "model_id": "tiiuae/falcon-7b-instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "microsoft/phi-1_5": {
        "model_id": "microsoft/phi-1_5",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "openai-community/gpt2": {
        "model_id": "openai-community/gpt2",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "facebook/opt-125m": {
        "model_id": "facebook/opt-125m",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
        "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "EleutherAI/gpt-j-6b": {
        "model_id": "EleutherAI/gpt-j-6b",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
 }
 # Module-level statement: the entry count is printed whenever this module is imported.
 print(f"Testing {len(TEST_CONFIGS)} models")
@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
def test_config(request) -> Dict[str, Any]:
    """Provide one model configuration per parametrized run.

    The fixture is parametrized over the keys of ``TEST_CONFIGS``;
    ``request.param`` is the key selected for the current run.

    Returns:
        A shallow copy of the selected ``TEST_CONFIGS`` entry with an extra
        ``"test_name"`` key holding the selected key.
    """
    # Build a new dict instead of writing "test_name" into the shared
    # module-level table, so other importers of TEST_CONFIGS never observe
    # entries modified as a side effect of running this fixture.
    return {**TEST_CONFIGS[request.param], "test_name": request.param}
@pytest.fixture(scope="module")
def model_id(test_config):
    """Yield the ``"model_id"`` value of the active test configuration."""
    selected_model = test_config["model_id"]
    yield selected_model
@pytest.fixture(scope="module")
def test_name(test_config):
    """Yield the ``"test_name"`` value of the active test configuration."""
    selected_name = test_config["test_name"]
    yield selected_name
@pytest.fixture(scope="module")
def expected_outputs(test_config):
    """Collect the expected generated texts for the active configuration.

    Returns:
        A dict with two keys: ``"greedy"`` (from ``"expected_greedy_output"``)
        and ``"batch"`` (from ``"expected_batch_output"``).
    """
    greedy_text = test_config["expected_greedy_output"]
    batch_text = test_config["expected_batch_output"]
    # No "sampling" entry is produced; only these two modes are exposed.
    return dict(greedy=greedy_text, batch=batch_text)
@pytest.fixture(scope="module")
def input(test_config):
    """Return the ``"input"`` prompt of the active test configuration."""
    prompt = test_config["input"]
    return prompt
@pytest.fixture(scope="module")
def tgi_service(launcher, model_id, test_name):
    """Yield the object produced by entering ``launcher(model_id, test_name)``.

    The launcher's context manager stays open while dependent tests run and is
    exited when the fixture is finalized.
    """
    service_context = launcher(model_id, test_name)
    with service_context as running_service:
        yield running_service
@pytest.fixture(scope="module")
async def tgi_client(tgi_service) -> AsyncClient:
    """Await the service health check, then return the service's client."""
    service = tgi_service
    # NOTE(review): 1000 is presumably a timeout/retry budget - confirm
    # against the service object's health() signature.
    await service.health(1000)
    client = service.client
    return client
@pytest.mark.asyncio
async def test_model_single_request(
    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
):
    """Send the configured prompt once and compare with the stored greedy text."""
    # Single bounded request: the prompt is sent with a 32-token cap and no
    # sampling parameters.
    result = await tgi_client.generate(input, max_new_tokens=32)

    details = result.details
    assert details.generated_tokens == 32
    assert result.generated_text == expected_outputs["greedy"]
@pytest.mark.asyncio
async def test_model_multiple_requests(
    tgi_client, generate_load, expected_outputs, input
):
    """Issue several identical requests and check every response.

    ``generate_load`` is awaited with the client, the prompt, a token cap and
    the number of requests. The test then checks that one response came back
    per request and that each response has the expected token count and the
    stored ``"batch"`` text.
    """
    num_requests = 4
    max_new_tokens = 32

    responses = await generate_load(
        tgi_client,
        input,
        max_new_tokens=max_new_tokens,
        n=num_requests,
    )

    # Compare against the values actually requested instead of repeating the
    # literals, so changing either setting above cannot leave a stale check.
    assert len(responses) == num_requests
    expected = expected_outputs["batch"]
    for r in responses:
        assert r.details.generated_tokens == max_new_tokens
        assert r.generated_text == expected