This commit is contained in:
Baptiste Colle 2025-04-15 13:23:06 +02:00 committed by GitHub
commit 26bc3c3097
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 315 additions and 314 deletions

View File

@ -129,9 +129,9 @@ jobs:
export label_extension="-gaudi" export label_extension="-gaudi"
export docker_volume="/mnt/cache" export docker_volume="/mnt/cache"
export docker_devices="" export docker_devices=""
export runs_on="ubuntu-latest" export runs_on="aws-dl1-24xlarge"
export platform="" export platform=""
export extra_pytest="" export extra_pytest="--gaudi"
export target="" export target=""
esac esac
echo $dockerfile echo $dockerfile

View File

@ -50,10 +50,9 @@ local-dev-install: install-dependencies
# In order to run the integration tests, you need to first build the image (make -C backends/gaudi image) # In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
run-integration-tests: run-integration-tests:
uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
DOCKER_VOLUME=${root_dir}/data \ DOCKER_VOLUME=${root_dir}/data \
HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \ HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
# This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests # This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
capture-expected-outputs-for-integration-tests: capture-expected-outputs-for-integration-tests:

View File

@ -1,2 +0,0 @@
[pytest]
asyncio_mode = auto

View File

@ -1,7 +0,0 @@
pytest >= 8.3.5
pytest-asyncio >= 0.26.0
docker >= 7.1.0
Levenshtein >= 0.27.1
loguru >= 0.7.3
aiohttp >= 3.11.14
text-generation

View File

@ -1,276 +0,0 @@
from typing import Any, Dict
from text_generation import AsyncClient
import pytest
from Levenshtein import distance as levenshtein_distance
# The "args" values are not tuned for speed; they only check that inference
# works for the different model architectures.
#
# Each entry is keyed by the test name and holds:
#   "model_id"                -> model identifier handed to the launcher fixture
#   "input"                   -> prompt sent by the tests
#   "expected_greedy_output"  -> reference text for the single-request test
#   "expected_batch_output"   -> reference text for the multi-request test
#   "args"                    -> list of command-line flags and values
#                                (not read in this module; presumably consumed
#                                by the launcher fixture -- TODO confirm)
#   "env_config"              -> optional dict, only present on some entries
#                                (presumably extra environment variables --
#                                TODO confirm against the launcher)
TEST_CONFIGS = {
    # NOTE(review): the "-shared" suffix looks like a typo for "-sharded"
    # (the args below enable --sharded); it is a runtime test id, so it is
    # left untouched here.
    # NOTE(review): these expected strings are shorter than the ones used for
    # the unsharded run of the same model below -- confirm they are current.
    "meta-llama/Llama-3.1-8B-Instruct-shared": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use",
        "expected_batch_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use",
        "args": [
            "--sharded",
            "true",
            "--num-shard",
            "8",
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "8",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "meta-llama/Llama-3.1-8B-Instruct": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "expected_batch_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "env_config": {},
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "meta-llama/Llama-2-7b-chat-hf": {
        "model_id": "meta-llama/Llama-2-7b-chat-hf",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "mistralai/Mistral-7B-Instruct-v0.3": {
        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "bigcode/starcoder2-3b": {
        "model_id": "bigcode/starcoder2-3b",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "google/gemma-7b-it": {
        "model_id": "google/gemma-7b-it",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "Qwen/Qwen2-0.5B-Instruct": {
        "model_id": "Qwen/Qwen2-0.5B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    # The remaining entries do not pass --max-batch-prefill-tokens.
    "tiiuae/falcon-7b-instruct": {
        "model_id": "tiiuae/falcon-7b-instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "microsoft/phi-1_5": {
        "model_id": "microsoft/phi-1_5",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "openai-community/gpt2": {
        "model_id": "openai-community/gpt2",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "facebook/opt-125m": {
        "model_id": "facebook/opt-125m",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
        "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "EleutherAI/gpt-j-6b": {
        "model_id": "EleutherAI/gpt-j-6b",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
}
# Runs at import time, so the count is printed whenever this module is loaded.
print(f"Testing {len(TEST_CONFIGS)} models")
@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
def test_config(request) -> Dict[str, Any]:
    """Provide one model configuration per parametrized run.

    The fixture is parametrized over the keys of ``TEST_CONFIGS``; for each
    key it returns that entry's settings plus a ``"test_name"`` item holding
    the key itself.

    Returns:
        A new dict with every item of the selected ``TEST_CONFIGS`` entry and
        an added ``"test_name"`` key.
    """
    # Build a shallow copy instead of writing "test_name" into the shared
    # module-level TEST_CONFIGS entry, so fixture setup has no side effect on
    # other importers of TEST_CONFIGS.
    return {**TEST_CONFIGS[request.param], "test_name": request.param}
@pytest.fixture(scope="module")
def model_id(test_config):
    """Yield the ``"model_id"`` value of the active test configuration."""
    identifier = test_config["model_id"]
    yield identifier
@pytest.fixture(scope="module")
def test_name(test_config):
    """Yield the ``"test_name"`` value of the active test configuration."""
    name = test_config["test_name"]
    yield name
@pytest.fixture(scope="module")
def expected_outputs(test_config):
    """Collect the reference texts of the active configuration.

    Returns:
        A dict with the keys ``"greedy"`` and ``"batch"``, taken from the
        configuration's ``"expected_greedy_output"`` and
        ``"expected_batch_output"`` items.
    """
    greedy_text = test_config["expected_greedy_output"]
    batch_text = test_config["expected_batch_output"]
    # No "sampling" reference is exposed; only these two modes are compared.
    return dict(greedy=greedy_text, batch=batch_text)
@pytest.fixture(scope="module")
def input(test_config):
    """Return the prompt (``"input"`` item) of the active test configuration.

    The fixture name shadows the ``input`` builtin inside this module; it is
    kept because the tests request the fixture by this name.
    """
    prompt = test_config["input"]
    return prompt
@pytest.fixture(scope="module")
def tgi_service(launcher, model_id, test_name):
    """Yield the service object produced by the ``launcher`` fixture.

    ``launcher(model_id, test_name)`` is entered as a context manager and
    stays open while dependent tests run; it is exited at fixture teardown.
    """
    service_context = launcher(model_id, test_name)
    with service_context as running_service:
        yield running_service
@pytest.fixture(scope="module")
async def tgi_client(tgi_service) -> AsyncClient:
    """Await the service health check, then return the service's client.

    The argument 1000 is passed straight to ``tgi_service.health``
    (presumably a timeout or retry budget -- TODO confirm against the
    launcher implementation).
    """
    await tgi_service.health(1000)
    client = tgi_service.client
    return client
@pytest.mark.asyncio
async def test_model_single_request(
    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
):
    """Send the prompt once and compare against the ``"greedy"`` reference.

    The request is capped at 32 new tokens; the test requires exactly that
    many generated tokens and an exact match of the generated text.
    """
    token_budget = 32
    result = await tgi_client.generate(input, max_new_tokens=token_budget)

    generated_count = result.details.generated_tokens
    assert generated_count == token_budget
    assert result.generated_text == expected_outputs["greedy"]
@pytest.mark.asyncio
async def test_model_multiple_requests(
    tgi_client, generate_load, expected_outputs, input
):
    """Send the prompt four times via ``generate_load`` and check each reply.

    Every reply must contain exactly 32 generated tokens, and its text must
    be within a Levenshtein distance of two from the ``"batch"`` reference.
    """
    num_requests = 4
    token_budget = 32
    # At most two single-character edits away from the reference (i.e. < 3).
    max_edit_distance = 2

    batch = await generate_load(
        tgi_client, input, max_new_tokens=token_budget, n=num_requests
    )
    assert len(batch) == num_requests

    reference = expected_outputs["batch"]
    for reply in batch:
        assert reply.details.generated_tokens == token_budget
        edits = levenshtein_distance(reply.generated_text, reference)
        assert edits <= max_edit_distance

View File

@ -1,4 +1,8 @@
pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"] pytest_plugins = [
"fixtures.neuron.service",
"fixtures.neuron.export_models",
"fixtures.gaudi.service",
]
# ruff: noqa: E402 # ruff: noqa: E402
from _pytest.fixtures import SubRequest from _pytest.fixtures import SubRequest
from huggingface_hub.inference._generated.types.chat_completion import ( from huggingface_hub.inference._generated.types.chat_completion import (
@ -47,7 +51,6 @@ from text_generation.types import (
ChatComplete, ChatComplete,
ChatCompletionChunk, ChatCompletionChunk,
ChatCompletionComplete, ChatCompletionComplete,
Completion,
Details, Details,
Grammar, Grammar,
InputToken, InputToken,
@ -68,6 +71,9 @@ def pytest_addoption(parser):
parser.addoption( parser.addoption(
"--neuron", action="store_true", default=False, help="run neuron tests" "--neuron", action="store_true", default=False, help="run neuron tests"
) )
parser.addoption(
"--gaudi", action="store_true", default=False, help="run gaudi tests"
)
def pytest_configure(config): def pytest_configure(config):
@ -84,6 +90,22 @@ def pytest_collection_modifyitems(config, items):
item.add_marker(pytest.mark.skip(reason="need --release option to run")) item.add_marker(pytest.mark.skip(reason="need --release option to run"))
selectors.append(skip_release) selectors.append(skip_release)
if config.getoption("--gaudi"):
def skip_not_gaudi(item):
if "gaudi" not in item.keywords:
item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
selectors.append(skip_not_gaudi)
else:
def skip_gaudi(item):
if "gaudi" in item.keywords:
item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
selectors.append(skip_gaudi)
if config.getoption("--neuron"): if config.getoption("--neuron"):
def skip_not_neuron(item): def skip_not_neuron(item):
@ -100,6 +122,7 @@ def pytest_collection_modifyitems(config, items):
item.add_marker(pytest.mark.skip(reason="requires --neuron to run")) item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
selectors.append(skip_neuron) selectors.append(skip_neuron)
for item in items: for item in items:
for selector in selectors: for selector in selectors:
selector(item) selector(item)
@ -131,7 +154,6 @@ class ResponseComparator(JSONSnapshotExtension):
or isinstance(data, ChatComplete) or isinstance(data, ChatComplete)
or isinstance(data, ChatCompletionChunk) or isinstance(data, ChatCompletionChunk)
or isinstance(data, ChatCompletionComplete) or isinstance(data, ChatCompletionComplete)
or isinstance(data, Completion)
or isinstance(data, OAIChatCompletionChunk) or isinstance(data, OAIChatCompletionChunk)
or isinstance(data, OAICompletion) or isinstance(data, OAICompletion)
): ):
@ -188,8 +210,6 @@ class ResponseComparator(JSONSnapshotExtension):
if isinstance(choices, List) and len(choices) >= 1: if isinstance(choices, List) and len(choices) >= 1:
if "delta" in choices[0]: if "delta" in choices[0]:
return ChatCompletionChunk(**data) return ChatCompletionChunk(**data)
if "text" in choices[0]:
return Completion(**data)
return ChatComplete(**data) return ChatComplete(**data)
else: else:
return Response(**data) return Response(**data)
@ -282,9 +302,6 @@ class ResponseComparator(JSONSnapshotExtension):
) )
) )
def eq_completion(response: Completion, other: Completion) -> bool:
return response.choices[0].text == other.choices[0].text
def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool: def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
return ( return (
response.choices[0].message.content == other.choices[0].message.content response.choices[0].message.content == other.choices[0].message.content
@ -329,11 +346,6 @@ class ResponseComparator(JSONSnapshotExtension):
if len(serialized_data) == 0: if len(serialized_data) == 0:
return len(snapshot_data) == len(serialized_data) return len(snapshot_data) == len(serialized_data)
if isinstance(serialized_data[0], Completion):
return len(snapshot_data) == len(serialized_data) and all(
[eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
)
if isinstance(serialized_data[0], ChatComplete): if isinstance(serialized_data[0], ChatComplete):
return len(snapshot_data) == len(serialized_data) and all( return len(snapshot_data) == len(serialized_data) and all(
[eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)] [eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]

View File

@ -14,15 +14,23 @@ import docker
import pytest import pytest
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
from docker.errors import NotFound from docker.errors import NotFound
from loguru import logger import logging
from test_model import TEST_CONFIGS from gaudi.test_gaudi_generate import TEST_CONFIGS
from text_generation import AsyncClient from text_generation import AsyncClient
from text_generation.types import Response from text_generation.types import Response
import huggingface_hub
# Configure stdlib logging for this fixture module.
# The previous template used loguru markup ("<green>{time:...}</green>", ...),
# which the stdlib does not understand: with the default "%" style it holds
# no "%(...)s" fields, so Formatter validation rejects it on Python >= 3.8.
# The format below carries the same information (timestamp, level, logger
# name, function, line, message) in stdlib syntax.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(name)s:%(funcName)s:%(lineno)d - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    stream=sys.stdout,
)
# Name the logger after the module rather than its file path, following the
# convention recommended in the logging documentation.
logger = logging.getLogger(__name__)
# Use the latest image from the local docker build # Use the latest image from the local docker build
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi") DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None) DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
HF_TOKEN = os.getenv("HF_TOKEN", None) HF_TOKEN = huggingface_hub.get_token()
assert ( assert (
HF_TOKEN is not None HF_TOKEN is not None
@ -48,12 +56,6 @@ HABANA_RUN_ARGS = {
"cap_add": ["sys_nice"], "cap_add": ["sys_nice"],
} }
logger.add(
sys.stderr,
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
level="INFO",
)
def stream_container_logs(container, test_name): def stream_container_logs(container, test_name):
"""Stream container logs in a separate thread.""" """Stream container logs in a separate thread."""
@ -151,7 +153,7 @@ def data_volume():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def launcher(data_volume): def gaudi_launcher(event_loop):
@contextlib.contextmanager @contextlib.contextmanager
def docker_launcher( def docker_launcher(
model_id: str, model_id: str,
@ -271,7 +273,7 @@ def launcher(data_volume):
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def generate_load(): def gaudi_generate_load():
async def generate_load_inner( async def generate_load_inner(
client: AsyncClient, prompt: str, max_new_tokens: int, n: int client: AsyncClient, prompt: str, max_new_tokens: int, n: int
) -> List[Response]: ) -> List[Response]:

View File

@ -3,7 +3,7 @@ import os
from typing import Dict, Any, Generator from typing import Dict, Any, Generator
import pytest import pytest
from test_model import TEST_CONFIGS from test_generate import TEST_CONFIGS
UNKNOWN_CONFIGS = { UNKNOWN_CONFIGS = {
name: config name: config

View File

@ -0,0 +1,273 @@
from typing import Any, Dict
from text_generation import AsyncClient
import pytest
# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
#
# Each entry is keyed by the test name and holds:
#   "model_id"                -> model identifier handed to the launcher fixture
#   "input"                   -> prompt sent by the tests
#   "expected_greedy_output"  -> reference text for the single-request test
#   "expected_batch_output"   -> reference text for the multi-request test
#   "args"                    -> list of command-line flags and values
#                                (not read in this module; presumably consumed
#                                by the launcher fixture -- TODO confirm)
#   "env_config"              -> optional dict (presumably extra environment
#                                variables -- TODO confirm against the launcher)
#
# NOTE(review): only "meta-llama/Llama-3.1-8B-Instruct" is active; every other
# entry is commented out (presumably to limit CI run time -- TODO confirm).
TEST_CONFIGS = {
    # "meta-llama/Llama-3.1-8B-Instruct-shared": {
    #     "model_id": "meta-llama/Llama-3.1-8B-Instruct",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
    #     "expected_batch_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
    #     "args": [
    #         "--sharded",
    #         "true",
    #         "--num-shard",
    #         "8",
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "8",
    #         "--max-batch-prefill-tokens",
    #         "2048",
    #     ],
    # },
    "meta-llama/Llama-3.1-8B-Instruct": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "expected_batch_output": " A Beginners Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "env_config": {},
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    # "meta-llama/Llama-2-7b-chat-hf": {
    #     "model_id": "meta-llama/Llama-2-7b-chat-hf",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
    #     "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #         "--max-batch-prefill-tokens",
    #         "2048",
    #     ],
    # },
    # "mistralai/Mistral-7B-Instruct-v0.3": {
    #     "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #         "--max-batch-prefill-tokens",
    #         "2048",
    #     ],
    # },
    # "bigcode/starcoder2-3b": {
    #     "model_id": "bigcode/starcoder2-3b",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #         "--max-batch-prefill-tokens",
    #         "2048",
    #     ],
    # },
    # "google/gemma-7b-it": {
    #     "model_id": "google/gemma-7b-it",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #         "--max-batch-prefill-tokens",
    #         "2048",
    #     ],
    # },
    # "Qwen/Qwen2-0.5B-Instruct": {
    #     "model_id": "Qwen/Qwen2-0.5B-Instruct",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
    #     "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #         "--max-batch-prefill-tokens",
    #         "2048",
    #     ],
    # },
    # "tiiuae/falcon-7b-instruct": {
    #     "model_id": "tiiuae/falcon-7b-instruct",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
    #     "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #     ],
    # },
    # "microsoft/phi-1_5": {
    #     "model_id": "microsoft/phi-1_5",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
    #     "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #     ],
    # },
    # "openai-community/gpt2": {
    #     "model_id": "openai-community/gpt2",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
    #     "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #     ],
    # },
    # "facebook/opt-125m": {
    #     "model_id": "facebook/opt-125m",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
    #     "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #     ],
    # },
    # "EleutherAI/gpt-j-6b": {
    #     "model_id": "EleutherAI/gpt-j-6b",
    #     "input": "What is Deep Learning?",
    #     "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
    #     "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
    #     "args": [
    #         "--max-input-tokens",
    #         "512",
    #         "--max-total-tokens",
    #         "1024",
    #         "--max-batch-size",
    #         "4",
    #     ],
    # },
}
# Runs at import time, so the count is printed whenever this module is loaded.
print(f"Testing {len(TEST_CONFIGS)} models")
@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
def test_config(request) -> Dict[str, Any]:
    """Provide one model configuration per parametrized run.

    The fixture is parametrized over the keys of ``TEST_CONFIGS``; for each
    key it returns that entry's settings plus a ``"test_name"`` item holding
    the key itself.

    Returns:
        A new dict with every item of the selected ``TEST_CONFIGS`` entry and
        an added ``"test_name"`` key.
    """
    # Build a shallow copy instead of writing "test_name" into the shared
    # module-level TEST_CONFIGS entry, which other modules import as well.
    return {**TEST_CONFIGS[request.param], "test_name": request.param}
@pytest.fixture(scope="module")
def model_id(test_config):
    """Yield the ``"model_id"`` value of the active test configuration."""
    identifier = test_config["model_id"]
    yield identifier
@pytest.fixture(scope="module")
def test_name(test_config):
    """Yield the ``"test_name"`` value of the active test configuration."""
    name = test_config["test_name"]
    yield name
@pytest.fixture(scope="module")
def expected_outputs(test_config):
    """Collect the reference texts of the active configuration.

    Returns:
        A dict with the keys ``"greedy"`` and ``"batch"``, taken from the
        configuration's ``"expected_greedy_output"`` and
        ``"expected_batch_output"`` items.
    """
    greedy_text = test_config["expected_greedy_output"]
    batch_text = test_config["expected_batch_output"]
    # No "sampling" reference is exposed; only these two modes are compared.
    return dict(greedy=greedy_text, batch=batch_text)
@pytest.fixture(scope="module")
def input(test_config):
    """Return the prompt (``"input"`` item) of the active test configuration.

    The fixture name shadows the ``input`` builtin inside this module; it is
    kept because the tests request the fixture by this name.
    """
    prompt = test_config["input"]
    return prompt
@pytest.fixture(scope="module")
def tgi_service(gaudi_launcher, model_id, test_name):
    """Yield the service object produced by the ``gaudi_launcher`` fixture.

    ``gaudi_launcher(model_id, test_name)`` is entered as a context manager
    and stays open while dependent tests run; it is exited at fixture
    teardown.
    """
    service_context = gaudi_launcher(model_id, test_name)
    with service_context as running_service:
        yield running_service
@pytest.fixture(scope="module")
async def tgi_client(tgi_service) -> AsyncClient:
    """Await the service health check, then return the service's client.

    The argument 1000 is passed straight to ``tgi_service.health``
    (presumably a timeout or retry budget -- TODO confirm against the
    launcher implementation).
    """
    await tgi_service.health(1000)
    client = tgi_service.client
    return client
@pytest.mark.asyncio
async def test_model_single_request(
    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
):
    """Send the prompt once and compare against the ``"greedy"`` reference.

    The request is capped at 32 new tokens; the test requires exactly that
    many generated tokens and an exact match of the generated text.
    """
    token_budget = 32
    result = await tgi_client.generate(input, max_new_tokens=token_budget)

    generated_count = result.details.generated_tokens
    assert generated_count == token_budget
    assert result.generated_text == expected_outputs["greedy"]
@pytest.mark.asyncio
async def test_model_multiple_requests(
    tgi_client, gaudi_generate_load, expected_outputs, input
):
    """Send the prompt four times via ``gaudi_generate_load`` and check each reply.

    Every reply must contain exactly 32 generated tokens and its text must
    equal the ``"batch"`` reference exactly.
    """
    num_requests = 4
    token_budget = 32

    batch = await gaudi_generate_load(
        tgi_client, input, max_new_tokens=token_budget, n=num_requests
    )
    assert len(batch) == num_requests

    reference = expected_outputs["batch"]
    for reply in batch:
        assert reply.details.generated_tokens == token_budget
        assert reply.generated_text == reference