mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-19 22:02:06 +00:00
Merge dd187d2350
into 4645678ff0
This commit is contained in:
commit
26bc3c3097
4
.github/workflows/build.yaml
vendored
4
.github/workflows/build.yaml
vendored
@ -129,9 +129,9 @@ jobs:
|
|||||||
export label_extension="-gaudi"
|
export label_extension="-gaudi"
|
||||||
export docker_volume="/mnt/cache"
|
export docker_volume="/mnt/cache"
|
||||||
export docker_devices=""
|
export docker_devices=""
|
||||||
export runs_on="ubuntu-latest"
|
export runs_on="aws-dl1-24xlarge"
|
||||||
export platform=""
|
export platform=""
|
||||||
export extra_pytest=""
|
export extra_pytest="--gaudi"
|
||||||
export target=""
|
export target=""
|
||||||
esac
|
esac
|
||||||
echo $dockerfile
|
echo $dockerfile
|
||||||
|
@ -50,10 +50,9 @@ local-dev-install: install-dependencies
|
|||||||
|
|
||||||
# In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
|
# In order to run the integration tests, you need to first build the image (make -C backends/gaudi image)
|
||||||
run-integration-tests:
|
run-integration-tests:
|
||||||
uv pip install -r ${root_dir}/backends/gaudi/server/integration-tests/requirements.txt
|
|
||||||
DOCKER_VOLUME=${root_dir}/data \
|
DOCKER_VOLUME=${root_dir}/data \
|
||||||
HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
|
HF_TOKEN=`cat ${HOME}/.cache/huggingface/token` \
|
||||||
uv run pytest --durations=0 -sv ${root_dir}/backends/gaudi/server/integration-tests
|
pytest --durations=0 -s -vv ${root_dir}/integration-tests --gaudi
|
||||||
|
|
||||||
# This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
|
# This is used to capture the expected outputs for the integration tests offering an easy way to add more models to the integration tests
|
||||||
capture-expected-outputs-for-integration-tests:
|
capture-expected-outputs-for-integration-tests:
|
||||||
|
@ -1,2 +0,0 @@
|
|||||||
[pytest]
|
|
||||||
asyncio_mode = auto
|
|
@ -1,7 +0,0 @@
|
|||||||
pytest >= 8.3.5
|
|
||||||
pytest-asyncio >= 0.26.0
|
|
||||||
docker >= 7.1.0
|
|
||||||
Levenshtein >= 0.27.1
|
|
||||||
loguru >= 0.7.3
|
|
||||||
aiohttp >= 3.11.14
|
|
||||||
text-generation
|
|
@ -1,276 +0,0 @@
|
|||||||
from typing import Any, Dict
|
|
||||||
|
|
||||||
from text_generation import AsyncClient
|
|
||||||
import pytest
|
|
||||||
from Levenshtein import distance as levenshtein_distance
|
|
||||||
|
|
||||||
# The "args" config is not optimized for speed but only check that the inference is working for the different models architectures
|
|
||||||
TEST_CONFIGS = {
|
|
||||||
"meta-llama/Llama-3.1-8B-Instruct-shared": {
|
|
||||||
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
|
|
||||||
"expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
|
|
||||||
"args": [
|
|
||||||
"--sharded",
|
|
||||||
"true",
|
|
||||||
"--num-shard",
|
|
||||||
"8",
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"8",
|
|
||||||
"--max-batch-prefill-tokens",
|
|
||||||
"2048",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"meta-llama/Llama-3.1-8B-Instruct": {
|
|
||||||
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
|
|
||||||
"expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
|
|
||||||
"env_config": {},
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
"--max-batch-prefill-tokens",
|
|
||||||
"2048",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"meta-llama/Llama-2-7b-chat-hf": {
|
|
||||||
"model_id": "meta-llama/Llama-2-7b-chat-hf",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
|
|
||||||
"expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
"--max-batch-prefill-tokens",
|
|
||||||
"2048",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"mistralai/Mistral-7B-Instruct-v0.3": {
|
|
||||||
"model_id": "mistralai/Mistral-7B-Instruct-v0.3",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
|
|
||||||
"expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
"--max-batch-prefill-tokens",
|
|
||||||
"2048",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"bigcode/starcoder2-3b": {
|
|
||||||
"model_id": "bigcode/starcoder2-3b",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
|
|
||||||
"expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
"--max-batch-prefill-tokens",
|
|
||||||
"2048",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"google/gemma-7b-it": {
|
|
||||||
"model_id": "google/gemma-7b-it",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
|
|
||||||
"expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
"--max-batch-prefill-tokens",
|
|
||||||
"2048",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"Qwen/Qwen2-0.5B-Instruct": {
|
|
||||||
"model_id": "Qwen/Qwen2-0.5B-Instruct",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
|
|
||||||
"expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
"--max-batch-prefill-tokens",
|
|
||||||
"2048",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"tiiuae/falcon-7b-instruct": {
|
|
||||||
"model_id": "tiiuae/falcon-7b-instruct",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
|
|
||||||
"expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"microsoft/phi-1_5": {
|
|
||||||
"model_id": "microsoft/phi-1_5",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
|
|
||||||
"expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"openai-community/gpt2": {
|
|
||||||
"model_id": "openai-community/gpt2",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
|
|
||||||
"expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"facebook/opt-125m": {
|
|
||||||
"model_id": "facebook/opt-125m",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
|
|
||||||
"expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
"EleutherAI/gpt-j-6b": {
|
|
||||||
"model_id": "EleutherAI/gpt-j-6b",
|
|
||||||
"input": "What is Deep Learning?",
|
|
||||||
"expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
|
|
||||||
"expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
|
|
||||||
"args": [
|
|
||||||
"--max-input-tokens",
|
|
||||||
"512",
|
|
||||||
"--max-total-tokens",
|
|
||||||
"1024",
|
|
||||||
"--max-batch-size",
|
|
||||||
"4",
|
|
||||||
],
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
print(f"Testing {len(TEST_CONFIGS)} models")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
|
|
||||||
def test_config(request) -> Dict[str, Any]:
|
|
||||||
"""Fixture that provides model configurations for testing."""
|
|
||||||
test_config = TEST_CONFIGS[request.param]
|
|
||||||
test_config["test_name"] = request.param
|
|
||||||
return test_config
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def model_id(test_config):
|
|
||||||
yield test_config["model_id"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def test_name(test_config):
|
|
||||||
yield test_config["test_name"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def expected_outputs(test_config):
|
|
||||||
return {
|
|
||||||
"greedy": test_config["expected_greedy_output"],
|
|
||||||
# "sampling": model_config["expected_sampling_output"],
|
|
||||||
"batch": test_config["expected_batch_output"],
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def input(test_config):
|
|
||||||
return test_config["input"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
def tgi_service(launcher, model_id, test_name):
|
|
||||||
with launcher(model_id, test_name) as tgi_service:
|
|
||||||
yield tgi_service
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
|
||||||
async def tgi_client(tgi_service) -> AsyncClient:
|
|
||||||
await tgi_service.health(1000)
|
|
||||||
return tgi_service.client
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_model_single_request(
|
|
||||||
tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
|
|
||||||
):
|
|
||||||
# Bounded greedy decoding without input
|
|
||||||
response = await tgi_client.generate(
|
|
||||||
input,
|
|
||||||
max_new_tokens=32,
|
|
||||||
)
|
|
||||||
assert response.details.generated_tokens == 32
|
|
||||||
assert response.generated_text == expected_outputs["greedy"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_model_multiple_requests(
|
|
||||||
tgi_client, generate_load, expected_outputs, input
|
|
||||||
):
|
|
||||||
num_requests = 4
|
|
||||||
responses = await generate_load(
|
|
||||||
tgi_client,
|
|
||||||
input,
|
|
||||||
max_new_tokens=32,
|
|
||||||
n=num_requests,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert len(responses) == 4
|
|
||||||
expected = expected_outputs["batch"]
|
|
||||||
for r in responses:
|
|
||||||
assert r.details.generated_tokens == 32
|
|
||||||
# Compute the similarity with the expectation using the levenshtein distance
|
|
||||||
# We should not have more than two substitutions or additions
|
|
||||||
assert levenshtein_distance(r.generated_text, expected) < 3
|
|
@ -1,4 +1,8 @@
|
|||||||
pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
|
pytest_plugins = [
|
||||||
|
"fixtures.neuron.service",
|
||||||
|
"fixtures.neuron.export_models",
|
||||||
|
"fixtures.gaudi.service",
|
||||||
|
]
|
||||||
# ruff: noqa: E402
|
# ruff: noqa: E402
|
||||||
from _pytest.fixtures import SubRequest
|
from _pytest.fixtures import SubRequest
|
||||||
from huggingface_hub.inference._generated.types.chat_completion import (
|
from huggingface_hub.inference._generated.types.chat_completion import (
|
||||||
@ -47,7 +51,6 @@ from text_generation.types import (
|
|||||||
ChatComplete,
|
ChatComplete,
|
||||||
ChatCompletionChunk,
|
ChatCompletionChunk,
|
||||||
ChatCompletionComplete,
|
ChatCompletionComplete,
|
||||||
Completion,
|
|
||||||
Details,
|
Details,
|
||||||
Grammar,
|
Grammar,
|
||||||
InputToken,
|
InputToken,
|
||||||
@ -68,6 +71,9 @@ def pytest_addoption(parser):
|
|||||||
parser.addoption(
|
parser.addoption(
|
||||||
"--neuron", action="store_true", default=False, help="run neuron tests"
|
"--neuron", action="store_true", default=False, help="run neuron tests"
|
||||||
)
|
)
|
||||||
|
parser.addoption(
|
||||||
|
"--gaudi", action="store_true", default=False, help="run gaudi tests"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def pytest_configure(config):
|
def pytest_configure(config):
|
||||||
@ -84,6 +90,22 @@ def pytest_collection_modifyitems(config, items):
|
|||||||
item.add_marker(pytest.mark.skip(reason="need --release option to run"))
|
item.add_marker(pytest.mark.skip(reason="need --release option to run"))
|
||||||
|
|
||||||
selectors.append(skip_release)
|
selectors.append(skip_release)
|
||||||
|
|
||||||
|
if config.getoption("--gaudi"):
|
||||||
|
|
||||||
|
def skip_not_gaudi(item):
|
||||||
|
if "gaudi" not in item.keywords:
|
||||||
|
item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
|
||||||
|
|
||||||
|
selectors.append(skip_not_gaudi)
|
||||||
|
else:
|
||||||
|
|
||||||
|
def skip_gaudi(item):
|
||||||
|
if "gaudi" in item.keywords:
|
||||||
|
item.add_marker(pytest.mark.skip(reason="requires --gaudi to run"))
|
||||||
|
|
||||||
|
selectors.append(skip_gaudi)
|
||||||
|
|
||||||
if config.getoption("--neuron"):
|
if config.getoption("--neuron"):
|
||||||
|
|
||||||
def skip_not_neuron(item):
|
def skip_not_neuron(item):
|
||||||
@ -100,6 +122,7 @@ def pytest_collection_modifyitems(config, items):
|
|||||||
item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
|
item.add_marker(pytest.mark.skip(reason="requires --neuron to run"))
|
||||||
|
|
||||||
selectors.append(skip_neuron)
|
selectors.append(skip_neuron)
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
for selector in selectors:
|
for selector in selectors:
|
||||||
selector(item)
|
selector(item)
|
||||||
@ -131,7 +154,6 @@ class ResponseComparator(JSONSnapshotExtension):
|
|||||||
or isinstance(data, ChatComplete)
|
or isinstance(data, ChatComplete)
|
||||||
or isinstance(data, ChatCompletionChunk)
|
or isinstance(data, ChatCompletionChunk)
|
||||||
or isinstance(data, ChatCompletionComplete)
|
or isinstance(data, ChatCompletionComplete)
|
||||||
or isinstance(data, Completion)
|
|
||||||
or isinstance(data, OAIChatCompletionChunk)
|
or isinstance(data, OAIChatCompletionChunk)
|
||||||
or isinstance(data, OAICompletion)
|
or isinstance(data, OAICompletion)
|
||||||
):
|
):
|
||||||
@ -188,8 +210,6 @@ class ResponseComparator(JSONSnapshotExtension):
|
|||||||
if isinstance(choices, List) and len(choices) >= 1:
|
if isinstance(choices, List) and len(choices) >= 1:
|
||||||
if "delta" in choices[0]:
|
if "delta" in choices[0]:
|
||||||
return ChatCompletionChunk(**data)
|
return ChatCompletionChunk(**data)
|
||||||
if "text" in choices[0]:
|
|
||||||
return Completion(**data)
|
|
||||||
return ChatComplete(**data)
|
return ChatComplete(**data)
|
||||||
else:
|
else:
|
||||||
return Response(**data)
|
return Response(**data)
|
||||||
@ -282,9 +302,6 @@ class ResponseComparator(JSONSnapshotExtension):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def eq_completion(response: Completion, other: Completion) -> bool:
|
|
||||||
return response.choices[0].text == other.choices[0].text
|
|
||||||
|
|
||||||
def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
|
def eq_chat_complete(response: ChatComplete, other: ChatComplete) -> bool:
|
||||||
return (
|
return (
|
||||||
response.choices[0].message.content == other.choices[0].message.content
|
response.choices[0].message.content == other.choices[0].message.content
|
||||||
@ -329,11 +346,6 @@ class ResponseComparator(JSONSnapshotExtension):
|
|||||||
if len(serialized_data) == 0:
|
if len(serialized_data) == 0:
|
||||||
return len(snapshot_data) == len(serialized_data)
|
return len(snapshot_data) == len(serialized_data)
|
||||||
|
|
||||||
if isinstance(serialized_data[0], Completion):
|
|
||||||
return len(snapshot_data) == len(serialized_data) and all(
|
|
||||||
[eq_completion(r, o) for r, o in zip(serialized_data, snapshot_data)]
|
|
||||||
)
|
|
||||||
|
|
||||||
if isinstance(serialized_data[0], ChatComplete):
|
if isinstance(serialized_data[0], ChatComplete):
|
||||||
return len(snapshot_data) == len(serialized_data) and all(
|
return len(snapshot_data) == len(serialized_data) and all(
|
||||||
[eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
|
[eq_chat_complete(r, o) for r, o in zip(serialized_data, snapshot_data)]
|
||||||
|
@ -14,15 +14,23 @@ import docker
|
|||||||
import pytest
|
import pytest
|
||||||
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
|
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
|
||||||
from docker.errors import NotFound
|
from docker.errors import NotFound
|
||||||
from loguru import logger
|
import logging
|
||||||
from test_model import TEST_CONFIGS
|
from gaudi.test_gaudi_generate import TEST_CONFIGS
|
||||||
from text_generation import AsyncClient
|
from text_generation import AsyncClient
|
||||||
from text_generation.types import Response
|
from text_generation.types import Response
|
||||||
|
import huggingface_hub
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
|
||||||
|
stream=sys.stdout,
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__file__)
|
||||||
|
|
||||||
# Use the latest image from the local docker build
|
# Use the latest image from the local docker build
|
||||||
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
|
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "tgi-gaudi")
|
||||||
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
|
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", None)
|
||||||
HF_TOKEN = os.getenv("HF_TOKEN", None)
|
HF_TOKEN = huggingface_hub.get_token()
|
||||||
|
|
||||||
assert (
|
assert (
|
||||||
HF_TOKEN is not None
|
HF_TOKEN is not None
|
||||||
@ -48,12 +56,6 @@ HABANA_RUN_ARGS = {
|
|||||||
"cap_add": ["sys_nice"],
|
"cap_add": ["sys_nice"],
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.add(
|
|
||||||
sys.stderr,
|
|
||||||
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
|
|
||||||
level="INFO",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def stream_container_logs(container, test_name):
|
def stream_container_logs(container, test_name):
|
||||||
"""Stream container logs in a separate thread."""
|
"""Stream container logs in a separate thread."""
|
||||||
@ -151,7 +153,7 @@ def data_volume():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def launcher(data_volume):
|
def gaudi_launcher(event_loop):
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def docker_launcher(
|
def docker_launcher(
|
||||||
model_id: str,
|
model_id: str,
|
||||||
@ -271,7 +273,7 @@ def launcher(data_volume):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def generate_load():
|
def gaudi_generate_load():
|
||||||
async def generate_load_inner(
|
async def generate_load_inner(
|
||||||
client: AsyncClient, prompt: str, max_new_tokens: int, n: int
|
client: AsyncClient, prompt: str, max_new_tokens: int, n: int
|
||||||
) -> List[Response]:
|
) -> List[Response]:
|
@ -3,7 +3,7 @@ import os
|
|||||||
from typing import Dict, Any, Generator
|
from typing import Dict, Any, Generator
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from test_model import TEST_CONFIGS
|
from test_generate import TEST_CONFIGS
|
||||||
|
|
||||||
UNKNOWN_CONFIGS = {
|
UNKNOWN_CONFIGS = {
|
||||||
name: config
|
name: config
|
273
integration-tests/gaudi/test_gaudi_generate.py
Normal file
273
integration-tests/gaudi/test_gaudi_generate.py
Normal file
@ -0,0 +1,273 @@
|
|||||||
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
from text_generation import AsyncClient
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# The "args" values in TEST_CONFIGS are not optimized for speed but only check that the inference is working for the different models architectures.
|
||||||
|
TEST_CONFIGS = {
|
||||||
|
# "meta-llama/Llama-3.1-8B-Instruct-shared": {
|
||||||
|
# "model_id": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
|
||||||
|
# "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
|
||||||
|
# "args": [
|
||||||
|
# "--sharded",
|
||||||
|
# "true",
|
||||||
|
# "--num-shard",
|
||||||
|
# "8",
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "8",
|
||||||
|
# "--max-batch-prefill-tokens",
|
||||||
|
# "2048",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
"meta-llama/Llama-3.1-8B-Instruct": {
|
||||||
|
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"input": "What is Deep Learning?",
|
||||||
|
"expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
|
||||||
|
"expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
|
||||||
|
"env_config": {},
|
||||||
|
"args": [
|
||||||
|
"--max-input-tokens",
|
||||||
|
"512",
|
||||||
|
"--max-total-tokens",
|
||||||
|
"1024",
|
||||||
|
"--max-batch-size",
|
||||||
|
"4",
|
||||||
|
"--max-batch-prefill-tokens",
|
||||||
|
"2048",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
# "meta-llama/Llama-2-7b-chat-hf": {
|
||||||
|
# "model_id": "meta-llama/Llama-2-7b-chat-hf",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
|
||||||
|
# "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# "--max-batch-prefill-tokens",
|
||||||
|
# "2048",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "mistralai/Mistral-7B-Instruct-v0.3": {
|
||||||
|
# "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
|
||||||
|
# "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# "--max-batch-prefill-tokens",
|
||||||
|
# "2048",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "bigcode/starcoder2-3b": {
|
||||||
|
# "model_id": "bigcode/starcoder2-3b",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
|
||||||
|
# "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# "--max-batch-prefill-tokens",
|
||||||
|
# "2048",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "google/gemma-7b-it": {
|
||||||
|
# "model_id": "google/gemma-7b-it",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
|
||||||
|
# "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# "--max-batch-prefill-tokens",
|
||||||
|
# "2048",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "Qwen/Qwen2-0.5B-Instruct": {
|
||||||
|
# "model_id": "Qwen/Qwen2-0.5B-Instruct",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
|
||||||
|
# "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# "--max-batch-prefill-tokens",
|
||||||
|
# "2048",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "tiiuae/falcon-7b-instruct": {
|
||||||
|
# "model_id": "tiiuae/falcon-7b-instruct",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
|
||||||
|
# "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "microsoft/phi-1_5": {
|
||||||
|
# "model_id": "microsoft/phi-1_5",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
|
||||||
|
# "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "openai-community/gpt2": {
|
||||||
|
# "model_id": "openai-community/gpt2",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
|
||||||
|
# "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "facebook/opt-125m": {
|
||||||
|
# "model_id": "facebook/opt-125m",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
|
||||||
|
# "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
# "EleutherAI/gpt-j-6b": {
|
||||||
|
# "model_id": "EleutherAI/gpt-j-6b",
|
||||||
|
# "input": "What is Deep Learning?",
|
||||||
|
# "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
|
||||||
|
# "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
|
||||||
|
# "args": [
|
||||||
|
# "--max-input-tokens",
|
||||||
|
# "512",
|
||||||
|
# "--max-total-tokens",
|
||||||
|
# "1024",
|
||||||
|
# "--max-batch-size",
|
||||||
|
# "4",
|
||||||
|
# ],
|
||||||
|
# },
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f"Testing {len(TEST_CONFIGS)} models")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module", params=TEST_CONFIGS.keys())
def test_config(request) -> Dict[str, Any]:
    """Fixture that provides model configurations for testing.

    The fixture is parametrized over the keys of ``TEST_CONFIGS``; each
    parameter selects one entry of that mapping.

    Args:
        request: The pytest fixture request; ``request.param`` is the
            ``TEST_CONFIGS`` key currently under test.

    Returns:
        A new dict holding the selected entry's items plus a ``"test_name"``
        key set to the selected key.
    """
    # Build a fresh dict rather than writing "test_name" into the shared
    # module-level TEST_CONFIGS entry, so the fixture leaves global state
    # untouched. The copy is shallow: nested values are still shared.
    return {**TEST_CONFIGS[request.param], "test_name": request.param}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def model_id(test_config):
    """Yield the ``"model_id"`` entry of the configuration under test."""
    identifier = test_config["model_id"]
    yield identifier
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def test_name(test_config):
    """Yield the ``"test_name"`` entry of the configuration under test."""
    name = test_config["test_name"]
    yield name
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def expected_outputs(test_config):
    """Collect the reference generations for the configuration under test.

    Returns:
        A dict with a ``"greedy"`` key (from ``"expected_greedy_output"``)
        and a ``"batch"`` key (from ``"expected_batch_output"``).
    """
    greedy_text = test_config["expected_greedy_output"]
    batch_text = test_config["expected_batch_output"]
    # NOTE: no "sampling" entry is produced; only the greedy and batch
    # reference outputs are exposed to the tests.
    return {"greedy": greedy_text, "batch": batch_text}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def input(test_config):
    """Return the ``"input"`` prompt of the configuration under test.

    The fixture name shadows the ``input`` builtin inside this module; it is
    kept because the tests request the fixture under this name.
    """
    prompt = test_config["input"]
    return prompt
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def tgi_service(gaudi_launcher, model_id, test_name):
    """Yield the service produced by ``gaudi_launcher`` for this model.

    ``gaudi_launcher(model_id, test_name)`` is used as a context manager, so
    its exit logic runs when this module-scoped fixture is torn down.
    """
    launcher_context = gaudi_launcher(model_id, test_name)
    with launcher_context as service:
        yield service
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
async def tgi_client(tgi_service) -> AsyncClient:
    """Return the service's client once its health check has completed.

    ``tgi_service.health`` is awaited with ``1000`` (presumably a timeout;
    confirm against the launcher) before the client is read.
    """
    health_check = tgi_service.health(1000)
    await health_check
    client = tgi_service.client
    return client
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_model_single_request(
    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
):
    """Send one generation request and compare it with the greedy reference."""
    # Generate from the configured prompt, limited to 32 new tokens.
    result = await tgi_client.generate(input, max_new_tokens=32)

    token_count = result.details.generated_tokens
    assert token_count == 32

    reference_text = expected_outputs["greedy"]
    assert result.generated_text == reference_text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_model_multiple_requests(
    tgi_client, gaudi_generate_load, expected_outputs, input
):
    """Send several identical requests and compare each with the batch reference.

    ``gaudi_generate_load`` is awaited with the client, the prompt, the token
    limit and the request count; every returned response must report the
    requested number of generated tokens and match ``expected_outputs["batch"]``.
    """
    num_requests = 4
    max_new_tokens = 32

    responses = await gaudi_generate_load(
        tgi_client,
        input,
        max_new_tokens=max_new_tokens,
        n=num_requests,
    )

    # Check against the values that were requested instead of repeating the
    # literals, so the assertions cannot drift from the request parameters.
    assert len(responses) == num_requests
    expected = expected_outputs["batch"]
    for r in responses:
        assert r.details.generated_tokens == max_new_tokens
        assert r.generated_text == expected
|
Loading…
Reference in New Issue
Block a user