# Recovered from patch 1cd3f98ff737b0d1f6ad865e47ab272b85da6a13
# ("feat(ci): llama3 test working", baptiste, Thu, 10 Apr 2025 08:32:37 +0000),
# which creates integration-tests/gaudi/test_gaudi_generate.py.
"""Generation integration tests, parametrized over the models in TEST_CONFIGS.

For every entry in ``TEST_CONFIGS`` a service is started through the
``gaudi_launcher`` fixture (defined outside this file), and the text generated
for a fixed prompt is compared against a recorded expected output, both for a
single request and for several concurrent requests.
"""
from typing import Any, Dict

from text_generation import AsyncClient
import pytest

# The "args" configs are not optimized for speed; they only check that
# inference works for the different model architectures.
TEST_CONFIGS = {
    "meta-llama/Llama-3.1-8B-Instruct-shared": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
        "args": [
            "--sharded",
            "true",
            "--num-shard",
            "8",
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "8",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "meta-llama/Llama-3.1-8B-Instruct": {
        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "expected_batch_output": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use of artificial neural networks to analyze and interpret data. It is a type of",
        "env_config": {},
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "meta-llama/Llama-2-7b-chat-hf": {
        "model_id": "meta-llama/Llama-2-7b-chat-hf",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
        "expected_batch_output": "\n\nDeep learning (also known as deep structured learning) is part of a broader family of machine learning techniques based on artificial neural networks\u2014specific",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "mistralai/Mistral-7B-Instruct-v0.3": {
        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "bigcode/starcoder2-3b": {
        "model_id": "bigcode/starcoder2-3b",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to perform tasks.\n\nNeural networks are a type of machine learning algorithm that",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "google/gemma-7b-it": {
        "model_id": "google/gemma-7b-it",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to learn from large amounts of data. Neural networks are inspired by the structure and function of",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "Qwen/Qwen2-0.5B-Instruct": {
        "model_id": "Qwen/Qwen2-0.5B-Instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
        "expected_batch_output": " Deep Learning is a type of machine learning that is based on the principles of artificial neural networks. It is a type of machine learning that is used to train models",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
            "--max-batch-prefill-tokens",
            "2048",
        ],
    },
    "tiiuae/falcon-7b-instruct": {
        "model_id": "tiiuae/falcon-7b-instruct",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
        "expected_batch_output": "\nDeep learning is a branch of machine learning that uses artificial neural networks to learn and make decisions. It is based on the concept of hierarchical learning, where a",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "microsoft/phi-1_5": {
        "model_id": "microsoft/phi-1_5",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
        "expected_batch_output": "\n\nDeep Learning is a subfield of Machine Learning that focuses on building neural networks with multiple layers of interconnected nodes. These networks are designed to learn from large",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "openai-community/gpt2": {
        "model_id": "openai-community/gpt2",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
        "expected_batch_output": "\n\nDeep learning is a new field of research that has been around for a long time. It is a new field of research that has been around for a",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "facebook/opt-125m": {
        "model_id": "facebook/opt-125m",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
        "expected_batch_output": "\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout the Author\n\nAbout",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
    "EleutherAI/gpt-j-6b": {
        "model_id": "EleutherAI/gpt-j-6b",
        "input": "What is Deep Learning?",
        "expected_greedy_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
        "expected_batch_output": "\n\nDeep learning is a subset of machine learning that is based on the idea of neural networks. Neural networks are a type of artificial intelligence that is inspired by",
        "args": [
            "--max-input-tokens",
            "512",
            "--max-total-tokens",
            "1024",
            "--max-batch-size",
            "4",
        ],
    },
}

print(f"Testing {len(TEST_CONFIGS)} models")


@pytest.fixture(scope="module", params=list(TEST_CONFIGS))
def test_config(request) -> Dict[str, Any]:
    """Provide the configuration of the model currently under test.

    The returned dict is a shallow copy of the ``TEST_CONFIGS`` entry selected
    by ``request.param``, extended with a ``"test_name"`` key holding that
    parameter. Copying keeps the module-level table free of per-test keys.
    """
    return {**TEST_CONFIGS[request.param], "test_name": request.param}


@pytest.fixture(scope="module")
def model_id(test_config):
    """Return the ``"model_id"`` value of the current test configuration."""
    return test_config["model_id"]


@pytest.fixture(scope="module")
def test_name(test_config):
    """Return the ``TEST_CONFIGS`` key the current configuration came from."""
    return test_config["test_name"]


@pytest.fixture(scope="module")
def expected_outputs(test_config):
    """Return the expected generated texts, keyed by ``"greedy"`` and ``"batch"``."""
    return {
        "greedy": test_config["expected_greedy_output"],
        "batch": test_config["expected_batch_output"],
    }


@pytest.fixture(scope="module")
def input(test_config):
    """Return the prompt string of the current test configuration."""
    return test_config["input"]


@pytest.fixture(scope="module")
def tgi_service(gaudi_launcher, model_id, test_name):
    """Yield the service opened by ``gaudi_launcher`` for the current model.

    ``gaudi_launcher`` is defined outside this file; it is used here as a
    context manager, so the service is released when the module's tests end.
    """
    with gaudi_launcher(model_id, test_name) as service:
        yield service


@pytest.fixture(scope="module")
async def tgi_client(tgi_service) -> AsyncClient:
    """Await the service health check, then return the service's client."""
    # NOTE(review): 1000 is presumably a timeout or retry budget; confirm
    # against the launcher's ``health`` implementation.
    await tgi_service.health(1000)
    return tgi_service.client


@pytest.mark.asyncio
async def test_model_single_request(
    tgi_client: AsyncClient, expected_outputs: Dict[str, Any], input: str
):
    """A single request must reproduce the expected "greedy" output."""
    # Generation bounded to 32 new tokens on the configured prompt; no
    # sampling parameters are passed.
    response = await tgi_client.generate(
        input,
        max_new_tokens=32,
    )
    assert response.details.generated_tokens == 32
    assert response.generated_text == expected_outputs["greedy"]


@pytest.mark.asyncio
async def test_model_multiple_requests(
    tgi_client, gaudi_generate_load, expected_outputs, input
):
    """Several identical requests must all reproduce the expected "batch" output."""
    num_requests = 4
    responses = await gaudi_generate_load(
        tgi_client,
        input,
        max_new_tokens=32,
        n=num_requests,
    )

    # One response per request issued.
    assert len(responses) == num_requests
    expected = expected_outputs["batch"]
    for r in responses:
        assert r.details.generated_tokens == 32
        assert r.generated_text == expected