mirror of https://github.com/huggingface/text-generation-inference.git
synced 2025-04-23 07:52:06 +00:00

test(neuron): merge integration tests and fixtures

commit a3dcdab706 (parent 68e1c608f6)
@@ -33,7 +33,3 @@ install_server:
 test_server: install_server
 	python -m pip install -r ${mkfile_dir}/tests/requirements.txt
 	python -m pytest -sv ${mkfile_dir}/tests/server
-
-test_integration: image
-	python -m pip install -r ${mkfile_dir}/tests/requirements.txt
-	python -m pytest -sv ${mkfile_dir}/tests/integration
@@ -1 +1 @@
-pytest_plugins = ["fixtures.service", "fixtures.model"]
+pytest_plugins = ["fixtures.model"]
@ -1,3 +1,4 @@
|
|||||||
|
pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"]
|
||||||
# ruff: noqa: E402
|
# ruff: noqa: E402
|
||||||
from _pytest.fixtures import SubRequest
|
from _pytest.fixtures import SubRequest
|
||||||
import requests
|
import requests
|
||||||
|
integration-tests/fixtures/neuron/model.py (new file, 129 lines)
@@ -0,0 +1,129 @@
+import copy
+import logging
+import subprocess
+import sys
+from tempfile import TemporaryDirectory
+
+import huggingface_hub
+import pytest
+from transformers import AutoTokenizer
+
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.utils import synchronize_hub_cache
+from optimum.neuron.version import __sdk_version__ as sdk_version
+from optimum.neuron.version import __version__ as version
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
+
+OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
+
+# All model configurations below will be added to the neuron_model_config fixture
+MODEL_CONFIGURATIONS = {
+    "gpt2": {
+        "model_id": "gpt2",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "llama": {
+        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "mistral": {
+        "model_id": "optimum/mistral-1.1b-testing",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+    },
+    "qwen2": {
+        "model_id": "Qwen/Qwen2.5-0.5B",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "granite": {
+        "model_id": "ibm-granite/granite-3.1-2b-instruct",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+    },
+}
+
+
+def get_hub_neuron_model_id(config_name: str):
+    return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+
+
+def export_model(model_id, export_kwargs, neuron_model_path):
+    export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"]
+    for kwarg, value in export_kwargs.items():
+        export_command.append(f"--{kwarg}")
+        export_command.append(str(value))
+    export_command.append(neuron_model_path)
+    logger.info(f"Exporting {model_id} with {export_kwargs}")
+    try:
+        subprocess.run(export_command, check=True)
+    except subprocess.CalledProcessError as e:
+        raise ValueError(f"Failed to export model: {e}")
+
+
+@pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
+def neuron_model_config(request):
+    """Expose a pre-trained neuron model
+
+    The fixture first makes sure the following model artifacts are present on the hub:
+    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<name>,
+    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
+    If not, it will export the model and push it to the hub.
+
+    It then fetches the model locally and return a dictionary containing:
+    - a configuration name,
+    - the original model id,
+    - the export parameters,
+    - the neuron model id,
+    - the neuron model local path.
+
+    For each exposed model, the local directory is maintained for the duration of the
+    test session and cleaned up afterwards.
+    The hub model artifacts are never cleaned up and persist accross sessions.
+    They must be cleaned up manually when the optimum-neuron version changes.
+
+    """
+    config_name = request.param
+    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
+    model_id = model_config["model_id"]
+    export_kwargs = model_config["export_kwargs"]
+    neuron_model_id = get_hub_neuron_model_id(config_name)
+    with TemporaryDirectory() as neuron_model_path:
+        hub = huggingface_hub.HfApi()
+        if hub.repo_exists(neuron_model_id):
+            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
+            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        else:
+            export_model(model_id, export_kwargs, neuron_model_path)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            tokenizer.save_pretrained(neuron_model_path)
+            del tokenizer
+            # Create the test model on the hub
+            hub.create_repo(neuron_model_id, private=True)
+            hub.upload_folder(
+                folder_path=neuron_model_path,
+                repo_id=neuron_model_id,
+                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
+            )
+            # Make sure it is cached
+            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        # Add dynamic parameters to the model configuration
+        model_config["neuron_model_path"] = neuron_model_path
+        model_config["neuron_model_id"] = neuron_model_id
+        # Also add model configuration name to allow tests to adapt their expectations
+        model_config["name"] = config_name
+        # Yield instead of returning to keep a reference to the temporary directory.
+        # It will go out of scope and be released only once all tests needing the fixture
+        # have been completed.
+        logger.info(f"{config_name} ready for testing ...")
+        yield model_config
+    logger.info(f"Done with {config_name}")
+
+
+@pytest.fixture(scope="module")
+def neuron_model_path(neuron_model_config):
+    yield neuron_model_config["neuron_model_path"]
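For orientation, a minimal sketch of how a test module could consume these fixtures; the test names and assertions are illustrative, only the fixture names and the dictionary keys come from the file above:

    def test_neuron_model_config_contents(neuron_model_config):
        # The session-scoped fixture yields one configuration per MODEL_CONFIGURATIONS entry.
        for key in ("name", "model_id", "export_kwargs", "neuron_model_id", "neuron_model_path"):
            assert key in neuron_model_config


    def test_neuron_model_path_is_local(neuron_model_path):
        import os

        # The module-scoped helper simply re-exposes the local export directory.
        assert os.path.isdir(neuron_model_path)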
@@ -18,9 +18,20 @@ from huggingface_hub import AsyncInferenceClient, TextGenerationOutput
 
 OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
-DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "text-generation-inference:latest-neuron")
 HF_TOKEN = huggingface_hub.get_token()
 
 
+def get_tgi_docker_image():
+    docker_image = os.getenv("DOCKER_IMAGE", None)
+    if docker_image is None:
+        client = docker.from_env()
+        images = client.images.list(filters={"reference": "text-generation-inference"})
+        if not images:
+            raise ValueError("No text-generation-inference image found on this host to run tests.")
+        docker_image = images[0].tags[0]
+    return docker_image
+
+
 logging.basicConfig(
     level=logging.INFO,
     format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
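In other words, the Docker image is now resolved lazily. A short sketch of the two ways a test run can select it (the environment variable name comes from the hunk above; the tag value is illustrative):

    import os

    # Option 1: pin the image explicitly before launching the tests.
    os.environ["DOCKER_IMAGE"] = "text-generation-inference:latest-neuron"  # illustrative tag

    # Option 2: leave DOCKER_IMAGE unset and let get_tgi_docker_image() fall back to the
    # first local image whose reference matches "text-generation-inference".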
@@ -83,7 +94,7 @@ def event_loop():
 
 
 @pytest.fixture(scope="module")
-def launcher(event_loop):
+def neuron_launcher(event_loop):
     """Utility fixture to expose a TGI service.
 
     The fixture uses a single event loop for each module, but it can create multiple
@@ -130,15 +141,16 @@ def launcher(event_loop):
             if var in os.environ:
                 env[var] = os.environ[var]
 
+        base_image = get_tgi_docker_image()
         if os.path.isdir(model_name_or_path):
             # Create a sub-image containing the model to workaround docker dind issues preventing
             # to share a volume from the container running tests
-            docker_tag = f"{container_name}-img"
+            test_image = f"{container_name}-img"
             logger.info(
                 "Building image on the flight derivated from %s, tagged with %s",
-                DOCKER_IMAGE,
-                docker_tag,
+                base_image,
+                test_image,
             )
             with tempfile.TemporaryDirectory() as context_dir:
                 # Copy model directory to build context
@@ -147,17 +159,17 @@ def launcher(event_loop):
                 # Create Dockerfile
                 container_model_id = f"/data/{model_name_or_path}"
                 docker_content = f"""
-                FROM {DOCKER_IMAGE}
+                FROM {base_image}
                 COPY model {container_model_id}
                 """
                 with open(os.path.join(context_dir, "Dockerfile"), "wb") as f:
                     f.write(docker_content.encode("utf-8"))
                     f.flush()
-                image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=docker_tag)
+                image, logs = client.images.build(path=context_dir, dockerfile=f.name, tag=test_image)
                 logger.info("Successfully built image %s", image.id)
                 logger.debug("Build logs %s", logs)
         else:
-            docker_tag = DOCKER_IMAGE
+            test_image = base_image
             image = None
             container_model_id = model_name_or_path
 
@@ -167,7 +179,7 @@ def launcher(event_loop):
             args.append("--trust-remote-code")
 
         container = client.containers.run(
-            docker_tag,
+            test_image,
             command=args,
             name=container_name,
             environment=env,
@@ -210,7 +222,7 @@ def launcher(event_loop):
 
 
 @pytest.fixture(scope="module")
-def generate_load():
+def neuron_generate_load():
     """A utility fixture to launch multiple asynchronous TGI requests in parallel
 
     Args:
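The body of neuron_generate_load is not part of this hunk. As a rough idea of what a parallel-load helper of this shape typically does, here is a hedged sketch; only the fixture's purpose and the AsyncInferenceClient import are taken from the diff, the helper name and signature below are illustrative:

    import asyncio

    from huggingface_hub import AsyncInferenceClient


    async def _generate_load_sketch(client: AsyncInferenceClient, prompt: str, max_new_tokens: int, n_requests: int):
        # Fire n_requests identical generation requests concurrently and collect the outputs.
        futures = [
            client.text_generation(prompt, max_new_tokens=max_new_tokens, details=True)
            for _ in range(n_requests)
        ]
        return await asyncio.gather(*futures)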
@@ -3,10 +3,10 @@ import pytest
 
 
 @pytest.fixture
-async def tgi_service(launcher, neuron_model_config):
+async def tgi_service(neuron_launcher, neuron_model_config):
     model_name_or_path = neuron_model_config["neuron_model_path"]
     service_name = neuron_model_config["name"]
-    with launcher(service_name, model_name_or_path) as tgi_service:
+    with neuron_launcher(service_name, model_name_or_path) as tgi_service:
         await tgi_service.health(600)
         yield tgi_service
 
@@ -71,9 +71,9 @@ async def test_model_single_request(tgi_service):
 
 
 @pytest.mark.asyncio
-async def test_model_multiple_requests(tgi_service, generate_load):
+async def test_model_multiple_requests(tgi_service, neuron_generate_load):
     num_requests = 4
-    responses = await generate_load(
+    responses = await neuron_generate_load(
         tgi_service.client,
         "What is Deep Learning?",
         max_new_tokens=17,
@@ -5,7 +5,7 @@ from huggingface_hub.errors import ValidationError
 
 
 @pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"])
-async def tgi_service(request, launcher, neuron_model_config):
+async def tgi_service(request, neuron_launcher, neuron_model_config):
     """Expose a TGI service corresponding to a model configuration
 
     For each model configuration, the service will be started using the following
@@ -31,7 +31,7 @@ async def tgi_service(request, launcher, neuron_model_config):
     else:
         model_name_or_path = neuron_model_config["neuron_model_path"]
     service_name = neuron_model_config["name"]
-    with launcher(service_name, model_name_or_path) as tgi_service:
+    with neuron_launcher(service_name, model_name_or_path) as tgi_service:
         await tgi_service.health(600)
         yield tgi_service
 
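Taken together, a new test against the merged fixtures follows this shape; a minimal sketch assuming the renamed fixtures above, with an illustrative test name, prompt, and token budget:

    import pytest


    @pytest.mark.asyncio
    async def test_model_generates_text(tgi_service):
        # tgi_service is built by the fixture above: neuron_launcher starts the container
        # and the fixture waits for it to become healthy before yielding.
        text = await tgi_service.client.text_generation("What is Deep Learning?", max_new_tokens=17)
        assert isinstance(text, str) and len(text) > 0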