Avoid running neuron integration tests twice (#3054)

* test(neuron): refactor to prepare batch export

* test(neuron): add helper to batch export models

Also rename the fixture file for clarity.

* ci(neuron): do not run tests twice

* ci(neuron): rename precompilation job

* test(neuron): remove redundant subdirectory

* test(neuron): remove erroneous line

* doc(neuron): update links to installation page

* feat(neuron): cleanup Dockerfile

CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse is not required anymore.

* test(neuron): try to reduce download errors
David Corvoysier 2025-02-26 12:15:01 +01:00 committed by GitHub
parent b0069e0485
commit 5eec3a8bb6
10 changed files with 39 additions and 28 deletions
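
The core of the change is that exporting (compiling) the neuron test models is now an idempotent step separate from the test run: a model is only exported and pushed to the hub if it is not already there, and the test suite itself runs once. A minimal sketch of that guard, condensed from the fixture diff further down (TEST_ORGANIZATION and get_neuron_model_name are defined in integration-tests/fixtures/neuron/export_models.py; run_docker_export is a placeholder standing in for its docker-based export logic):

# Sketch only, condensed from integration-tests/fixtures/neuron/export_models.py
# (full diff below). The placeholder names noted above are not verbatim.
import huggingface_hub

TEST_ORGANIZATION = "..."  # hub organization hosting the pre-compiled test models


def get_neuron_model_name(config_name):
    ...  # derives a deterministic neuron model name from the test configuration


def run_docker_export(config_name, model_config, neuron_model_id):
    ...  # builds a throw-away image from the TGI image, exports and pushes the model


def maybe_export_model(config_name, model_config):
    """Export the neuron model for this configuration unless it is already on the hub."""
    neuron_model_id = f"{TEST_ORGANIZATION}/{get_neuron_model_name(config_name)}"
    hub = huggingface_hub.HfApi()
    if hub.repo_exists(neuron_model_id):
        # Already compiled and pushed: the test job only needs to download it.
        return neuron_model_id
    run_docker_export(config_name, model_config, neuron_model_id)  # the expensive step
    return neuron_model_id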

View File

@@ -230,7 +230,7 @@ jobs:
           echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
           echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
           echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT"
-  precompile_static_models:
+  precompile_neuron_models:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
@@ -252,21 +252,18 @@ jobs:
       - name: Install
         run: |
          make install-integration-tests
-      - name: Run tests
+      - name: Export neuron models
         run: |
-          export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
           export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
-          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
-          export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
-          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
           echo $DOCKER_IMAGE
           docker pull $DOCKER_IMAGE
-          pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
+          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
+          python integration-tests/fixtures/neuron/export_models.py
   integration_tests:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: [precompile_static_models, build-and-push]
+    needs: [precompile_neuron_models, build-and-push]
     if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }}
     runs-on:
       group: ${{ needs.build-and-push.outputs.runs_on }}
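
Note that the "Export neuron models" step above runs the fixture module as a script; roughly, it amounts to the loop added in the Python diff further down (sketch, with MODEL_CONFIGURATIONS and maybe_export_model defined in that fixture module):

# Rough equivalent of `python integration-tests/fixtures/neuron/export_models.py`;
# MODEL_CONFIGURATIONS and maybe_export_model come from that fixture file.
def maybe_export_models():
    # Export (and push to the hub) only the test models that are missing.
    for config_name, model_config in MODEL_CONFIGURATIONS.items():
        maybe_export_model(config_name, model_config)


if __name__ == "__main__":
    maybe_export_models()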

View File

@@ -24,8 +24,6 @@ RUN cargo install cargo-chef --locked
 WORKDIR /usr/src
-ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
 FROM chef AS planner
 COPY backends/neuron/Cargo.toml Cargo.toml
 COPY Cargo.lock Cargo.lock

View File

@@ -12,7 +12,7 @@
   - local: installation_gaudi
     title: Using TGI with Intel Gaudi
   - local: installation_inferentia
-    title: Using TGI with AWS Inferentia
+    title: Using TGI with AWS Trainium and Inferentia
   - local: installation_tpu
     title: Using TGI with Google TPUs
   - local: installation_intel

View File

@@ -107,7 +107,7 @@ Several variants of the model server exist that are actively supported by Huggin
 - A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
 - A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ.
 - The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
-- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
+- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained in the main TGI repository. Some model features differ.
 - A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).
 Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.

View File

@@ -1,3 +1,3 @@
 # Using TGI with Inferentia
-Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
+You can use TGI on AWS Trainium and Inferentia platforms using the [TGI neuron backend](https://huggingface.co/docs/text-generation-inference/backends/neuron).

View File

@@ -13,3 +13,4 @@ TGI remains consistent across backends, allowing you to switch between them seam
   However, it requires a model-specific compilation step for each GPU architecture.
 * **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models
   (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation.
+* **[TGI Neuron backend](./backends/neuron)**: This backend leverages the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) to allow the deployment of large language models (LLMs) on [AWS Trainium and Inferentia chips](https://aws.amazon.com/ai/machine-learning/trainium/).

View File

@@ -1,4 +1,4 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"]
+pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 import requests

View File

@@ -118,10 +118,11 @@ def get_tgi_docker_image():
     return docker_image
-def export_model(config_name, model_config, neuron_model_name):
-    """Export a neuron model.
+def maybe_export_model(config_name, model_config):
+    """Export a neuron model for the specified test configuration.
-    The model is exported by a custom image built on the fly from the base TGI image.
+    If the neuron model has not already been compiled and pushed to the hub, it is
+    exported by a custom image built on the fly from the base TGI image.
     This makes sure the exported model and image are aligned and avoids introducing
     neuron specific imports in the test suite.
@@ -130,9 +131,15 @@ def export_model(config_name, model_config, neuron_model_name):
             Used to identify test configurations
         model_config (`str`):
             The model configuration for export (includes the original model id)
-        neuron_model_name (`str`):
-            The name of the exported model on the hub
     """
+    neuron_model_name = get_neuron_model_name(config_name)
+    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
+    hub = huggingface_hub.HfApi()
+    if hub.repo_exists(neuron_model_id):
+        logger.info(
+            f"Skipping model export for config {config_name} as {neuron_model_id} already exists"
+        )
+        return neuron_model_id
     client = docker.from_env()
@@ -183,7 +190,7 @@ def export_model(config_name, model_config, neuron_model_name):
     logger.debug("Build logs %s", logs)
     try:
-        container = client.containers.run(
+        client.containers.run(
             export_image,
             environment=env,
             auto_remove=True,
@@ -192,7 +199,6 @@ def export_model(config_name, model_config, neuron_model_name):
             shm_size="1G",
         )
         logger.info(f"Successfully exported model for config {config_name}")
-        container.logs()
     except Exception as e:
         logger.exception(f"An exception occurred while running container: {e}.")
         pass
@@ -206,6 +212,12 @@ def export_model(config_name, model_config, neuron_model_name):
     except Exception as e:
         logger.error("Error while removing image %s, skipping", image.id)
         logger.exception(e)
+    return neuron_model_id
+def maybe_export_models():
+    for config_name, model_config in MODEL_CONFIGURATIONS.items():
+        maybe_export_model(config_name, model_config)
 @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
@@ -232,15 +244,14 @@ def neuron_model_config(request):
     """
     config_name = request.param
     model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
-    neuron_model_name = get_neuron_model_name(config_name)
-    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
+    # Export the model first (only if needed)
+    neuron_model_id = maybe_export_model(config_name, model_config)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if not hub.repo_exists(neuron_model_id):
-            # Export the model first
-            export_model(config_name, model_config, neuron_model_name)
         logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-        hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        hub = huggingface_hub.HfApi()
+        hub.snapshot_download(
+            neuron_model_id, etag_timeout=30, local_dir=neuron_model_path
+        )
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
         model_config["neuron_model_id"] = neuron_model_id
@@ -257,3 +268,7 @@ def neuron_model_config(request):
 @pytest.fixture(scope="module")
 def neuron_model_path(neuron_model_config):
     yield neuron_model_config["neuron_model_path"]
+if __name__ == "__main__":
+    maybe_export_models()
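
Put together, a test configuration only needs to pull the pre-exported model; a simplified sketch of what the session fixture above now does (not verbatim, and the function name below is a stand-in for the neuron_model_config fixture):

# Simplified sketch of the `neuron_model_config` fixture shown above;
# MODEL_CONFIGURATIONS and maybe_export_model come from the same fixture file.
import copy
from tempfile import TemporaryDirectory

import huggingface_hub


def fetch_model_config(config_name):
    """Yield a model configuration whose compiled artifacts are available locally."""
    model_config = copy.deepcopy(MODEL_CONFIGURATIONS[config_name])
    # Export only if the compiled model is not already on the hub.
    neuron_model_id = maybe_export_model(config_name, model_config)
    with TemporaryDirectory() as neuron_model_path:
        hub = huggingface_hub.HfApi()
        # etag_timeout=30 gives slow hub metadata requests more time to answer,
        # which is the "try to reduce download errors" part of the change.
        hub.snapshot_download(
            neuron_model_id, etag_timeout=30, local_dir=neuron_model_path
        )
        model_config["neuron_model_path"] = neuron_model_path
        model_config["neuron_model_id"] = neuron_model_id
        yield model_config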