Mirror of https://github.com/huggingface/text-generation-inference.git, synced 2025-04-19 13:52:07 +00:00
Avoid running neuron integration tests twice (#3054)
* test(neuron): refactor to prepare batch export
* test(neuron): add helper to batch export models. Also rename the fixture file for clarity.
* ci(neuron): do not run tests twice
* ci(neuron): rename precompilation job
* test(neuron): remove redundant subdirectory
* test(neuron): remove erroneous line
* doc(neuron): update links to installation page
* feat(neuron): cleanup Dockerfile. CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse is not required anymore.
* test(neuron): try to reduce download errors
This commit is contained in:
parent b0069e0485
commit 5eec3a8bb6

.github/workflows/build.yaml (13 changed lines)
@@ -230,7 +230,7 @@ jobs:
           echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
           echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
           echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT"
-  precompile_static_models:
+  precompile_neuron_models:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
@@ -252,21 +252,18 @@ jobs:
       - name: Install
         run: |
           make install-integration-tests
-      - name: Run tests
+      - name: Export neuron models
         run: |
-          export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
           export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
-          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
-          export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
-          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
           echo $DOCKER_IMAGE
           docker pull $DOCKER_IMAGE
-          pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
+          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
+          python integration-tests/fixtures/neuron/export_models.py
   integration_tests:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: [precompile_static_models, build-and-push]
+    needs: [precompile_neuron_models, build-and-push]
     if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }}
     runs-on:
       group: ${{ needs.build-and-push.outputs.runs_on }}
@@ -24,8 +24,6 @@ RUN cargo install cargo-chef --locked

 WORKDIR /usr/src

-ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
-
 FROM chef AS planner
 COPY backends/neuron/Cargo.toml Cargo.toml
 COPY Cargo.lock Cargo.lock
@@ -12,7 +12,7 @@
   - local: installation_gaudi
     title: Using TGI with Intel Gaudi
   - local: installation_inferentia
-    title: Using TGI with AWS Inferentia
+    title: Using TGI with AWS Trainium and Inferentia
   - local: installation_tpu
     title: Using TGI with Google TPUs
   - local: installation_intel
@@ -107,7 +107,7 @@ Several variants of the model server exist that are actively supported by Huggin
 - A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
 - A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ.
 - The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
-- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
+- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained in the main TGI repository. Some model features differ.
 - A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).

 Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.
@@ -1,3 +1,3 @@
 # Using TGI with Inferentia

-Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
+You can use TGI on AWS Trainium and Inferentia platforms using the [TGI neuron backend](https://huggingface.co/docs/text-generation-inference/backends/neuron).
@@ -13,3 +13,4 @@ TGI remains consistent across backends, allowing you to switch between them seam
   However, it requires a model-specific compilation step for each GPU architecture.
 * **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models
   (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation.
+* **[TGI Neuron backend](./backends/neuron)**: This backend leverages the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) to allow the deployment of large language models (LLMs) on [AWS Trainium and Inferentia chips](https://aws.amazon.com/ai/machine-learning/trainium/).
@@ -1,4 +1,4 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"]
+pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 import requests
@@ -118,10 +118,11 @@ def get_tgi_docker_image():
     return docker_image


-def export_model(config_name, model_config, neuron_model_name):
-    """Export a neuron model.
+def maybe_export_model(config_name, model_config):
+    """Export a neuron model for the specified test configuration.

-    The model is exported by a custom image built on the fly from the base TGI image.
+    If the neuron model has not already been compiled and pushed to the hub, it is
+    exported by a custom image built on the fly from the base TGI image.
     This makes sure the exported model and image are aligned and avoids introducing
     neuron specific imports in the test suite.

@@ -130,9 +131,15 @@ def export_model(config_name, model_config, neuron_model_name):
             Used to identify test configurations
         model_config (`str`):
             The model configuration for export (includes the original model id)
-        neuron_model_name (`str`):
-            The name of the exported model on the hub
     """
+    neuron_model_name = get_neuron_model_name(config_name)
+    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
+    hub = huggingface_hub.HfApi()
+    if hub.repo_exists(neuron_model_id):
+        logger.info(
+            f"Skipping model export for config {config_name} as {neuron_model_id} already exists"
+        )
+        return neuron_model_id
     client = docker.from_env()

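For reference, the skip-if-already-exported check added above is built on `huggingface_hub.HfApi.repo_exists`. A minimal standalone sketch of that pattern, with a placeholder repo id (not the one used in CI):

```python
# Minimal sketch of the "export only if missing" check added above.
# The repo id below is a placeholder for illustration.
import logging

import huggingface_hub

logger = logging.getLogger(__name__)


def needs_export(neuron_model_id: str) -> bool:
    """Return True if the precompiled model is not yet available on the hub."""
    hub = huggingface_hub.HfApi()
    if hub.repo_exists(neuron_model_id):
        logger.info("Skipping export, %s already exists", neuron_model_id)
        return False
    return True


if __name__ == "__main__":
    print(needs_export("my-test-org/gpt2-neuron-example"))
```

Checking the hub first makes the export idempotent, so the precompilation job can run on every build without repeating work already pushed by a previous run.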
@@ -183,7 +190,7 @@ def export_model(config_name, model_config, neuron_model_name):
     logger.debug("Build logs %s", logs)

     try:
-        container = client.containers.run(
+        client.containers.run(
             export_image,
             environment=env,
             auto_remove=True,
@@ -192,7 +199,6 @@ def export_model(config_name, model_config, neuron_model_name):
             shm_size="1G",
         )
         logger.info(f"Successfully exported model for config {config_name}")
-        container.logs()
     except Exception as e:
         logger.exception(f"An exception occurred while running container: {e}.")
         pass
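A note on the docker-py call simplified in this hunk: with the default `detach=False`, `containers.run` blocks until the container exits and returns its output directly, so there is no container object to query afterwards, which is presumably why the `container` handle and the trailing `container.logs()` call were dropped. A minimal sketch of a one-shot export container, with placeholder image name and environment values:

```python
# Minimal sketch of running a one-shot container with docker-py.
# "export-image:latest" and the HF_TOKEN value are placeholders.
import docker

client = docker.from_env()
output = client.containers.run(
    "export-image:latest",
    environment={"HF_TOKEN": "hf_xxx"},  # placeholder token
    auto_remove=True,  # the container is removed once it exits
    shm_size="1G",
    # detach defaults to False: run() blocks and returns the container output
)
print(output.decode(errors="replace"))
```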
@@ -206,6 +212,12 @@ def export_model(config_name, model_config, neuron_model_name):
         except Exception as e:
             logger.error("Error while removing image %s, skipping", image.id)
             logger.exception(e)
+    return neuron_model_id
+
+
+def maybe_export_models():
+    for config_name, model_config in MODEL_CONFIGURATIONS.items():
+        maybe_export_model(config_name, model_config)


 @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
@@ -232,15 +244,14 @@ def neuron_model_config(request):
     """
     config_name = request.param
     model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
-    neuron_model_name = get_neuron_model_name(config_name)
-    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
+    # Export the model first (only if needed)
+    neuron_model_id = maybe_export_model(config_name, model_config)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if not hub.repo_exists(neuron_model_id):
-            # Export the model first
-            export_model(config_name, model_config, neuron_model_name)
         logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-        hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        hub = huggingface_hub.HfApi()
+        hub.snapshot_download(
+            neuron_model_id, etag_timeout=30, local_dir=neuron_model_path
+        )
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
         model_config["neuron_model_id"] = neuron_model_id
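The fixture now passes `etag_timeout=30` to `snapshot_download`, which is the "try to reduce download errors" item from the commit message. A minimal standalone sketch of the same download call, with a placeholder repo id:

```python
# Minimal sketch: download a precompiled model snapshot from the hub with a
# longer metadata (ETag) timeout, to make flaky CI downloads less likely.
# The repo id below is a placeholder.
from tempfile import TemporaryDirectory

import huggingface_hub

neuron_model_id = "my-test-org/gpt2-neuron-example"  # placeholder repo id
with TemporaryDirectory() as neuron_model_path:
    huggingface_hub.snapshot_download(
        neuron_model_id,
        etag_timeout=30,  # default is 10 seconds
        local_dir=neuron_model_path,
    )
    print("downloaded to", neuron_model_path)
```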
@@ -257,3 +268,7 @@ def neuron_model_config(request):
 @pytest.fixture(scope="module")
 def neuron_model_path(neuron_model_config):
     yield neuron_model_config["neuron_model_path"]
+
+
+if __name__ == "__main__":
+    maybe_export_models()
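As a usage illustration only (not part of this commit), a hypothetical test consuming the fixtures defined in this file could look like:

```python
# Hypothetical integration test using the neuron fixtures above.
# The test name and assertions are illustrative; they only rely on values the
# fixtures are documented to provide (neuron_model_id, neuron_model_path).
def test_neuron_model_is_exported_locally(neuron_model_config, neuron_model_path):
    # The fixture exports the model if needed, downloads it from the hub, and
    # exposes the hub repo id and the local snapshot directory.
    assert neuron_model_config["neuron_model_id"].count("/") == 1
    assert neuron_model_config["neuron_model_path"] == neuron_model_path
```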