Mirror of https://github.com/huggingface/text-generation-inference.git
Synced 2025-04-19 22:02:06 +00:00
Avoid running neuron integration tests twice (#3054)
* test(neuron): refactor to prepare batch export
* test(neuron): add helper to batch export models. Also rename the fixture file for clarity.
* ci(neuron): do not run tests twice
* ci(neuron): rename precompilation job
* test(neuron): remove redundant subdirectory
* test(neuron): remove erroneous line
* doc(neuron): update links to installation page
* feat(neuron): cleanup Dockerfile. CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse is not required anymore.
* test(neuron): try to reduce download errors
Parent: b0069e0485
Commit: 5eec3a8bb6
.github/workflows/build.yaml (vendored) — 13 changed lines
@@ -230,7 +230,7 @@ jobs:
           echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
           echo "label_extension=${{ env.LABEL_EXTENSION }}" >> "$GITHUB_OUTPUT"
           echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT"
-  precompile_static_models:
+  precompile_neuron_models:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
@@ -252,21 +252,18 @@ jobs:
       - name: Install
         run: |
           make install-integration-tests
-      - name: Run tests
+      - name: Export neuron models
         run: |
-          export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
           export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
-          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
-          export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
-          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
           echo $DOCKER_IMAGE
           docker pull $DOCKER_IMAGE
-          pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
+          export HF_TOKEN=${{ secrets.HF_TOKEN_NEURON }}
+          python integration-tests/fixtures/neuron/export_models.py
   integration_tests:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label_extension }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: [precompile_static_models, build-and-push]
+    needs: [precompile_neuron_models, build-and-push]
     if: ${{ always() && !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && needs.build-and-push.outputs.runs_on != 'ubuntu-latest' }}
     runs-on:
       group: ${{ needs.build-and-push.outputs.runs_on }}

@@ -24,8 +24,6 @@ RUN cargo install cargo-chef --locked

 WORKDIR /usr/src

-ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
-
 FROM chef AS planner
 COPY backends/neuron/Cargo.toml Cargo.toml
 COPY Cargo.lock Cargo.lock

@@ -12,7 +12,7 @@
   - local: installation_gaudi
     title: Using TGI with Intel Gaudi
   - local: installation_inferentia
-    title: Using TGI with AWS Inferentia
+    title: Using TGI with AWS Trainium and Inferentia
   - local: installation_tpu
     title: Using TGI with Google TPUs
   - local: installation_intel

@@ -107,7 +107,7 @@ Several variants of the model server exist that are actively supported by Huggin
 - A [version optimized for AMD with ROCm](https://huggingface.co/docs/text-generation-inference/installation_amd) is hosted in the main TGI repository. Some model features differ.
 - A [version optimized for Intel GPUs](https://huggingface.co/docs/text-generation-inference/installation_intel) is hosted in the main TGI repository. Some model features differ.
 - The [version for Intel Gaudi](https://huggingface.co/docs/text-generation-inference/installation_gaudi) is maintained on a forked repository, often resynchronized with the main [TGI repository](https://github.com/huggingface/tgi-gaudi).
-- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained as part of [Optimum Neuron](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference).
+- A [version for Neuron (AWS Inferentia2)](https://huggingface.co/docs/text-generation-inference/installation_inferentia) is maintained in the main TGI repository. Some model features differ.
 - A version for Google TPUs is maintained as part of [Optimum TPU](https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference).

 Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.

@@ -1,3 +1,3 @@
 # Using TGI with Inferentia

-Check out this [guide](https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference) on how to serve models with TGI on Inferentia2.
+You can use TGI on AWS Trainium and Inferentia platforms using the [TGI neuron backend](https://huggingface.co/docs/text-generation-inference/backends/neuron).

@@ -13,3 +13,4 @@ TGI remains consistent across backends, allowing you to switch between them seam
 However, it requires a model-specific compilation step for each GPU architecture.
 * **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models
   (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation.
+* **[TGI Neuron backend](./backends/neuron)**: This backend leverages the [AWS Neuron SDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) to allow the deployment of large language models (LLMs) on [AWS Trainium and Inferentia chips](https://aws.amazon.com/ai/machine-learning/trainium/).

@@ -1,4 +1,4 @@
-pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.model"]
+pytest_plugins = ["fixtures.neuron.service", "fixtures.neuron.export_models"]
 # ruff: noqa: E402
 from _pytest.fixtures import SubRequest
 import requests

@@ -118,10 +118,11 @@ def get_tgi_docker_image():
     return docker_image


-def export_model(config_name, model_config, neuron_model_name):
-    """Export a neuron model.
+def maybe_export_model(config_name, model_config):
+    """Export a neuron model for the specified test configuration.

-    The model is exported by a custom image built on the fly from the base TGI image.
+    If the neuron model has not already been compiled and pushed to the hub, it is
+    exported by a custom image built on the fly from the base TGI image.
     This makes sure the exported model and image are aligned and avoids introducing
     neuron specific imports in the test suite.

@@ -130,9 +131,15 @@ def export_model(config_name, model_config, neuron_model_name):
             Used to identify test configurations
         model_config (`str`):
            The model configuration for export (includes the original model id)
-        neuron_model_name (`str`):
-            The name of the exported model on the hub
     """
+    neuron_model_name = get_neuron_model_name(config_name)
+    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
+    hub = huggingface_hub.HfApi()
+    if hub.repo_exists(neuron_model_id):
+        logger.info(
+            f"Skipping model export for config {config_name} as {neuron_model_id} already exists"
+        )
+        return neuron_model_id

     client = docker.from_env()

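For readers unfamiliar with the hub check introduced above, this is the standard `huggingface_hub` pattern the new guard relies on: `HfApi.repo_exists` answers whether a repository is already published, so the expensive Docker-based export only runs when the precompiled model is missing. A minimal sketch under assumed names (the organization, model name, and `export_fn` callable are placeholders, not the fixture's real helpers):

```python
import logging

import huggingface_hub

logger = logging.getLogger(__name__)


def ensure_exported(organization: str, neuron_model_name: str, export_fn) -> str:
    """Return the hub id of a precompiled model, exporting it only when absent.

    `export_fn` is a placeholder for the Docker-based export done by the fixture.
    """
    neuron_model_id = f"{organization}/{neuron_model_name}"
    hub = huggingface_hub.HfApi()
    if hub.repo_exists(neuron_model_id):
        logger.info("Skipping export, %s already exists", neuron_model_id)
        return neuron_model_id
    export_fn(neuron_model_id)  # compile and push the neuron model
    return neuron_model_id
```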
@@ -183,7 +190,7 @@ def export_model(config_name, model_config, neuron_model_name):
     logger.debug("Build logs %s", logs)

     try:
-        container = client.containers.run(
+        client.containers.run(
             export_image,
             environment=env,
             auto_remove=True,
@@ -192,7 +199,6 @@ def export_model(config_name, model_config, neuron_model_name):
             shm_size="1G",
         )
         logger.info(f"Successfully exported model for config {config_name}")
-        container.logs()
     except Exception as e:
         logger.exception(f"An exception occurred while running container: {e}.")
         pass
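As context for the `container.logs()` removal above: in docker-py, `containers.run()` only returns a `Container` object when `detach=True`. In the blocking mode used here it returns the command's output directly, and with `auto_remove=True` the container is gone once it exits, so there was nothing left to query. A minimal sketch of the blocking behavior (the image name is purely illustrative):

```python
import docker

client = docker.from_env()

# detach defaults to False: run() blocks until the container exits and returns
# its stdout as bytes, so the output is already captured without calling .logs().
output = client.containers.run("hello-world", remove=True)
print(output.decode())
```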
@@ -206,6 +212,12 @@ def export_model(config_name, model_config, neuron_model_name):
         except Exception as e:
             logger.error("Error while removing image %s, skipping", image.id)
             logger.exception(e)
+    return neuron_model_id
+
+
+def maybe_export_models():
+    for config_name, model_config in MODEL_CONFIGURATIONS.items():
+        maybe_export_model(config_name, model_config)


 @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())
@@ -232,15 +244,14 @@ def neuron_model_config(request):
     """
     config_name = request.param
     model_config = copy.deepcopy(MODEL_CONFIGURATIONS[request.param])
-    neuron_model_name = get_neuron_model_name(config_name)
-    neuron_model_id = f"{TEST_ORGANIZATION}/{neuron_model_name}"
+    # Export the model first (only if needed)
+    neuron_model_id = maybe_export_model(config_name, model_config)
     with TemporaryDirectory() as neuron_model_path:
-        hub = huggingface_hub.HfApi()
-        if not hub.repo_exists(neuron_model_id):
-            # Export the model first
-            export_model(config_name, model_config, neuron_model_name)
         logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
-        hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        hub = huggingface_hub.HfApi()
+        hub.snapshot_download(
+            neuron_model_id, etag_timeout=30, local_dir=neuron_model_path
+        )
         # Add dynamic parameters to the model configuration
         model_config["neuron_model_path"] = neuron_model_path
        model_config["neuron_model_id"] = neuron_model_id
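The `etag_timeout=30` argument added above is the knob `huggingface_hub` exposes for flaky metadata requests: it raises the timeout (in seconds, 10 by default) used when fetching ETags before each file download, which is what the commit message's "try to reduce download errors" refers to. A standalone sketch with a placeholder repository id:

```python
from tempfile import TemporaryDirectory

import huggingface_hub

# Placeholder repo id; any public model repository works for the illustration.
repo_id = "hf-internal-testing/tiny-random-gpt2"

with TemporaryDirectory() as local_dir:
    # A larger etag_timeout tolerates slow metadata responses before downloading files.
    huggingface_hub.snapshot_download(repo_id, etag_timeout=30, local_dir=local_dir)
```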
@@ -257,3 +268,7 @@ def neuron_model_config(request):
 @pytest.fixture(scope="module")
 def neuron_model_path(neuron_model_config):
     yield neuron_model_config["neuron_model_path"]
+
+
+if __name__ == "__main__":
+    maybe_export_models()
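With the `__main__` guard above, the fixture file doubles as the pre-compilation script that the renamed `precompile_neuron_models` job invokes (`python integration-tests/fixtures/neuron/export_models.py`). A hedged sketch of driving that same step programmatically, e.g. from a local helper, assuming it is run from the repository root:

```python
import subprocess
import sys

# Run the export script exactly as the CI job does; check=True surfaces failures.
subprocess.run(
    [sys.executable, "integration-tests/fixtures/neuron/export_models.py"],
    check=True,
)
```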