From 67999773f3146d6b45b2654fb69c7841b72be256 Mon Sep 17 00:00:00 2001 From: Felix Marty <9808326+fxmarty@users.noreply.github.com> Date: Thu, 20 Jun 2024 09:28:10 +0000 Subject: [PATCH] fix workflow --- .github/workflows/build.yaml | 12 ++++++++++-- integration-tests/clean_cache.py | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ad6d9827..b5c241b2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -172,16 +172,24 @@ jobs: concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true - if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest' + if: needs.build-and-push.outputs.runs_on == 'amd-gpu-tgi' container: image: ${{ needs.build-and-push.outputs.docker_image }} options: --shm-size "16gb" --ipc host -v ${{ needs.build-and-push.outputs.docker_volume }}:/data steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Clean Hugging Face cache + shell: bash run: | if [[ ${{ inputs.hardware }} == "rocm" ]] then - python clean_cache.py + echo "pwd:" + pwd + echo "ls:" + ls + python integration-tests/clean_cache.py fi diff --git a/integration-tests/clean_cache.py b/integration-tests/clean_cache.py index 58a78dd4..b0d8d510 100644 --- a/integration-tests/clean_cache.py +++ b/integration-tests/clean_cache.py @@ -64,6 +64,9 @@ def cleanup_cache(): size_per_model[model_id] = model_size + total_required_size = sum(size_per_model.values()) + print(f"Total required disk: {total_required_size:.2f} GB") + cached_dir = huggingface_hub.scan_cache_dir() cache_size_per_model = {} @@ -86,11 +89,18 @@ def cleanup_cache(): total_required_cached_size = sum(cached_required_size_per_model.values()) total_other_cached_size = sum(cache_size_per_model.values()) - total_required_size = sum(size_per_model.values()) - total_non_cached_required_size 
= total_required_size - total_required_cached_size + print( + f"Total HF cached models size: {total_other_cached_size + total_required_cached_size:.2f} GB" + ) + print( + f"Total non-necessary HF cached models size: {total_other_cached_size:.2f} GB" + ) + free_memory = shutil.disk_usage("/data").free * 1e-9 + print(f"Free memory: {free_memory:.2f} GB") + if free_memory + total_other_cached_size < total_non_cached_required_size * 1.05: raise ValueError( "Not enough space on device to execute the complete CI, please clean up the CI machine"