fix workflow

This commit is contained in:
Felix Marty 2024-06-20 09:28:10 +00:00 committed by Nicolas Patry
parent 5fb8c275c3
commit 67999773f3
2 changed files with 22 additions and 4 deletions


@@ -172,16 +172,24 @@ jobs:
     concurrency:
       group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
+    if: needs.build-and-push.outputs.runs_on == 'amd-gpu-tgi'
     container:
       image: ${{ needs.build-and-push.outputs.docker_image }}
       options: --shm-size "16gb" --ipc host -v ${{ needs.build-and-push.outputs.docker_volume }}:/data
     steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
       - name: Clean Hugging Face cache
+        shell: bash
         run: |
           if [[ ${{ inputs.hardware }} == "rocm" ]]
           then
-            python clean_cache.py
+            echo "pwd:"
+            pwd
+            echo "ls:"
+            ls
+            python integration-tests/clean_cache.py
           fi
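
The cache-cleaning step shells out to integration-tests/clean_cache.py, the second file changed in this commit (diffed below), which inventories the local Hugging Face hub cache via huggingface_hub.scan_cache_dir(). A minimal, self-contained sketch of that API for reference; the printed repo names and sizes depend entirely on the local cache:

# Sketch of the huggingface_hub cache-scanning API used by clean_cache.py.
import huggingface_hub

# Scans the hub cache (by default ~/.cache/huggingface/hub).
cache_info = huggingface_hub.scan_cache_dir()

# Each CachedRepoInfo exposes repo_id, repo_type, and size_on_disk (bytes).
for repo in cache_info.repos:
    print(f"{repo.repo_type}/{repo.repo_id}: {repo.size_on_disk * 1e-9:.2f} GB")

print(f"Total cache size: {cache_info.size_on_disk * 1e-9:.2f} GB")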


@@ -64,6 +64,9 @@ def cleanup_cache():
         size_per_model[model_id] = model_size

+    total_required_size = sum(size_per_model.values())
+    print(f"Total required disk: {total_required_size:.2f} GB")
+
     cached_dir = huggingface_hub.scan_cache_dir()

     cache_size_per_model = {}
@@ -86,11 +89,18 @@ def cleanup_cache():
     total_required_cached_size = sum(cached_required_size_per_model.values())
     total_other_cached_size = sum(cache_size_per_model.values())
-    total_required_size = sum(size_per_model.values())
     total_non_cached_required_size = total_required_size - total_required_cached_size
+    print(
+        f"Total HF cached models size: {total_other_cached_size + total_required_cached_size:.2f} GB"
+    )
+    print(
+        f"Total non-necessary HF cached models size: {total_other_cached_size:.2f} GB"
+    )

     free_memory = shutil.disk_usage("/data").free * 1e-9
+    print(f"Free memory: {free_memory:.2f} GB")

     if free_memory + total_other_cached_size < total_non_cached_required_size * 1.05:
         raise ValueError(
             "Not enough space on device to execute the complete CI, please clean up the CI machine"