diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index b5c241b2..63f245ae 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -78,10 +78,12 @@ jobs:
           authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
           slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
           slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
       - name: Initialize Docker Buildx
         uses: docker/setup-buildx-action@v3
         with:
           install: true
+
       - name: Login to GitHub Container Registry
         if: github.event_name != 'pull_request'
         uses: docker/login-action@v3
@@ -189,7 +191,7 @@ jobs:
             pwd
             echo "ls:"
             ls
-            python integration-tests/clean_cache.py
+            python integration-tests/clean_cache_and_download.py --token ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           fi
@@ -207,6 +209,18 @@ jobs:
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Tailscale
+        uses: huggingface/tailscale-action@main
+        with:
+          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+
+      - name: Login to internal Container Registry
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
+          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
+          registry: registry.internal.huggingface.tech
+
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
diff --git a/integration-tests/clean_cache.py b/integration-tests/clean_cache.py
deleted file mode 100644
index b0d8d510..00000000
--- a/integration-tests/clean_cache.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import huggingface_hub
-
-REQUIRED_MODELS = {
-    "bigscience/bloom-560m": "main",
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0": "main",
-    "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq": "main",
-    "tiiuae/falcon-7b": "main",
-    "TechxGenus/gemma-2b-GPTQ": "main",
-    "google/gemma-2b": "main",
-    "openai-community/gpt2": "main",
-    "turboderp/Llama-3-8B-Instruct-exl2": "2.5bpw",
-    "huggingface/llama-7b-gptq": "main",
-    "neuralmagic/llama-2-7b-chat-marlin": "main",
-    "huggingface/llama-7b": "main",
-    "FasterDecoding/medusa-vicuna-7b-v1.3": "refs/pr/1",
-    "mistralai/Mistral-7B-Instruct-v0.1": "main",
-    "OpenAssistant/oasst-sft-1-pythia-12b": "main",
-    "stabilityai/stablelm-tuned-alpha-3b": "main",
-    "google/paligemma-3b-pt-224": "main",
-    "microsoft/phi-2": "main",
-    "Qwen/Qwen1.5-0.5B": "main",
-    "bigcode/starcoder": "main",
-    "Narsil/starcoder-gptq": "main",
-    "bigcode/starcoder2-3b": "main",
-    "HuggingFaceM4/idefics-9b-instruct": "main",
-    "HuggingFaceM4/idefics2-8b": "main",
-    "llava-hf/llava-v1.6-mistral-7b-hf": "main",
-    "state-spaces/mamba-130m": "main",
-    "mosaicml/mpt-7b": "main",
-    "bigscience/mt0-base": "main",
-    "google/flan-t5-xxl": "main",
-}
-
-
-def cleanup_cache():
-    # Retrieve the size per model for all models used in the CI.
-    size_per_model = {}
-    for model_id, revision in REQUIRED_MODELS.items():
-        model_size = 0
-        all_files = huggingface_hub.list_repo_files(
-            model_id,
-            repo_type="model",
-            revision=revision,
-            token=token,
-        )
-
-        extension = None
-        if any(".safetensors" in filename for filename in all_files):
-            extension = ".safetensors"
-        elif any(".pt" in filename for filename in all_files):
-            extension = ".pt"
-        elif any(".bin" in filename for filename in all_files):
-            extension = ".bin"
-
-        for filename in all_files:
-            if filename.endswith(extension):
-                file_url = huggingface_hub.hf_hub_url(
-                    model_id, filename, revision=revision
-                )
-                file_metadata = huggingface_hub.get_hf_file_metadata(
-                    file_url, token=token
-                )
-                model_size += file_metadata.size * 1e-9  # in GB
-
-        size_per_model[model_id] = model_size
-
-    total_required_size = sum(size_per_model.values())
-    print(f"Total required disk: {size_per_model:.2f} GB")
-
-    cached_dir = huggingface_hub.scan_cache_dir()
-
-    cache_size_per_model = {}
-    cached_required_size_per_model = {}
-    cached_shas_per_model = {}
-
-    # Retrieve the SHAs and model ids of other non-necessary models in the cache.
-    for repo in cached_dir.repos:
-        if repo.repo_id in REQUIRED_MODELS:
-            cached_required_size_per_model[repo.repo_id] = (
-                repo.size_on_disk * 1e-9
-            )  # in GB
-        elif repo.repo_type == "model":
-            cache_size_per_model[repo.repo_id] = repo.size_on_disk * 1e-9  # in GB
-
-        shas = []
-        for _, ref in repo.refs.items():
-            shas.append(ref.commit_hash)
-        cached_shas_per_model[repo.repo_id] = shas
-
-    total_required_cached_size = sum(cached_required_size_per_model.values())
-    total_other_cached_size = sum(cache_size_per_model.values())
-    total_non_cached_required_size = total_required_size - total_required_cached_size
-
-    print(
-        f"Total HF cached models size: {total_other_cached_size + total_required_cached_size:.2f} GB"
-    )
-    print(
-        f"Total non-necessary HF cached models size: {total_other_cached_size:.2f} GB"
-    )
-
-    free_memory = shutil.disk_usage("/data").free * 1e-9
-    print(f"Free memory: {free_memory:.2f} GB")
-
-    if free_memory + total_other_cached_size < total_non_cached_required_size * 1.05:
-        raise ValueError(
-            "Not enough space on device to execute the complete CI, please clean up the CI machine"
-        )
-
-    while free_memory < total_non_cached_required_size * 1.05:
-        if len(cache_size_per_model) == 0:
-            raise ValueError("This should not happen.")
-
-        largest_model_id = max(cache_size_per_model, key=cache_size_per_model.get)
-
-        print("Removing", largest_model_id)
-        for sha in cached_shas_per_model[largest_model_id]:
-            huggingface_hub.scan_cache_dir().delete_revisions(sha).execute()
-
-        del cache_size_per_model[largest_model_id]
-
-        free_memory = shutil.disk_usage("/data").free * 1e-9
-
-
-if __name__ == "__main__":
-    cleanup_cache()
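The `integration-tests/clean_cache_and_download.py` script that the workflow now invokes is not included in this hunk. As a rough, hypothetical sketch only (the script name, the `--token` flag, and the `/data` cache location come from the diff; the function names, the `needed_gb` headroom value, and everything else below are assumptions), the removed cleanup logic could be combined with an explicit pre-download step along these lines:

```python
# Hypothetical sketch -- not the actual clean_cache_and_download.py from this PR.
import argparse
import shutil

import huggingface_hub

REQUIRED_MODELS = {
    "bigscience/bloom-560m": "main",
    # ... same model -> revision mapping as in the removed clean_cache.py
}


def free_gb(path: str = "/data") -> float:
    # Free disk space in GB on the volume holding the HF cache.
    return shutil.disk_usage(path).free * 1e-9


def cleanup_cache(needed_gb: float) -> None:
    # Evict the largest cached repos the CI does not need until enough space is free.
    cache = huggingface_hub.scan_cache_dir()
    candidates = sorted(
        (repo for repo in cache.repos if repo.repo_id not in REQUIRED_MODELS),
        key=lambda repo: repo.size_on_disk,
        reverse=True,
    )
    for repo in candidates:
        if free_gb() >= needed_gb:
            break
        revisions = [ref.commit_hash for ref in repo.refs.values()]
        huggingface_hub.scan_cache_dir().delete_revisions(*revisions).execute()


def download_required(token: str) -> None:
    # Pre-populate the cache so the integration tests do not download at runtime.
    for model_id, revision in REQUIRED_MODELS.items():
        huggingface_hub.snapshot_download(model_id, revision=revision, token=token)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--token", required=True)
    args = parser.parse_args()
    cleanup_cache(needed_gb=100)  # assumed headroom, not taken from the PR
    download_required(args.token)
```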