From 5b6367f87cfd304c676a2446344e8d2dc163f5cf Mon Sep 17 00:00:00 2001 From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Date: Fri, 15 Dec 2023 11:35:31 +0100 Subject: [PATCH] fix imports --- .github/workflows/build.yaml | 78 ++++++++++--------- .../text_generation_server/models/__init__.py | 3 +- 2 files changed, 42 insertions(+), 39 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 395a0b6a..066ea889 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -146,11 +146,50 @@ jobs: cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min + integration-tests: + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + needs: + - start-runner + - build-and-push-image # Wait for the docker image to be built + runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner + env: + DOCKER_VOLUME: /cache + steps: + - uses: actions/checkout@v2 + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4.4.1 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Tailscale + uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966 + with: + authkey: ${{ secrets.TAILSCALE_AUTHKEY }} + - name: Prepare disks + run: | + sudo mkfs -t ext4 /dev/nvme1n1 + sudo mkdir ${{ env.DOCKER_VOLUME }} + sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }} + - name: Install + run: | + make install-integration-tests + - name: Run tests + run: | + export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }} + export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} + pytest -s -vv integration-tests + build-and-push-image-rocm: concurrency: group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }} cancel-in-progress: true - needs: start-runner # required to start the main job when the runner is ready + needs: + - start-runner + - build-and-push-image # Wait for the main docker image to be built + - integration-tests # Wait for the main integration-tests runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner permissions: contents: write @@ -235,43 +274,6 @@ jobs: cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min - integration-tests: - concurrency: - group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - needs: - - start-runner - - build-and-push-image # Wait for the docker image to be built - - build-and-push-image-rocm - runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner - env: - DOCKER_VOLUME: /cache - steps: - - uses: actions/checkout@v2 - - name: Inject slug/short variables - uses: rlespinasse/github-slug-action@v4.4.1 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - name: Tailscale - uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966 - with: - authkey: ${{ secrets.TAILSCALE_AUTHKEY }} - - name: Prepare disks - run: | - sudo mkfs -t ext4 /dev/nvme1n1 - sudo mkdir ${{ env.DOCKER_VOLUME }} - sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }} - - name: Install - run: | - make install-integration-tests - - name: Run tests - run: | - export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }} - export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} - pytest -s -vv integration-tests - stop-runner: name: Stop self-hosted EC2 runner needs: diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index a90b6215..1bbff16a 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -5,7 +5,6 @@ from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import modeling_auto from typing import Optional -from text_generation_server.utils.flash_attn import HAS_FLASH_ATTN_V2_CUDA from text_generation_server.utils.speculate import get_speculate, set_speculate from text_generation_server.models.model import Model from text_generation_server.models.causal_lm import CausalLM @@ -58,10 +57,12 @@ try: from text_generation_server.models.idefics import IDEFICSSharded from text_generation_server.models.flash_mistral import FlashMistral from text_generation_server.models.flash_mixtral import FlashMixtral + from text_generation_server.utils.flash_attn import HAS_FLASH_ATTN_V2_CUDA except ImportError as e: logger.warning(f"Could not import Flash Attention enabled models: {e}") FLASH_ATTENTION = False + HAS_FLASH_ATTN_V2_CUDA = False if FLASH_ATTENTION: __all__.append(FlashNeoXSharded)