https://github.com/huggingface/text-generation-inference.git
commit a95e6d603d (parent: 3600fc9dbe)
.github/workflows/build.yaml (78 changed lines)
@@ -146,11 +146,50 @@ jobs:
           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
           cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
 
+  integration-tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    needs:
+      - start-runner
+      - build-and-push-image # Wait for the docker image to be built
+    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    env:
+      DOCKER_VOLUME: /cache
+    steps:
+      - uses: actions/checkout@v2
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Tailscale
+        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
+        with:
+          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+      - name: Prepare disks
+        run: |
+          sudo mkfs -t ext4 /dev/nvme1n1
+          sudo mkdir ${{ env.DOCKER_VOLUME }}
+          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
+      - name: Install
+        run: |
+          make install-integration-tests
+      - name: Run tests
+        run: |
+          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
+          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          pytest -s -vv integration-tests
+
   build-and-push-image-rocm:
     concurrency:
       group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
+    needs:
+      - start-runner
+      - build-and-push-image # Wait for the main docker image to be built
+      - integration-tests # Wait for the main integration-tests
     runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
     permissions:
       contents: write
@@ -235,43 +274,6 @@ jobs:
           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
           cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
 
-  integration-tests:
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
-      cancel-in-progress: true
-    needs:
-      - start-runner
-      - build-and-push-image # Wait for the docker image to be built
-      - build-and-push-image-rocm
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
-    env:
-      DOCKER_VOLUME: /cache
-    steps:
-      - uses: actions/checkout@v2
-      - name: Inject slug/short variables
-        uses: rlespinasse/github-slug-action@v4.4.1
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.9
-      - name: Tailscale
-        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-        with:
-          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
-      - name: Install
-        run: |
-          make install-integration-tests
-      - name: Run tests
-        run: |
-          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-          pytest -s -vv integration-tests
-
   stop-runner:
     name: Stop self-hosted EC2 runner
     needs:
server/poetry.lock (generated, 2127 changed lines)
File diff suppressed because it is too large
(requirements file; file name not shown in this capture)

@@ -1,5 +1,5 @@
 backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
-bitsandbytes==0.41.2.post2 ; python_version >= "3.9" and python_version < "3.13"
+bitsandbytes==0.41.3.post2 ; python_version >= "3.9" and python_version < "3.13"
 certifi==2023.11.17 ; python_version >= "3.9" and python_version < "3.13"
 charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13"
 click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
@@ -8,14 +8,14 @@ deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.13.1 ; python_version >= "3.9" and python_version < "3.13"
 fsspec==2023.10.0 ; python_version >= "3.9" and python_version < "3.13"
-googleapis-common-protos==1.61.0 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.62.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.4 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
-idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.6 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.2 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -37,11 +37,11 @@ safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.11.4 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==69.0.2 ; python_version >= "3.9" and python_version < "3.13"
-tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.15.0 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.33.3 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.36.1 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.9.0 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.1.0 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
(second requirements file with the same dependency bumps; file name not shown in this capture)

@@ -7,14 +7,14 @@ deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.13.1 ; python_version >= "3.9" and python_version < "3.13"
 fsspec==2023.10.0 ; python_version >= "3.9" and python_version < "3.13"
-googleapis-common-protos==1.61.0 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.62.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.4 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
-idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.6 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.2 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -36,11 +36,11 @@ safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.11.4 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==69.0.2 ; python_version >= "3.9" and python_version < "3.13"
-tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.15.0 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.33.3 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.36.1 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.9.0 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.1.0 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
(Mistral modeling file; file name not shown in this capture)

@@ -27,11 +27,6 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.flash_attn import (
-    attention,
-    HAS_FLASH_ATTN_V2_ROCM,
-    HAS_FLASH_ATTN_V2_CUDA,
-)
 from text_generation_server.utils.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
@@ -43,10 +38,6 @@ from text_generation_server.utils.layers import (
 )
 
 
-if not HAS_FLASH_ATTN_V2_CUDA and not HAS_FLASH_ATTN_V2_ROCM:
-    raise ImportError("Mistral model requires flash attn v2")
-
-
 
 class MistralConfig(PretrainedConfig):
     model_type = "mistral"
(Mixtral modeling file; file name not shown in this capture)

@@ -27,12 +27,9 @@ from torch import nn
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
+from loguru import logger
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.flash_attn import (
-    HAS_FLASH_ATTN_V2_ROCM,
-    HAS_FLASH_ATTN_V2_CUDA,
-)
 from text_generation_server.utils.layers import (
     FastLinear,
     FastRMSNorm,
@@ -44,18 +41,13 @@ from text_generation_server.utils.layers import (
     get_linear,
 )
 
-if not HAS_FLASH_ATTN_V2_CUDA and not HAS_FLASH_ATTN_V2_ROCM:
-    raise ImportError("Mixtral model requires flash attn v2")
-
-try:
-    import megablocks.ops as ops
-except ImportError:
-    raise ImportError("Mixtral model requires megablocks to be installed")
-
+HAS_MEGABLOCKS = True
 try:
     import stk
+    import megablocks.ops as ops
 except ImportError:
-    raise ImportError("Mixtral model requires stk to be installed")
+    logger.warning("Mixtral: megablocks is not installed")
+    HAS_MEGABLOCKS = False
 
 
 class MixtralConfig(PretrainedConfig):
@@ -590,7 +582,7 @@ class BlockSparseMoE(nn.Module):
         return out
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if len(x) > 256:
+        if len(x) > 256 and HAS_MEGABLOCKS:
             return self.sparse_forward(x)
         # This is faster when there is not a lot of tokens
         return self.dense_forward(x)
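Taken together with the import change above, the forward pass now only takes the megablocks-backed sparse path when the dependency is actually available. A condensed sketch of that gating logic (ToyMoE, sparse_forward, and dense_forward are hypothetical stand-ins for the real BlockSparseMoE internals):

import torch
from torch import nn

HAS_MEGABLOCKS = False  # set by the guarded import shown earlier


class ToyMoE(nn.Module):
    def sparse_forward(self, x: torch.Tensor) -> torch.Tensor:
        # stand-in for the megablocks grouped-GEMM expert computation
        return x

    def dense_forward(self, x: torch.Tensor) -> torch.Tensor:
        # stand-in for the plain dense expert computation
        return x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Sparse kernels only pay off for larger token counts, and are only
        # usable when megablocks imported successfully.
        if len(x) > 256 and HAS_MEGABLOCKS:
            return self.sparse_forward(x)
        # This is faster when there is not a lot of tokens
        return self.dense_forward(x)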