https://github.com/huggingface/text-generation-inference.git
commit a95e6d603d (parent: 3600fc9dbe)
.github/workflows/build.yaml (78 changed lines)
@@ -146,11 +146,50 @@ jobs:
           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
           cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min
 
+  integration-tests:
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    needs:
+      - start-runner
+      - build-and-push-image # Wait for the docker image to be built
+    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    env:
+      DOCKER_VOLUME: /cache
+    steps:
+      - uses: actions/checkout@v2
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4.4.1
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Tailscale
+        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
+        with:
+          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+      - name: Prepare disks
+        run: |
+          sudo mkfs -t ext4 /dev/nvme1n1
+          sudo mkdir ${{ env.DOCKER_VOLUME }}
+          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
+      - name: Install
+        run: |
+          make install-integration-tests
+      - name: Run tests
+        run: |
+          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
+          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          pytest -s -vv integration-tests
+
   build-and-push-image-rocm:
     concurrency:
       group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
-    needs: start-runner # required to start the main job when the runner is ready
+    needs:
+      - start-runner
+      - build-and-push-image # Wait for the main docker image to be built
+      - integration-tests # Wait for the main integration-tests
     runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
     permissions:
       contents: write
@@ -235,43 +274,6 @@ jobs:
           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
           cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
 
-  integration-tests:
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
-      cancel-in-progress: true
-    needs:
-      - start-runner
-      - build-and-push-image # Wait for the docker image to be built
-      - build-and-push-image-rocm
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
-    env:
-      DOCKER_VOLUME: /cache
-    steps:
-      - uses: actions/checkout@v2
-      - name: Inject slug/short variables
-        uses: rlespinasse/github-slug-action@v4.4.1
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: 3.9
-      - name: Tailscale
-        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
-        with:
-          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
-      - name: Prepare disks
-        run: |
-          sudo mkfs -t ext4 /dev/nvme1n1
-          sudo mkdir ${{ env.DOCKER_VOLUME }}
-          sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
-      - name: Install
-        run: |
-          make install-integration-tests
-      - name: Run tests
-        run: |
-          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
-          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-          pytest -s -vv integration-tests
-
   stop-runner:
     name: Stop self-hosted EC2 runner
     needs:
server/poetry.lock (generated, 2127 changed lines)
File diff suppressed because it is too large
(requirements file; file name not shown in this capture)

@@ -1,5 +1,5 @@
 backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
-bitsandbytes==0.41.2.post2 ; python_version >= "3.9" and python_version < "3.13"
+bitsandbytes==0.41.3.post2 ; python_version >= "3.9" and python_version < "3.13"
 certifi==2023.11.17 ; python_version >= "3.9" and python_version < "3.13"
 charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13"
 click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
@@ -8,14 +8,14 @@ deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.13.1 ; python_version >= "3.9" and python_version < "3.13"
 fsspec==2023.10.0 ; python_version >= "3.9" and python_version < "3.13"
-googleapis-common-protos==1.61.0 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.62.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.4 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
-idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.6 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.2 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -37,11 +37,11 @@ safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.11.4 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==69.0.2 ; python_version >= "3.9" and python_version < "3.13"
-tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.15.0 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.33.3 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.36.1 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.9.0 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.1.0 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
(second requirements file with the same dependency bumps; file name not shown in this capture)

@@ -7,14 +7,14 @@ deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
 filelock==3.13.1 ; python_version >= "3.9" and python_version < "3.13"
 fsspec==2023.10.0 ; python_version >= "3.9" and python_version < "3.13"
-googleapis-common-protos==1.61.0 ; python_version >= "3.9" and python_version < "3.13"
+googleapis-common-protos==1.62.0 ; python_version >= "3.9" and python_version < "3.13"
 grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-reflection==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio-status==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
-grpcio==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-reflection==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio-status==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
+grpcio==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
 hf-transfer==0.1.4 ; python_version >= "3.9" and python_version < "3.13"
-huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
-idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
+huggingface-hub==0.19.4 ; python_version >= "3.9" and python_version < "3.13"
+idna==3.6 ; python_version >= "3.9" and python_version < "3.13"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
 numpy==1.26.2 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
@@ -36,11 +36,11 @@ safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
 scipy==1.11.4 ; python_version >= "3.9" and python_version < "3.13"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
 setuptools==69.0.2 ; python_version >= "3.9" and python_version < "3.13"
-tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
+tokenizers==0.15.0 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.33.3 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.36.1 ; python_version >= "3.9" and python_version < "3.13"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
-typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
+typing-extensions==4.9.0 ; python_version >= "3.9" and python_version < "3.13"
 urllib3==2.1.0 ; python_version >= "3.9" and python_version < "3.13"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
 wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
(Mistral modeling file; file name not shown in this capture)

@@ -27,11 +27,6 @@ from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.flash_attn import (
-    attention,
-    HAS_FLASH_ATTN_V2_ROCM,
-    HAS_FLASH_ATTN_V2_CUDA,
-)
 from text_generation_server.utils.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
@@ -43,10 +38,6 @@ from text_generation_server.utils.layers import (
 )
 
 
-if not HAS_FLASH_ATTN_V2_CUDA and not HAS_FLASH_ATTN_V2_ROCM:
-    raise ImportError("Mistral model requires flash attn v2")
-
-
 
 class MistralConfig(PretrainedConfig):
     model_type = "mistral"
(Mixtral modeling file; file name not shown in this capture)

@@ -27,12 +27,9 @@ from torch import nn
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 from typing import Optional, List, Tuple
+from loguru import logger
 
 from text_generation_server.utils import paged_attention, flash_attn
-from text_generation_server.utils.flash_attn import (
-    HAS_FLASH_ATTN_V2_ROCM,
-    HAS_FLASH_ATTN_V2_CUDA,
-)
 from text_generation_server.utils.layers import (
     FastLinear,
     FastRMSNorm,
@@ -44,18 +41,13 @@ from text_generation_server.utils.layers import (
     get_linear,
 )
 
-if not HAS_FLASH_ATTN_V2_CUDA and not HAS_FLASH_ATTN_V2_ROCM:
-    raise ImportError("Mixtral model requires flash attn v2")
-
-try:
-    import megablocks.ops as ops
-except ImportError:
-    raise ImportError("Mixtral model requires megablocks to be installed")
-
+HAS_MEGABLOCKS = True
 try:
     import stk
+    import megablocks.ops as ops
 except ImportError:
-    raise ImportError("Mixtral model requires stk to be installed")
+    logger.warning("Mixtral: megablocks is not installed")
+    HAS_MEGABLOCKS = False
 
 
 class MixtralConfig(PretrainedConfig):
@@ -590,7 +582,7 @@ class BlockSparseMoE(nn.Module):
         return out
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if len(x) > 256:
+        if len(x) > 256 and HAS_MEGABLOCKS:
             return self.sparse_forward(x)
         # This is faster when there is not a lot of tokens
         return self.dense_forward(x)
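Taken together with the import change above, the forward pass now only takes the megablocks-backed sparse path when the dependency is actually available. A condensed sketch of that gating logic (ToyMoE, sparse_forward, and dense_forward are hypothetical stand-ins for the real BlockSparseMoE internals):

import torch
from torch import nn

HAS_MEGABLOCKS = False  # set by the guarded import shown earlier


class ToyMoE(nn.Module):
    def sparse_forward(self, x: torch.Tensor) -> torch.Tensor:
        # stand-in for the megablocks grouped-GEMM expert computation
        return x

    def dense_forward(self, x: torch.Tensor) -> torch.Tensor:
        # stand-in for the plain dense expert computation
        return x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Sparse kernels only pay off for larger token counts, and are only
        # usable when megablocks imported successfully.
        if len(x) > 256 and HAS_MEGABLOCKS:
            return self.sparse_forward(x)
        # This is faster when there is not a lot of tokens
        return self.dense_forward(x)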