mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-09-10 20:04:52 +00:00
remove require_backend decorators on handles; for some reason this fails in GitHub Actions
This commit is contained in:
parent
7e0f4f25c7
commit
284894303a
4
.github/workflows/build.yaml
vendored
4
.github/workflows/build.yaml
vendored
@ -191,7 +191,7 @@ jobs:
|
||||
pwd
|
||||
echo "ls:"
|
||||
ls
|
||||
python integration-tests/clean_cache_and_download.py --token ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
python integration-tests/clean_cache_and_download.py --token ${{ secrets.HF_TOKEN }}
|
||||
fi
|
||||
|
||||
|
||||
@ -239,7 +239,7 @@ jobs:
|
||||
- name: Run tests
|
||||
run: |
|
||||
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
|
||||
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HF_TOKEN }}
|
||||
|
||||
export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
|
||||
echo "DOCKER_IMAGE:"
|
||||
|
@ -1,17 +1,15 @@
|
||||
import pytest
|
||||
|
||||
from testing_utils import require_backend_async, require_backend
|
||||
from testing_utils import require_backend_async
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend("cuda")
|
||||
def bloom_560_handle(launcher):
|
||||
with launcher("bigscience/bloom-560m") as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend_async("cuda")
|
||||
async def bloom_560(bloom_560_handle):
|
||||
await bloom_560_handle.health(240)
|
||||
return bloom_560_handle.client
|
||||
|
@ -1,10 +1,9 @@
|
||||
import pytest
|
||||
|
||||
from testing_utils import SYSTEM, is_flaky_async, require_backend_async, require_backend
|
||||
from testing_utils import SYSTEM, is_flaky_async, require_backend_async
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend("cuda", "rocm")
|
||||
def flash_llama_awq_handle_sharded(launcher):
|
||||
if SYSTEM == "rocm":
|
||||
# On ROCm, for awq checkpoints, we need to use gptq kernel that supports ROCm.
|
||||
@ -21,7 +20,6 @@ def flash_llama_awq_handle_sharded(launcher):
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend_async("cuda", "rocm")
|
||||
async def flash_llama_awq_sharded(flash_llama_awq_handle_sharded):
|
||||
await flash_llama_awq_handle_sharded.health(300)
|
||||
return flash_llama_awq_handle_sharded.client
|
||||
|
@ -1,19 +1,17 @@
|
||||
import pytest
|
||||
|
||||
from testing_utils import require_backend_async, require_backend
|
||||
from testing_utils import require_backend_async
|
||||
|
||||
# These tests do not pass on ROCm, that does not support head_dim > 128 (2b model is 256).
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend("cuda", "xpu")
|
||||
def flash_gemma_handle(launcher):
|
||||
with launcher("google/gemma-2b", num_shard=1) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend_async("cuda", "xpu")
|
||||
async def flash_gemma(flash_gemma_handle):
|
||||
await flash_gemma_handle.health(300)
|
||||
return flash_gemma_handle.client
|
||||
|
@ -1,19 +1,17 @@
|
||||
import pytest
|
||||
|
||||
from testing_utils import require_backend_async, require_backend
|
||||
from testing_utils import require_backend_async
|
||||
|
||||
# These tests do not pass on ROCm, that does not support head_dim > 128 (2b model is 256).
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend("cuda", "xpu")
|
||||
def flash_gemma_gptq_handle(launcher):
|
||||
with launcher("TechxGenus/gemma-2b-GPTQ", num_shard=1, quantize="gptq") as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend_async("cuda", "xpu")
|
||||
async def flash_gemma_gptq(flash_gemma_gptq_handle):
|
||||
await flash_gemma_gptq_handle.health(300)
|
||||
return flash_gemma_gptq_handle.client
|
||||
|
@ -3,7 +3,6 @@ from testing_utils import require_backend_async, require_backend
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend("cuda")
|
||||
def flash_llama_exl2_handle(launcher):
|
||||
with launcher(
|
||||
"turboderp/Llama-3-8B-Instruct-exl2",
|
||||
@ -18,7 +17,6 @@ def flash_llama_exl2_handle(launcher):
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend_async("cuda")
|
||||
async def flash_llama_exl2(flash_llama_exl2_handle):
|
||||
await flash_llama_exl2_handle.health(300)
|
||||
return flash_llama_exl2_handle.client
|
||||
|
@ -3,13 +3,12 @@ import requests
|
||||
import io
|
||||
import base64
|
||||
|
||||
from testing_utils import require_backend_async, require_backend
|
||||
from testing_utils import require_backend_async
|
||||
|
||||
# These tests do not pass on ROCm, that does not support head_dim > 128 (2b model is 256).
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend("cuda", "xpu")
|
||||
def flash_pali_gemma_handle(launcher):
|
||||
with launcher(
|
||||
"google/paligemma-3b-pt-224",
|
||||
@ -22,7 +21,6 @@ def flash_pali_gemma_handle(launcher):
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend_async("cuda", "xpu")
|
||||
async def flash_pali_gemma(flash_pali_gemma_handle):
|
||||
await flash_pali_gemma_handle.health(300)
|
||||
return flash_pali_gemma_handle.client
|
||||
|
@ -1,19 +1,17 @@
|
||||
import pytest
|
||||
|
||||
from testing_utils import require_backend_async, require_backend
|
||||
from testing_utils import require_backend_async
|
||||
|
||||
# These tests do not pass on ROCm, with different generations.
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend("cuda")
|
||||
def flash_phi_handle(launcher):
|
||||
with launcher("microsoft/phi-2", num_shard=1) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend_async("cuda")
|
||||
async def flash_phi(flash_phi_handle):
|
||||
await flash_phi_handle.health(300)
|
||||
return flash_phi_handle.client
|
||||
|
@ -4,14 +4,12 @@ from testing_utils import require_backend_async, require_backend
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend("cuda")
|
||||
def fused_kernel_mamba_handle(launcher):
|
||||
with launcher("state-spaces/mamba-130m", num_shard=1) as handle:
|
||||
yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@require_backend_async("cuda")
|
||||
async def fused_kernel_mamba(fused_kernel_mamba_handle):
|
||||
await fused_kernel_mamba_handle.health(300)
|
||||
return fused_kernel_mamba_handle.client
|
||||
|
Loading…
Reference in New Issue
Block a user