Adding a test for FD (flash decoding).

2025-09-12 04:44:52 +00:00 · 2024-09-12 11:12:18 +02:00 · 2024-09-12 11:12:18 +02:00 · f6697baf31
commit f6697baf31
parent 9cca3e0b03
3 changed files with 2784 additions and 0 deletions
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -342,6 +342,7 @@ def launcher(event_loop):
        max_total_tokens: Optional[int] = None,
        lora_adapters: Optional[List[str]] = None,
        cuda_graphs: Optional[List[int]] = None,
+        attention: Optional[str] = None,
    ):
        port = random.randint(8000, 10_000)
        master_port = random.randint(10_000, 20_000)
@@ -401,6 +402,8 @@ def launcher(event_loop):

        if not use_flash_attention:
            env["USE_FLASH_ATTENTION"] = "false"
+        if attention is not None:
+            env["ATTENTION"] = attention

        with tempfile.TemporaryFile("w+") as tmp:
            # We'll output stdout/stderr to a temporary file. Using a pipe
@@ -437,6 +440,7 @@ def launcher(event_loop):
        max_total_tokens: Optional[int] = None,
        lora_adapters: Optional[List[str]] = None,
        cuda_graphs: Optional[List[int]] = None,
+        attention: Optional[str] = None,
    ):
        port = random.randint(8000, 10_000)

@@ -491,6 +495,8 @@ def launcher(event_loop):
        }
        if not use_flash_attention:
            env["USE_FLASH_ATTENTION"] = "false"
+        if attention is not None:
+            env["ATTENTION"] = attention

        if HF_TOKEN is not None:
            env["HF_TOKEN"] = HF_TOKEN
--- a/integration-tests/models/snapshots/test_flash_llama_prefix/test_flash_llama_flashdecoding.json
+++ b/integration-tests/models/snapshots/test_flash_llama_prefix/test_flash_llama_flashdecoding.json
--- a/integration-tests/models/test_flash_llama_prefix.py
+++ b/integration-tests/models/test_flash_llama_prefix.py