Make awq install optional + integration test values.

Nicolas Patry 2023-09-25 09:19:12 +00:00
parent a8f870aa75
commit 02d4f62a1f
6 changed files with 556 additions and 12 deletions

View File

@@ -175,6 +175,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy build artifacts from exllama kernels builder
 COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
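
The new stage mirrors the exllama copy just above it: the AWQ kernels are compiled in a separate awq-kernels-builder stage and only the built extension is copied into the runtime image. A quick way to confirm the extension actually landed in site-packages is a sketch like the following; the module names are assumptions inferred from this codebase, not guarantees:

# sanity_check.py - hypothetical helper, run inside the final image
import importlib.util

# Module names assumed from the imports used elsewhere in this repo.
for name in ("awq_inference_engine", "exllama_kernels"):
    found = importlib.util.find_spec(name) is not None
    print(f"{name}: {'ok' if found else 'MISSING'}")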

View File

@@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2597656,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0488281,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023513794,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008430481,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
}
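
This snapshot belongs to the plain greedy test further down in this diff: `seed` is null, so every step takes the argmax token and the logprobs above are reproducible. One invariant visible in the data is that `generated_text` is the concatenation of the token texts; a throwaway check (snapshot filename assumed) could look like:

import json

# Throwaway check against the greedy snapshot shown above.
with open("test_flash_llama_awq.json") as f:  # path is an assumption
    snap = json.load(f)

tokens = snap["details"]["tokens"]
assert "".join(t["text"] for t in tokens) == snap["generated_text"]
assert snap["details"]["generated_tokens"] == len(tokens) == 10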

View File

@@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": 0,
"tokens": [
{
"id": 29896,
"logprob": 0.0,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -0.6254883,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
},
{
"id": 29918,
"logprob": -0.20141602,
"special": false,
"text": "_"
},
{
"id": 29906,
"logprob": -0.6254883,
"special": false,
"text": "2"
},
{
"id": 29871,
"logprob": 0.0,
"special": false,
"text": " "
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "Test request1\n_2 "
}
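
This snapshot comes from the all-params test: `seed` is 0, sampling is in play, and `generated_text` echoes the prompt, unlike the greedy snapshot. The `logprob: 0.0` entries mean the sampled token had probability 1 after the logits were processed. A few structural invariants hold for every snapshot file in this commit; a small validation sketch (not part of the test suite):

import json

def check_snapshot(path: str) -> None:
    """Structural invariants shared by all three snapshot files in this commit."""
    with open(path) as f:
        data = json.load(f)
    # Load tests store a list of responses; single tests store one object.
    responses = data if isinstance(data, list) else [data]
    for resp in responses:
        details = resp["details"]
        assert details["generated_tokens"] == len(details["tokens"])
        # Logprobs are log-probabilities, hence never positive.
        assert all(t["logprob"] <= 0.0 for t in details["tokens"])
        # The BOS prefill token carries no logprob.
        assert details["prefill"][0]["logprob"] is None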

View File

@@ -0,0 +1,358 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2617188,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0498047,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023529053,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008300781,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2617188,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0498047,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023529053,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008300781,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2617188,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0498047,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023529053,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008300781,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2617188,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0498047,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023529053,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008300781,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
}
]
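
All four entries in this array are identical down to the logprobs, which is the point of the load test: greedy decoding must stay deterministic when the same prompt is served concurrently in one batch. `generate_load` is a fixture provided by the integration-test harness; conceptually it is close to this sketch (names and exact semantics are assumptions):

import asyncio

# Hypothetical approximation of the generate_load fixture used in the test below.
async def generate_load(client, prompt: str, max_new_tokens: int, n: int):
    futures = [
        client.generate(prompt, max_new_tokens=max_new_tokens)
        for _ in range(n)
    ]
    # Fire all requests concurrently so the server batches them together.
    return await asyncio.gather(*futures)

The test then asserts that all four responses match each other and this snapshot.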

View File

@@ -2,21 +2,21 @@ import pytest


 @pytest.fixture(scope="module")
-def flash_llama_gptq_handle(launcher):
+def flash_llama_awq_handle(launcher):
     with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=2, quantize="awq") as handle:
         yield handle


 @pytest.fixture(scope="module")
-async def flash_llama_gptq(flash_llama_gptq_handle):
-    await flash_llama_gptq_handle.health(300)
-    return flash_llama_gptq_handle.client
+async def flash_llama_awq(flash_llama_awq_handle):
+    await flash_llama_awq_handle.health(300)
+    return flash_llama_awq_handle.client


 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
-    response = await flash_llama_gptq.generate(
+async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
         "Test request", max_new_tokens=10, decoder_input_details=True
     )
@@ -26,8 +26,8 @@ async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):

 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
-    response = await flash_llama_gptq.generate(
+async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
         "Test request",
         max_new_tokens=10,
         repetition_penalty=1.2,
@@ -48,11 +48,11 @@ async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):

 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq_load(
-    flash_llama_gptq, generate_load, response_snapshot
+async def test_flash_llama_awq_load(
+    flash_llama_awq, generate_load, response_snapshot
 ):
     responses = await generate_load(
-        flash_llama_gptq, "Test request", max_new_tokens=10, n=4
+        flash_llama_awq, "Test request", max_new_tokens=10, n=4
     )

     assert len(responses) == 4
View File

@@ -17,7 +17,13 @@ except ImportError:

 from accelerate import init_empty_weights

 from text_generation_server.utils.gptq.quant_linear import QuantLinear

+HAS_AWQ = True
+try:
+    from text_generation_server.utils.awq.quantize.qmodule import WQLinear
+except ImportError:
+    HAS_AWQ = False
+
 try:
     major, _minor = torch.cuda.get_device_capability()
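
The `HAS_AWQ` flag is what makes the install optional: a missing AWQ kernel package no longer crashes the server at import time, and callers can fail late with a clear message only when AWQ quantization is actually requested. A hedged sketch of that consumption pattern (the real dispatch in this file, and the WQLinear constructor signature, may differ):

# Sketch: fail only when AWQ is actually requested, not at import time.
def get_awq_linear(qweight, qzeros, scales, bits: int, group_size: int):
    if not HAS_AWQ:
        raise NotImplementedError(
            "AWQ kernels are not installed; build the awq-kernels-builder "
            "stage (see the Dockerfile change above) or install them manually."
        )
    # Constructor arguments are assumptions, not the confirmed API.
    return WQLinear(
        w_bit=bits, group_size=group_size,
        qweight=qweight, qzeros=qzeros, scales=scales, bias=None,
    )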