Make awq install optional + integration test values.

Nicolas Patry 2023-09-25 09:19:12 +00:00
parent a8f870aa75
commit 02d4f62a1f
6 changed files with 556 additions and 12 deletions

View File

@@ -175,6 +175,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy build artifacts from exllama kernels builder
 COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
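
The new stage mirrors the exllama copy just above it: the AWQ kernels are compiled in a separate awq-kernels-builder stage and only the built extension is copied into the runtime image. A quick way to confirm the extension actually landed in site-packages is a sketch like the following; the module names are assumptions inferred from this codebase, not guarantees:

# sanity_check.py - hypothetical helper, run inside the final image
import importlib.util

# Module names assumed from the imports used elsewhere in this repo.
for name in ("awq_inference_engine", "exllama_kernels"):
    found = importlib.util.find_spec(name) is not None
    print(f"{name}: {'ok' if found else 'MISSING'}")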

View File

@@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2597656,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0488281,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023513794,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008430481,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
}
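
This snapshot belongs to the plain greedy test further down in this diff: `seed` is null, so every step takes the argmax token and the logprobs above are reproducible. One invariant visible in the data is that `generated_text` is the concatenation of the token texts; a throwaway check (snapshot filename assumed) could look like:

import json

# Throwaway check against the greedy snapshot shown above.
with open("test_flash_llama_awq.json") as f:  # path is an assumption
    snap = json.load(f)

tokens = snap["details"]["tokens"]
assert "".join(t["text"] for t in tokens) == snap["generated_text"]
assert snap["details"]["generated_tokens"] == len(tokens) == 10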

View File

@@ -0,0 +1,89 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": 0,
"tokens": [
{
"id": 29896,
"logprob": 0.0,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -0.6254883,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
},
{
"id": 29918,
"logprob": -0.20141602,
"special": false,
"text": "_"
},
{
"id": 29906,
"logprob": -0.6254883,
"special": false,
"text": "2"
},
{
"id": 29871,
"logprob": 0.0,
"special": false,
"text": " "
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": 0.0,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "Test request1\n_2 "
}
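
This snapshot comes from the all-params test: `seed` is 0, sampling is in play, and `generated_text` echoes the prompt, unlike the greedy snapshot. The `logprob: 0.0` entries mean the sampled token had probability 1 after the logits were processed. A few structural invariants hold for every snapshot file in this commit; a small validation sketch (not part of the test suite):

import json

def check_snapshot(path: str) -> None:
    """Structural invariants shared by all three snapshot files in this commit."""
    with open(path) as f:
        data = json.load(f)
    # Load tests store a list of responses; single tests store one object.
    responses = data if isinstance(data, list) else [data]
    for resp in responses:
        details = resp["details"]
        assert details["generated_tokens"] == len(details["tokens"])
        # Logprobs are log-probabilities, hence never positive.
        assert all(t["logprob"] <= 0.0 for t in details["tokens"])
        # The BOS prefill token carries no logprob.
        assert details["prefill"][0]["logprob"] is None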

View File

@@ -0,0 +1,358 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2617188,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0498047,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023529053,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008300781,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2617188,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0498047,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023529053,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008300781,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2617188,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0498047,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023529053,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008300781,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 1,
"logprob": null,
"text": "<s>"
},
{
"id": 4321,
"logprob": -8.515625,
"text": "Test"
},
{
"id": 2009,
"logprob": -15.4140625,
"text": "request"
}
],
"seed": null,
"tokens": [
{
"id": 29896,
"logprob": -2.0292969,
"special": false,
"text": "1"
},
{
"id": 13,
"logprob": -2.2617188,
"special": false,
"text": "\n"
},
{
"id": 30166,
"logprob": -3.8671875,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -1.0498047,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.24523926,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.07897949,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.023529053,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.011444092,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.008300781,
"special": false,
"text": ""
},
{
"id": 30166,
"logprob": -0.007648468,
"special": false,
"text": ""
}
],
"top_tokens": null
},
"generated_text": "1\n"
}
]
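
All four entries in this array are identical down to the logprobs, which is the point of the load test: greedy decoding must stay deterministic when the same prompt is served concurrently in one batch. `generate_load` is a fixture provided by the integration-test harness; conceptually it is close to this sketch (names and exact semantics are assumptions):

import asyncio

# Hypothetical approximation of the generate_load fixture used in the test below.
async def generate_load(client, prompt: str, max_new_tokens: int, n: int):
    futures = [
        client.generate(prompt, max_new_tokens=max_new_tokens)
        for _ in range(n)
    ]
    # Fire all requests concurrently so the server batches them together.
    return await asyncio.gather(*futures)

The test then asserts that all four responses match each other and this snapshot.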

View File

@@ -2,21 +2,21 @@ import pytest


 @pytest.fixture(scope="module")
-def flash_llama_gptq_handle(launcher):
+def flash_llama_awq_handle(launcher):
     with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=2, quantize="awq") as handle:
         yield handle


 @pytest.fixture(scope="module")
-async def flash_llama_gptq(flash_llama_gptq_handle):
-    await flash_llama_gptq_handle.health(300)
-    return flash_llama_gptq_handle.client
+async def flash_llama_awq(flash_llama_awq_handle):
+    await flash_llama_awq_handle.health(300)
+    return flash_llama_awq_handle.client


 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
-    response = await flash_llama_gptq.generate(
+async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
         "Test request", max_new_tokens=10, decoder_input_details=True
     )
@@ -26,8 +26,8 @@ async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):

 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
-    response = await flash_llama_gptq.generate(
+async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
         "Test request",
         max_new_tokens=10,
         repetition_penalty=1.2,
@@ -48,11 +48,11 @@ async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):

 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq_load(
-    flash_llama_gptq, generate_load, response_snapshot
+async def test_flash_llama_awq_load(
+    flash_llama_awq, generate_load, response_snapshot
 ):
     responses = await generate_load(
-        flash_llama_gptq, "Test request", max_new_tokens=10, n=4
+        flash_llama_awq, "Test request", max_new_tokens=10, n=4
     )

     assert len(responses) == 4
View File

@@ -17,7 +17,13 @@ except ImportError:

 from accelerate import init_empty_weights

 from text_generation_server.utils.gptq.quant_linear import QuantLinear

+HAS_AWQ = True
+try:
+    from text_generation_server.utils.awq.quantize.qmodule import WQLinear
+except ImportError:
+    HAS_AWQ = False
+
 try:
     major, _minor = torch.cuda.get_device_capability()
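
The `HAS_AWQ` flag is what makes the install optional: a missing AWQ kernel package no longer crashes the server at import time, and callers can fail late with a clear message only when AWQ quantization is actually requested. A hedged sketch of that consumption pattern (the real dispatch in this file, and the WQLinear constructor signature, may differ):

# Sketch: fail only when AWQ is actually requested, not at import time.
def get_awq_linear(qweight, qzeros, scales, bits: int, group_size: int):
    if not HAS_AWQ:
        raise NotImplementedError(
            "AWQ kernels are not installed; build the awq-kernels-builder "
            "stage (see the Dockerfile change above) or install them manually."
        )
    # Constructor arguments are assumptions, not the confirmed API.
    return WQLinear(
        w_bit=bits, group_size=group_size,
        qweight=qweight, qzeros=qzeros, scales=scales, bias=None,
    )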