text-generation-inference/integration-tests/models/test_flash_qwen2_vl_warmup.py
Latest commit eecca27113 by drbh (2025-01-17 11:50:41 -05:00)
feat: improve qwen2-vl startup (#2802)

* feat: tokenize each request individually and increase warmup image size
* feat: adjust rotary embed and avoid cuda graphs of size 2 and smaller (see the sketch after this commit message)
* fix: address image resize and rebase changes
* feat: update to run qwen2-vl tests
* fix: tweak param types
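
The "avoid cuda graphs of size 2 and smaller" bullet refers to skipping graph capture for tiny batch sizes during warmup. A minimal sketch of that idea follows; the function name and threshold handling are illustrative assumptions, not the actual TGI source.

# Hypothetical helper (not from TGI): drop cuda-graph batch sizes of 2 and
# smaller before capture, since tiny graphs add warmup time for little benefit.
def filter_cuda_graph_sizes(requested_sizes: list[int]) -> list[int]:
    return sorted(size for size in set(requested_sizes) if size > 2)

assert filter_cuda_graph_sizes([1, 2, 4, 8, 16]) == [4, 8, 16]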


import pytest


@pytest.fixture(scope="module")
def flash_qwen2_vl_handle(launcher):
    # Launch the server with deliberately small token limits: this test
    # targets startup/warmup behavior, not long-generation quality.
    with launcher(
        "Qwen/Qwen2-VL-2B-Instruct",
        max_input_length=40,
        max_batch_prefill_tokens=50,
        max_total_tokens=51,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_qwen2(flash_qwen2_vl_handle):
    # Wait up to 300 seconds for the server to report healthy,
    # then hand the client to the tests.
    await flash_qwen2_vl_handle.health(300)
    return flash_qwen2_vl_handle.client


@pytest.mark.private
async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot):
    # Text-only chat request with a fixed seed; the snapshot pins the
    # exact completion so warmup regressions surface as diffs.
    response = await flash_qwen2.chat(
        max_tokens=20,
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is the color of the sky?"},
                ],
            },
        ],
    )

    assert response.choices[0].message.content == "The correct answer is: blue"
    assert response == response_snapshot
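
Since the commit also increases the warmup image size, a companion test exercising the image path might look like the sketch below. It is not part of this file: the image URL is a placeholder, and the fixture's tiny token budget (max_input_length=40) would likely need raising before an image request of this kind could fit.

# Hypothetical image-input test, following the text-only pattern above.
@pytest.mark.private
async def test_flash_qwen2_vl_with_image(flash_qwen2, response_snapshot):
    response = await flash_qwen2.chat(
        max_tokens=20,
        seed=42,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://example.com/image.png"},
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            },
        ],
    )
    assert response == response_snapshot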