From 9eaa163239c556592c2a6197060ee703a0a05921 Mon Sep 17 00:00:00 2001 From: drbh Date: Fri, 31 Jan 2025 18:30:32 +0000 Subject: [PATCH] fix: add more test and improve model generation --- .../test_flash_qwen2_vl_bay.json | 26 ++++++++++ .../test_flash_qwen2_vl_inpaint.json | 26 ++++++++++ .../test_flash_qwen2_vl_simple.json | 10 ++-- .../models/test_flash_qwen2_vl.py | 50 +++++++++++++++++-- .../models/custom_modeling/qwen2_vl.py | 1 - 5 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json create mode 100644 integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json new file mode 100644 index 00000000..25a1abc7 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_bay.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image showcases a stunning cityscape, featuring the iconic Statue of Liberty in the foreground. The image displays Lady Liberty's imposing presence, with her towering base standing beside her. Behind the statue, the city's skyline extends across the horizon, adorned with numerous tall buildings, including the Empire State Building and other notable skyscrapers. The water reflecting the sun's rays creates a serene and picturesque scene, emphasizing the beauty and resilience of this global landmark. The sky is a clear, pale blue, adding to the overall tranquility of the scene.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1738348090, + "id": "", + "model": "Qwen/Qwen2-VL-7B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.1.1-dev0-native", + "usage": { + "completion_tokens": 110, + "prompt_tokens": 8736, + "total_tokens": 8846 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json new file mode 100644 index 00000000..325e658f --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_inpaint.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "The image shows a stylized scene set in what appears to be a diner or restaurant. In the foreground, there is a table with various food items, including a burger with lettuce and tomato, a bowl of fries, and a drink in a cup with a straw. On the right side of the table, there is an owl sitting alertly, looking directly at the camera. Behind the owl and the table, there is a large, green, dinosaur-like creature resembling Godzilla, with its mouth open and tongue visible. In the background, the diner's decor includes various signs and posters, with a green sign reading \"Basta\" and another sign that says \"Tabasco.\" The setting has a retro or vintage feel, with fluorescent lighting overhead and clean, polished surfaces.", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1738348100, + "id": "", + "model": "Qwen/Qwen2-VL-7B-Instruct", + "object": "chat.completion", + "system_fingerprint": "3.1.1-dev0-native", + "usage": { + "completion_tokens": 156, + "prompt_tokens": 5375, + "total_tokens": 5531 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json index 49f33225..6b6017c9 100644 --- a/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json +++ b/integration-tests/models/__snapshots__/test_flash_qwen2_vl/test_flash_qwen2_vl_simple.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape.", + "content": "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character.", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1737645979, + "created": 1738347908, "id": "", "model": "Qwen/Qwen2-VL-7B-Instruct", "object": "chat.completion", - "system_fingerprint": "3.0.2-dev0-native", + "system_fingerprint": "3.1.1-dev0-native", "usage": { - "completion_tokens": 58, + "completion_tokens": 89, "prompt_tokens": 1364, - "total_tokens": 1422 + "total_tokens": 1453 } } diff --git a/integration-tests/models/test_flash_qwen2_vl.py b/integration-tests/models/test_flash_qwen2_vl.py index dacd92a8..5a12eba8 100644 --- a/integration-tests/models/test_flash_qwen2_vl.py +++ b/integration-tests/models/test_flash_qwen2_vl.py @@ -35,7 +35,7 @@ async def test_flash_qwen2_vl_simple(flash_qwen2, response_snapshot): assert ( response.choices[0].message.content - == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." + == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character." ) assert response == response_snapshot @@ -72,7 +72,51 @@ async def test_flash_qwen2_vl_simple_streaming(flash_qwen2, response_snapshot): assert ( generated - == "The image depicts an anthropomorphic rabbit, wearing a futuristic spacesuit, in an extraterrestrial environment. The setting appears to be a red planet resembling Mars, with rugged terrain and rocky formations in the background. The moon is visible in the distant sky, adding to the lunar landscape." + == "The image depicts an anthropomorphic rabbit, wearing a spacesuit, standing in a barren, rocky landscape that resembles the surface of another planet, possibly Mars. The rabbit has a red digestive system label on its chest, and the surrounding environment features red sandy terrain and a hazy, floating planet or moon in the background. The scene has a surreal, fantastical quality, blending elements of science fiction and space exploration with a whimsical character." ) - assert count == 58 + assert count == 89 assert last_response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_vl_bay(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + }, + }, + {"type": "text", "text": "Describe the image"}, + ], + }, + ], + ) + assert response == response_snapshot + + +@pytest.mark.private +async def test_flash_qwen2_vl_inpaint(flash_qwen2, response_snapshot): + response = await flash_qwen2.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png" + }, + }, + {"type": "text", "text": "Describe the image"}, + ], + }, + ], + ) + assert response == response_snapshot diff --git a/server/text_generation_server/models/custom_modeling/qwen2_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_vl.py index 65d18963..4031fe8f 100644 --- a/server/text_generation_server/models/custom_modeling/qwen2_vl.py +++ b/server/text_generation_server/models/custom_modeling/qwen2_vl.py @@ -543,7 +543,6 @@ class Qwen2VLForConditionalGeneration(nn.Module): true_max_s=max_s, prefill_cache_indices=prefill_cache_indices, ) - hidden_states, _ = self.norm(hidden_states) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] logits, speculative_logits = self.lm_head(hidden_states)