From 659ce4f3fc940e598567ca6ae9de371aad3ef259 Mon Sep 17 00:00:00 2001 From: drbh Date: Fri, 14 Mar 2025 15:33:06 +0000 Subject: [PATCH] feat: add tests for image types and remove alpha from png --- ...est_flash_gemma3_image_base64_rgb_jpg.json | 26 +++++++ ...est_flash_gemma3_image_base64_rgb_png.json | 26 +++++++ .../test_flash_gemma3_image_base64_rgba.json | 26 +++++++ integration-tests/models/test_flash_gemma3.py | 77 +++++++++++++++++++ .../gemma3/image_processing_gemma3.py | 2 +- 5 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json create mode 100644 integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json new file mode 100644 index 000000000..ae67e0060 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image.\n\nThe image is a solid, bright white color. There is nothing else visible within it. \n\nIt's essentially a blank white canvas or a completely white square. \n\nIs there anything specific you'd like me to do with this image, such as describe it further or imagine what it might represent?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741965894, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 74, + "prompt_tokens": 277, + "total_tokens": 351 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json new file mode 100644 index 000000000..afbfba30a --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image. \n\nThe image is entirely white, with a very subtle, faint outline of a stylized, cartoonish figure. It appears to be a simplified depiction of a person, likely a child, with a wide-eyed expression and a small, rounded body. \n\nIt's almost like a minimalist, iconic representation. \n\nDo you want me to try and describe it in more detail or perhaps speculate about the context of the image?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741965892, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 98, + "prompt_tokens": 277, + "total_tokens": 375 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json new file mode 100644 index 000000000..1b97d2615 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. \n\nDo you want me to describe any specific element of the image in more detail?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741966313, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 67, + "prompt_tokens": 277, + "total_tokens": 344 + } +} diff --git a/integration-tests/models/test_flash_gemma3.py b/integration-tests/models/test_flash_gemma3.py index 1df85b345..0a14747d2 100644 --- a/integration-tests/models/test_flash_gemma3.py +++ b/integration-tests/models/test_flash_gemma3.py @@ -1,3 +1,7 @@ +import base64 +from io import BytesIO +from PIL import Image + import pytest @@ -91,3 +95,76 @@ async def test_exceed_window(flash_gemma3, response_snapshot): ) assert response.details.generated_tokens == 16 assert response == response_snapshot + + +# Helper function to convert a Pillow image to a base64 data URL +def image_to_data_url(img: Image.Image, fmt: str) -> str: + buffer = BytesIO() + img.save(buffer, format=fmt) + img_data = buffer.getvalue() + b64_str = base64.b64encode(img_data).decode("utf-8") + mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg" + return f"data:{mime_type};base64,{b64_str}" + + +async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_snapshot): + # Create an empty 100x100 PNG image with alpha (transparent background) + img = Image.new("RGBA", (100, 100), (0, 0, 0, 0)) + data_url = image_to_data_url(img, "PNG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + { + "type": "text", + "text": "What do you see in this transparent image?", + }, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_snapshot): + # Create an empty 100x100 PNG image without alpha (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "PNG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this plain image?"}, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_snapshot): + # Create an empty 100x100 JPEG image (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "JPEG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this JPEG image?"}, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot diff --git a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py index 803d81ead..2972abeab 100644 --- a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py +++ b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py @@ -263,7 +263,7 @@ class Gemma3ImageProcessor(BaseImageProcessor): return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - do_convert_rgb: bool = None, + do_convert_rgb: bool = True, do_pan_and_scan: bool = None, pan_and_scan_min_crop_size: int = None, pan_and_scan_max_num_crops: int = None,