feat: add tests for image types and remove alpha from png

2025-07-02 22:10:17 +00:00 · 2025-03-14 15:33:06 +00:00 · 2025-03-14 15:33:06 +00:00 · 659ce4f3fc
commit 659ce4f3fc
parent e5ec176bf4
5 changed files with 156 additions and 1 deletions
--- a/integration-tests/models/snapshots/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json
+++ b/integration-tests/models/snapshots/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json
@ -0,0 +1,26 @@
 {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Okay, let's analyze the image.\n\nThe image is a solid, bright white color. There is nothing else visible within it. \n\nIt's essentially a blank white canvas or a completely white square. \n\nIs there anything specific you'd like me to do with this image, such as describe it further or imagine what it might represent?",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1741965894,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.2.1-dev0-native",
  "usage": {
    "completion_tokens": 74,
    "prompt_tokens": 277,
    "total_tokens": 351
  }
 }
--- a/integration-tests/models/snapshots/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json
+++ b/integration-tests/models/snapshots/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json
@ -0,0 +1,26 @@
 {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Okay, let's analyze the image. \n\nThe image is entirely white, with a very subtle, faint outline of a stylized, cartoonish figure. It appears to be a simplified depiction of a person, likely a child, with a wide-eyed expression and a small, rounded body. \n\nIt's almost like a minimalist, iconic representation. \n\nDo you want me to try and describe it in more detail or perhaps speculate about the context of the image?",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1741965892,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.2.1-dev0-native",
  "usage": {
    "completion_tokens": 98,
    "prompt_tokens": 277,
    "total_tokens": 375
  }
 }
--- a/integration-tests/models/snapshots/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json
+++ b/integration-tests/models/snapshots/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json
@ -0,0 +1,26 @@
 {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. \n\nDo you want me to describe any specific element of the image in more detail?",
        "name": null,
        "role": "assistant",
        "tool_calls": null
      },
      "usage": null
    }
  ],
  "created": 1741966313,
  "id": "",
  "model": "google/gemma-3-4b-it",
  "object": "chat.completion",
  "system_fingerprint": "3.2.1-dev0-native",
  "usage": {
    "completion_tokens": 67,
    "prompt_tokens": 277,
    "total_tokens": 344
  }
 }
--- a/integration-tests/models/test_flash_gemma3.py
+++ b/integration-tests/models/test_flash_gemma3.py
@ -1,3 +1,7 @@
 import base64
 from io import BytesIO
 from PIL import Image
 import pytest
@ -91,3 +95,76 @@ async def test_exceed_window(flash_gemma3, response_snapshot):
    )
    assert response.details.generated_tokens == 16
    assert response == response_snapshot
 def image_to_data_url(img: Image.Image, fmt: str) -> str:
    """Convert a Pillow image to a base64 ``data:`` URL.

    Args:
        img: Image to serialize.
        fmt: Pillow format name handed to ``img.save`` (e.g. ``"PNG"`` or
            ``"JPEG"``).

    Returns:
        A string of the form ``data:<mime type>;base64,<encoded image bytes>``.
    """
    buffer = BytesIO()
    img.save(buffer, format=fmt)
    b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
    # Ask Pillow's MIME registry instead of assuming "anything that is not PNG
    # is JPEG", so other formats (GIF, WEBP, ...) are not mislabelled. The
    # lookup happens after save() because the format plugins are expected to
    # be registered by then; fall back to image/<fmt> if the registry has no
    # entry for this format.
    mime_type = Image.MIME.get(fmt.upper(), f"image/{fmt.lower()}")
    return f"data:{mime_type};base64,{b64_str}"
 async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_snapshot):
    """Send a fully transparent RGBA PNG as a data URL and compare with the snapshot."""
    # 100x100 canvas filled with (0, 0, 0, 0): every pixel has alpha 0
    transparent = Image.new("RGBA", (100, 100), (0, 0, 0, 0))
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_to_data_url(transparent, "PNG")},
    }
    text_part = {
        "type": "text",
        "text": "What do you see in this transparent image?",
    }
    user_message = {"role": "user", "content": [image_part, text_part]}
    response = await flash_gemma3.chat(
        seed=42,
        messages=[user_message],
        max_tokens=100,
    )
    assert response == response_snapshot
 async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_snapshot):
    """Send a plain white RGB PNG as a data URL and compare with the snapshot."""
    # 100x100 canvas without an alpha channel, filled with white
    white_png = Image.new("RGB", (100, 100), (255, 255, 255))
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_to_data_url(white_png, "PNG")},
    }
    text_part = {"type": "text", "text": "What do you see in this plain image?"}
    user_message = {"role": "user", "content": [image_part, text_part]}
    response = await flash_gemma3.chat(
        seed=42,
        messages=[user_message],
        max_tokens=100,
    )
    assert response == response_snapshot
 async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_snapshot):
    """Send a plain white RGB JPEG as a data URL and compare with the snapshot."""
    # 100x100 white canvas, serialized with the JPEG encoder
    white_jpeg = Image.new("RGB", (100, 100), (255, 255, 255))
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_to_data_url(white_jpeg, "JPEG")},
    }
    text_part = {"type": "text", "text": "What do you see in this JPEG image?"}
    user_message = {"role": "user", "content": [image_part, text_part]}
    response = await flash_gemma3.chat(
        seed=42,
        messages=[user_message],
        max_tokens=100,
    )
    assert response == response_snapshot
--- a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
+++ b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
@ -263,7 +263,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        do_convert_rgb: bool = None,
+        do_convert_rgb: bool = True,
        do_pan_and_scan: bool = None,
        pan_and_scan_min_crop_size: int = None,
        pan_and_scan_max_num_crops: int = None,