mirror of
https://github.com/huggingface/text-generation-inference.git
synced 2025-04-21 06:42:10 +00:00
feat: add tests for image types and remove alpha from png
This commit is contained in:
parent
e5ec176bf4
commit
659ce4f3fc
@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"finish_reason": "stop",
|
||||||
|
"index": 0,
|
||||||
|
"logprobs": null,
|
||||||
|
"message": {
|
||||||
|
"content": "Okay, let's analyze the image.\n\nThe image is a solid, bright white color. There is nothing else visible within it. \n\nIt's essentially a blank white canvas or a completely white square. \n\nIs there anything specific you'd like me to do with this image, such as describe it further or imagine what it might represent?",
|
||||||
|
"name": null,
|
||||||
|
"role": "assistant",
|
||||||
|
"tool_calls": null
|
||||||
|
},
|
||||||
|
"usage": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"created": 1741965894,
|
||||||
|
"id": "",
|
||||||
|
"model": "google/gemma-3-4b-it",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"system_fingerprint": "3.2.1-dev0-native",
|
||||||
|
"usage": {
|
||||||
|
"completion_tokens": 74,
|
||||||
|
"prompt_tokens": 277,
|
||||||
|
"total_tokens": 351
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"finish_reason": "stop",
|
||||||
|
"index": 0,
|
||||||
|
"logprobs": null,
|
||||||
|
"message": {
|
||||||
|
"content": "Okay, let's analyze the image. \n\nThe image is entirely white, with a very subtle, faint outline of a stylized, cartoonish figure. It appears to be a simplified depiction of a person, likely a child, with a wide-eyed expression and a small, rounded body. \n\nIt's almost like a minimalist, iconic representation. \n\nDo you want me to try and describe it in more detail or perhaps speculate about the context of the image?",
|
||||||
|
"name": null,
|
||||||
|
"role": "assistant",
|
||||||
|
"tool_calls": null
|
||||||
|
},
|
||||||
|
"usage": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"created": 1741965892,
|
||||||
|
"id": "",
|
||||||
|
"model": "google/gemma-3-4b-it",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"system_fingerprint": "3.2.1-dev0-native",
|
||||||
|
"usage": {
|
||||||
|
"completion_tokens": 98,
|
||||||
|
"prompt_tokens": 277,
|
||||||
|
"total_tokens": 375
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,26 @@
|
|||||||
|
{
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"finish_reason": "stop",
|
||||||
|
"index": 0,
|
||||||
|
"logprobs": null,
|
||||||
|
"message": {
|
||||||
|
"content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. \n\nDo you want me to describe any specific element of the image in more detail?",
|
||||||
|
"name": null,
|
||||||
|
"role": "assistant",
|
||||||
|
"tool_calls": null
|
||||||
|
},
|
||||||
|
"usage": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"created": 1741966313,
|
||||||
|
"id": "",
|
||||||
|
"model": "google/gemma-3-4b-it",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"system_fingerprint": "3.2.1-dev0-native",
|
||||||
|
"usage": {
|
||||||
|
"completion_tokens": 67,
|
||||||
|
"prompt_tokens": 277,
|
||||||
|
"total_tokens": 344
|
||||||
|
}
|
||||||
|
}
|
@ -1,3 +1,7 @@
|
|||||||
|
import base64
|
||||||
|
from io import BytesIO
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@ -91,3 +95,76 @@ async def test_exceed_window(flash_gemma3, response_snapshot):
|
|||||||
)
|
)
|
||||||
assert response.details.generated_tokens == 16
|
assert response.details.generated_tokens == 16
|
||||||
assert response == response_snapshot
|
assert response == response_snapshot
|
||||||
|
|
||||||
|
|
||||||
|
# Helper function to convert a Pillow image to a base64 data URL
|
||||||
|
def image_to_data_url(img: "Image.Image", fmt: str) -> str:
    """Serialize *img* in the given format and return it as a base64 data URL.

    Args:
        img: Image object exposing ``save(fp, format=...)``.
        fmt: Format name handed to ``img.save`` (for example ``"PNG"`` or
            ``"JPEG"``); matched case-insensitively when picking the MIME
            type.

    Returns:
        A string of the form ``data:<mime type>;base64,<payload>``.
    """
    # Explicit table so that formats other than PNG are not mislabelled as
    # JPEG in the resulting URL.
    mime_types = {
        "PNG": "image/png",
        "JPEG": "image/jpeg",
        "JPG": "image/jpeg",
        "GIF": "image/gif",
        "WEBP": "image/webp",
        "BMP": "image/bmp",
    }

    buffer = BytesIO()
    img.save(buffer, format=fmt)
    b64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")

    # Formats missing from the table fall back to "image/<lowercased name>".
    mime_type = mime_types.get(fmt.upper(), f"image/{fmt.lower()}")
    return f"data:{mime_type};base64,{b64_str}"
|
||||||
|
|
||||||
|
|
||||||
|
async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_snapshot):
    """Send an RGBA PNG as a base64 data URL and compare the chat reply to the snapshot."""
    # 100x100 canvas where every pixel is (0, 0, 0, 0), i.e. alpha is zero.
    transparent_png = Image.new("RGBA", (100, 100), (0, 0, 0, 0))
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_to_data_url(transparent_png, "PNG")},
    }
    text_part = {
        "type": "text",
        "text": "What do you see in this transparent image?",
    }
    user_message = {"role": "user", "content": [image_part, text_part]}

    response = await flash_gemma3.chat(
        seed=42,
        messages=[user_message],
        max_tokens=100,
    )
    assert response == response_snapshot
|
||||||
|
|
||||||
|
|
||||||
|
async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_snapshot):
    """Send a white RGB PNG as a base64 data URL and compare the chat reply to the snapshot."""
    # 100x100 canvas without an alpha channel, filled with white.
    white_png = Image.new("RGB", (100, 100), (255, 255, 255))
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_to_data_url(white_png, "PNG")},
    }
    text_part = {"type": "text", "text": "What do you see in this plain image?"}
    user_message = {"role": "user", "content": [image_part, text_part]}

    response = await flash_gemma3.chat(
        seed=42,
        messages=[user_message],
        max_tokens=100,
    )
    assert response == response_snapshot
|
||||||
|
|
||||||
|
|
||||||
|
async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_snapshot):
    """Send a white RGB JPEG as a base64 data URL and compare the chat reply to the snapshot."""
    # 100x100 white canvas, encoded as JPEG by the helper below.
    white_jpeg = Image.new("RGB", (100, 100), (255, 255, 255))
    image_part = {
        "type": "image_url",
        "image_url": {"url": image_to_data_url(white_jpeg, "JPEG")},
    }
    text_part = {"type": "text", "text": "What do you see in this JPEG image?"}
    user_message = {"role": "user", "content": [image_part, text_part]}

    response = await flash_gemma3.chat(
        seed=42,
        messages=[user_message],
        max_tokens=100,
    )
    assert response == response_snapshot
|
||||||
|
@ -263,7 +263,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
|
|||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
||||||
do_convert_rgb: bool = None,
|
do_convert_rgb: bool = True,
|
||||||
do_pan_and_scan: bool = None,
|
do_pan_and_scan: bool = None,
|
||||||
pan_and_scan_min_crop_size: int = None,
|
pan_and_scan_min_crop_size: int = None,
|
||||||
pan_and_scan_max_num_crops: int = None,
|
pan_and_scan_max_num_crops: int = None,
|
||||||
|
Loading…
Reference in New Issue
Block a user