diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 453b4f61a..f168fd76b 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -14,8 +14,8 @@ Text Generation Inference enables serving optimized models. The following sectio - [Gemma](https://huggingface.co/google/gemma-7b) - [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224) - [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) -- [Gemma3](https://huggingface.co/collections/google/gemma-3) -- [Gemma3 Text](https://huggingface.co/collections/google/gemma-3) +- [Gemma3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) +- [Gemma3 Text](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus) - [Dbrx](https://huggingface.co/databricks/dbrx-instruct) - [Mamba](https://huggingface.co/state-spaces/mamba-2.8b-slimpj) diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json index ec8cd4f62..5c6b4cb94 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_exceed_window.json @@ -1,133 +1,109 @@ { "details": { "best_of_sequences": null, - "finish_reason": "length", - "generated_tokens": 20, + "finish_reason": "eos_token", + "generated_tokens": 16, "prefill": [], "seed": null, "tokens": [ + { + "id": 506, + "logprob": -1.3984375, + "special": false, + "text": " the" + }, + { + "id": 1331, + "logprob": -1.6953125, + "special": false, + "text": " people" + }, { "id": 236764, - "logprob": -0.44726562, + "logprob": -0.23535156, "special": false, "text": "," }, { - "id": 236743, - "logprob": -0.011413574, + "id": 532, + "logprob": -0.24316406, "special": false, - "text": " " + "text": " and" }, { - "id": 236812, - "logprob": -0.09814453, + "id": 506, + "logprob": -0.12109375, "special": false, - "text": "4" + "text": " the" }, { - "id": 236764, - "logprob": -0.044189453, + "id": 2780, + "logprob": -1.1640625, "special": false, - "text": "," + "text": " food" }, { - "id": 236743, - "logprob": -0.15625, + "id": 236761, + "logprob": -0.21386719, "special": false, - "text": " " + "text": "." }, { - "id": 236810, - "logprob": -0.010864258, + "id": 108, + "logprob": -0.64453125, "special": false, - "text": "5" + "text": "\n\n" }, { - "id": 236764, - "logprob": -0.040039062, + "id": 2094, + "logprob": -0.77734375, "special": false, - "text": "," + "text": "This" }, { - "id": 236743, - "logprob": -0.26757812, + "id": 563, + "logprob": -0.040283203, "special": false, - "text": " " + "text": " is" }, { - "id": 236825, - "logprob": -0.0047302246, + "id": 496, + "logprob": -0.03125, "special": false, - "text": "6" + "text": " a" }, { - "id": 236764, - "logprob": -0.026123047, + "id": 6290, + "logprob": -0.03515625, "special": false, - "text": "," + "text": " nice" }, { - "id": 236743, - "logprob": -0.265625, + "id": 1977, + "logprob": -0.0020751953, "special": false, - "text": " " + "text": " place" }, { - "id": 236832, - "logprob": -0.014160156, + "id": 236761, + "logprob": -0.0079956055, "special": false, - "text": "7" + "text": "." 
}, { - "id": 236764, - "logprob": -0.013977051, + "id": 107, + "logprob": -0.9921875, "special": false, - "text": "," + "text": "\n" }, { - "id": 236743, - "logprob": -0.103515625, - "special": false, - "text": " " - }, - { - "id": 236828, - "logprob": -0.008178711, - "special": false, - "text": "8" - }, - { - "id": 236764, - "logprob": -0.030151367, - "special": false, - "text": "," - }, - { - "id": 236743, - "logprob": -0.39453125, - "special": false, - "text": " " - }, - { - "id": 236819, - "logprob": -0.008728027, - "special": false, - "text": "9" - }, - { - "id": 236764, - "logprob": -0.020629883, - "special": false, - "text": "," - }, - { - "id": 236743, - "logprob": -0.08154297, - "special": false, - "text": " " + "id": 106, + "logprob": -0.45507812, + "special": true, + "text": "" } ], "top_tokens": null }, - "generated_text": ", 4, 5, 6, 7, 8, 9, " + "generated_text": " the people, and the food.\n\nThis is a nice place.\n" } diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json index 1324555aa..859544c89 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3.json @@ -8,31 +8,31 @@ "tokens": [ { "id": 1331, - "logprob": -0.32421875, + "logprob": -0.34960938, "special": false, "text": " people" }, { "id": 8390, - "logprob": -0.15332031, + "logprob": -0.14746094, "special": false, "text": " died" }, { "id": 528, - "logprob": -1.140625, + "logprob": -1.2265625, "special": false, "text": " in" }, { "id": 506, - "logprob": -0.42578125, + "logprob": -0.47070312, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.64453125, + "logprob": -0.5859375, "special": false, "text": " United" }, @@ -44,31 +44,31 @@ }, { "id": 236761, - "logprob": -0.37890625, + "logprob": -0.34765625, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.08300781, + "logprob": -0.0859375, "special": false, "text": "\n\n" }, { "id": 818, - "logprob": -1.1796875, + "logprob": -1.1640625, "special": false, "text": "The" }, { "id": 6816, - "logprob": -1.765625, + "logprob": -1.890625, "special": false, "text": " generally" }, { "id": 10951, - "logprob": -0.14550781, + "logprob": -0.14648438, "special": false, "text": " accepted" }, @@ -86,49 +86,49 @@ }, { "id": 600, - "logprob": -0.65625, + "logprob": -0.65234375, "special": false, "text": " that" }, { "id": 236743, - "logprob": -1.1796875, + "logprob": -1.2109375, "special": false, "text": " " }, { "id": 236825, - "logprob": -0.0009918213, + "logprob": -0.00088119507, "special": false, "text": "6" }, { "id": 236832, - "logprob": -6.532669e-05, + "logprob": -6.580353e-05, "special": false, "text": "7" }, { "id": 236810, - "logprob": -4.863739e-05, + "logprob": -5.2690506e-05, "special": false, "text": "5" }, { "id": 236764, - "logprob": -0.00017929077, + "logprob": -0.0001745224, "special": false, "text": "," }, { "id": 236771, - "logprob": -1.2397766e-05, + "logprob": -1.180172e-05, "special": false, "text": "0" }, { "id": 236771, - "logprob": -2.1457672e-06, + "logprob": -1.7881393e-06, "special": false, "text": "0" }, @@ -140,7 +140,7 @@ }, { "id": 1331, - "logprob": -0.50390625, + "logprob": -0.44921875, "special": false, "text": " people" }, @@ -152,67 +152,67 @@ }, { "id": 528, - "logprob": -0.08496094, + "logprob": -0.084472656, "special": false, "text": " in" }, { "id": 506, - "logprob": -0.0003299713, + "logprob": -0.00034713745, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.028442383, + "logprob": -0.028564453, "special": false, "text": " United" }, { "id": 4184, - "logprob": -0.00011014938, + "logprob": -0.00012207031, "special": false, "text": " States" }, { "id": 236761, - "logprob": -1.1796875, + "logprob": -1.15625, "special": false, "text": "." }, { "id": 3153, - "logprob": -0.104003906, + "logprob": -0.103027344, "special": false, "text": " However" }, { "id": 236764, - "logprob": -0.009094238, + "logprob": -0.009155273, "special": false, "text": "," }, { "id": 1070, - "logprob": -0.88671875, + "logprob": -0.92578125, "special": false, "text": " some" }, { "id": 61806, - "logprob": -0.84765625, + "logprob": -0.91796875, "special": false, "text": " historians" }, { "id": 4646, - "logprob": -1.34375, + "logprob": -1.3828125, "special": false, "text": " believe" }, { "id": 506, - "logprob": -0.59375, + "logprob": -0.65234375, "special": false, "text": " the" }, @@ -230,7 +230,7 @@ }, { "id": 1451, - "logprob": -0.60546875, + "logprob": -0.66015625, "special": false, "text": " could" }, @@ -242,73 +242,73 @@ }, { "id": 618, - "logprob": -0.61328125, + "logprob": -0.57421875, "special": false, "text": " as" }, { "id": 1494, - "logprob": -0.00033569336, + "logprob": -0.00036239624, "special": false, "text": " high" }, { "id": 618, - "logprob": -0.0001411438, + "logprob": -0.0001335144, "special": false, "text": " as" }, { "id": 236743, - "logprob": -0.001045227, + "logprob": -0.0009689331, "special": false, "text": " " }, { "id": 236770, - "logprob": -0.21289062, + "logprob": -0.26367188, "special": false, "text": "1" }, { "id": 236771, - "logprob": -0.13378906, + "logprob": -0.17773438, "special": false, "text": "0" }, { "id": 3625, - "logprob": -0.0087890625, + "logprob": -0.012084961, "special": false, "text": " million" }, { "id": 236761, - "logprob": -0.2109375, + "logprob": -0.21289062, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.39453125, + "logprob": -0.37304688, "special": false, "text": "\n\n" }, { "id": 236777, - "logprob": -1.1328125, + "logprob": -1.078125, "special": false, "text": "I" }, { "id": 1006, - "logprob": -1.4140625, + "logprob": -1.3203125, "special": false, "text": " am" }, { "id": 3182, - "logprob": -1.15625, + "logprob": -1.078125, "special": false, "text": " looking" }, @@ -320,13 +320,13 @@ }, { "id": 919, - "logprob": -1.2734375, + "logprob": -1.25, "special": false, "text": " more" }, { "id": 1938, - "logprob": -1.2265625, + "logprob": -1.2421875, "special": false, "text": " information" }, @@ -338,169 +338,169 @@ }, { "id": 672, - "logprob": -0.77734375, + "logprob": -0.73046875, "special": false, "text": " this" }, { "id": 59725, - "logprob": -0.70703125, + "logprob": -0.75, "special": false, "text": " discrepancy" }, { "id": 532, - "logprob": -0.8515625, + "logprob": -0.83984375, "special": false, "text": " and" }, { "id": 506, - "logprob": -0.65625, + "logprob": -0.7109375, "special": false, "text": " the" }, { "id": 5872, - "logprob": -1.15625, + "logprob": -1.2734375, "special": false, "text": " factors" }, { "id": 600, - "logprob": -0.2265625, + "logprob": -0.22851562, "special": false, "text": " that" }, { "id": 19263, - "logprob": -1.125, + "logprob": -1.1640625, "special": false, "text": " contributed" }, { "id": 531, - "logprob": -0.001083374, + "logprob": -0.0010757446, "special": false, "text": " to" }, { "id": 506, - "logprob": -0.2109375, + "logprob": -0.18945312, "special": false, "text": " the" }, { "id": 5777, - "logprob": -1.21875, + "logprob": -1.2734375, "special": false, "text": " wide" }, { "id": 2644, - "logprob": -0.018310547, + "logprob": -0.01940918, "special": false, "text": " range" }, { "id": 529, - "logprob": -0.12988281, + "logprob": -0.14550781, "special": false, "text": " of" }, { "id": 14287, - "logprob": -0.03564453, + "logprob": -0.032470703, "special": false, "text": " estimates" }, { "id": 236761, - "logprob": -0.010314941, + "logprob": -0.010375977, "special": false, "text": "." 
}, { "id": 108, - "logprob": -0.060546875, + "logprob": -0.06591797, "special": false, "text": "\n\n" }, { "id": 8291, - "logprob": -0.734375, + "logprob": -0.8046875, "special": false, "text": "Here" }, { "id": 236789, - "logprob": -0.26367188, + "logprob": -0.23828125, "special": false, "text": "'" }, { "id": 236751, - "logprob": -1.1920929e-06, + "logprob": -1.0728836e-06, "special": false, "text": "s" }, { "id": 496, - "logprob": -0.15527344, + "logprob": -0.17480469, "special": false, "text": " a" }, { "id": 25890, - "logprob": -0.08886719, + "logprob": -0.087402344, "special": false, "text": " breakdown" }, { "id": 529, - "logprob": -0.0020446777, + "logprob": -0.0021209717, "special": false, "text": " of" }, { "id": 506, - "logprob": -0.17871094, + "logprob": -0.19140625, "special": false, "text": " the" }, { "id": 5872, - "logprob": -0.90234375, + "logprob": -1.0078125, "special": false, "text": " factors" }, { "id": 20894, - "logprob": -0.25976562, + "logprob": -0.26367188, "special": false, "text": " contributing" }, { "id": 531, - "logprob": -8.34465e-05, + "logprob": -9.250641e-05, "special": false, "text": " to" }, { "id": 506, - "logprob": -0.008544922, + "logprob": -0.008666992, "special": false, "text": " the" }, { "id": 5777, - "logprob": -0.62109375, + "logprob": -0.6171875, "special": false, "text": " wide" }, { "id": 2644, - "logprob": -0.0023345947, + "logprob": -0.0023956299, "special": false, "text": " range" }, @@ -512,25 +512,25 @@ }, { "id": 14287, - "logprob": -0.011291504, + "logprob": -0.011352539, "special": false, "text": " estimates" }, { "id": 573, - "logprob": -0.29101562, + "logprob": -0.30664062, "special": false, "text": " for" }, { "id": 506, - "logprob": -0.21484375, + "logprob": -0.21386719, "special": false, "text": " the" }, { "id": 236743, - "logprob": -0.2890625, + "logprob": -0.35351562, "special": false, "text": " " }, @@ -566,19 +566,19 @@ }, { "id": 10248, - "logprob": -0.01953125, + "logprob": -0.015258789, "special": false, "text": " pandemic" }, { "id": 4355, - "logprob": -0.78515625, + "logprob": -0.83203125, "special": false, "text": " death" }, { "id": 25363, - "logprob": -6.771088e-05, + "logprob": -7.43866e-05, "special": false, "text": " toll" }, @@ -590,13 +590,13 @@ }, { "id": 506, - "logprob": -7.033348e-06, + "logprob": -6.67572e-06, "special": false, "text": " the" }, { "id": 3640, - "logprob": -0.0067443848, + "logprob": -0.0059509277, "special": false, "text": " United" }, diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json new file mode 100644 index 000000000..ae67e0060 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_jpg.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image.\n\nThe image is a solid, bright white color. There is nothing else visible within it. \n\nIt's essentially a blank white canvas or a completely white square. 
\n\nIs there anything specific you'd like me to do with this image, such as describe it further or imagine what it might represent?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741965894, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 74, + "prompt_tokens": 277, + "total_tokens": 351 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json new file mode 100644 index 000000000..afbfba30a --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgb_png.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image. \n\nThe image is entirely white, with a very subtle, faint outline of a stylized, cartoonish figure. It appears to be a simplified depiction of a person, likely a child, with a wide-eyed expression and a small, rounded body. \n\nIt's almost like a minimalist, iconic representation. \n\nDo you want me to try and describe it in more detail or perhaps speculate about the context of the image?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741965892, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 98, + "prompt_tokens": 277, + "total_tokens": 375 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json new file mode 100644 index 000000000..1b97d2615 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_base64_rgba.json @@ -0,0 +1,26 @@ +{ + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": null, + "message": { + "content": "Okay, let's analyze the image. \n\nThe transparent image reveals a stylized depiction of **a human head**. It's a minimalist, geometric representation, showing the basic shapes of the skull, eye sockets, and head outline. \n\nDo you want me to describe any specific element of the image in more detail?", + "name": null, + "role": "assistant", + "tool_calls": null + }, + "usage": null + } + ], + "created": 1741966313, + "id": "", + "model": "google/gemma-3-4b-it", + "object": "chat.completion", + "system_fingerprint": "3.2.1-dev0-native", + "usage": { + "completion_tokens": 67, + "prompt_tokens": 277, + "total_tokens": 344 + } +} diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json index 6c30ada41..cd786b3ce 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. 
The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a humorous and unexpected sight of a cow enjoying a tropical beach!", + "content": "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a quite a humorous and unusual scene – a cow enjoying a day at the beach!", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1741703756, + "created": 1741964480, "id": "", - "model": "gg-hf-g/gemma-3-4b-it", + "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.1.2-dev0-native", + "system_fingerprint": "3.2.1-dev0-native", "usage": { - "completion_tokens": 70, - "prompt_tokens": 277, - "total_tokens": 347 + "completion_tokens": 74, + "prompt_tokens": 275, + "total_tokens": 349 } } diff --git a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json index fe67c9954..5ed2c4507 100644 --- a/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json +++ b/integration-tests/models/__snapshots__/test_flash_gemma3/test_flash_gemma3_image_cow_dog.json @@ -5,7 +5,7 @@ "index": 0, "logprobs": null, "message": { - "content": "Based on the image, the animal is a cow, not a dog! \n\nIt appears to be a **Brazilian cattle breed** known as a **Gir Cow**. They are recognized for their reddish-brown color and distinctive markings.", + "content": "That's a fantastic question! However, the image doesn't show a dog. It shows a **Brown Swiss cow** standing on a beach. \n\nBrown Swiss cows are known for their reddish-brown color and distinctive white markings. \n\nIf you'd like, you can send me another image and I’ll do my best to identify it!", "name": null, "role": "assistant", "tool_calls": null @@ -13,14 +13,14 @@ "usage": null } ], - "created": 1741703753, + "created": 1741964477, "id": "", - "model": "gg-hf-g/gemma-3-4b-it", + "model": "google/gemma-3-4b-it", "object": "chat.completion", - "system_fingerprint": "3.1.2-dev0-native", + "system_fingerprint": "3.2.1-dev0-native", "usage": { - "completion_tokens": 48, - "prompt_tokens": 281, - "total_tokens": 329 + "completion_tokens": 75, + "prompt_tokens": 279, + "total_tokens": 354 } } diff --git a/integration-tests/models/test_flash_gemma3.py b/integration-tests/models/test_flash_gemma3.py index ab812d644..5064f34d5 100644 --- a/integration-tests/models/test_flash_gemma3.py +++ b/integration-tests/models/test_flash_gemma3.py @@ -1,3 +1,7 @@ +import base64 +from io import BytesIO +from PIL import Image + import pytest @@ -49,9 +53,9 @@ async def test_flash_gemma3_image_cow_dog(flash_gemma3, response_snapshot): assert ( response.choices[0].message.content - == "Based on the image, the animal is a cow, not a dog! \n\nIt appears to be a **Brazilian cattle breed** known as a **Gir Cow**. They are recognized for their reddish-brown color and distinctive markings." + == "That's a fantastic question! However, the image doesn't show a dog. It shows a **Brown Swiss cow** standing on a beach. \n\nBrown Swiss cows are known for their reddish-brown color and distinctive white markings. 
\n\nIf you'd like, you can send me another image and I’ll do my best to identify it!" ) - assert response.usage["completion_tokens"] == 48 + assert response.usage["completion_tokens"] == 75 assert response == response_snapshot @@ -72,19 +76,95 @@ async def test_flash_gemma3_image_cow(flash_gemma3, response_snapshot): ) assert ( response.choices[0].message.content - == "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a humorous and unexpected sight of a cow enjoying a tropical beach!" + == "Here's a description of what's shown in the image:\n\nThe image depicts a brown cow standing on a sandy beach. The beach has turquoise water and a distant island visible in the background. The sky is bright blue with some white clouds. \n\nIt's a quite a humorous and unusual scene – a cow enjoying a day at the beach!" ) - assert response.usage["completion_tokens"] == 70 + assert response.usage["completion_tokens"] == 74 assert response == response_snapshot async def test_exceed_window(flash_gemma3, response_snapshot): response = await flash_gemma3.generate( - "This is a nice place. " * 800 + "Now count: 1, 2, 3", + "This is a nice place. " * 800 + "I really enjoy the scenery,", seed=42, max_new_tokens=20, ) - assert response.generated_text == ", 4, 5, 6, 7, 8, 9, " - assert response.details.generated_tokens == 20 + assert ( + response.generated_text + == " the people, and the food.\n\nThis is a nice place.\n" + ) + assert response.details.generated_tokens == 16 + assert response == response_snapshot + + +# Helper function to convert a Pillow image to a base64 data URL +def image_to_data_url(img: Image.Image, fmt: str) -> str: + buffer = BytesIO() + img.save(buffer, format=fmt) + img_data = buffer.getvalue() + b64_str = base64.b64encode(img_data).decode("utf-8") + mime_type = "image/png" if fmt.upper() == "PNG" else "image/jpeg" + return f"data:{mime_type};base64,{b64_str}" + + +async def test_flash_gemma3_image_base64_rgba(flash_gemma3, response_snapshot): + # Create an empty 100x100 PNG image with alpha (transparent background) + img = Image.new("RGBA", (100, 100), (0, 0, 0, 0)) + data_url = image_to_data_url(img, "PNG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + { + "type": "text", + "text": "What do you see in this transparent image?", + }, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_gemma3_image_base64_rgb_png(flash_gemma3, response_snapshot): + # Create an empty 100x100 PNG image without alpha (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "PNG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this plain image?"}, + ], + }, + ], + max_tokens=100, + ) + assert response == response_snapshot + + +async def test_flash_gemma3_image_base64_rgb_jpg(flash_gemma3, response_snapshot): + # Create an empty 100x100 JPEG image (white background) + img = Image.new("RGB", (100, 100), (255, 255, 255)) + data_url = image_to_data_url(img, "JPEG") + response = await flash_gemma3.chat( + seed=42, + messages=[ + { + "role": "user", + "content": 
[ + {"type": "image_url", "image_url": {"url": data_url}}, + {"type": "text", "text": "What do you see in this JPEG image?"}, + ], + }, + ], + max_tokens=100, + ) assert response == response_snapshot diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index 07aa43073..abe8cfeeb 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "numpy>=2.0", "openai>=1.65", "huggingface_hub>=0.29", + "pillow>=11.1.0", ] [tool.isort] diff --git a/integration-tests/requirements.txt b/integration-tests/requirements.txt index a85db4a5b..ca2dee938 100644 --- a/integration-tests/requirements.txt +++ b/integration-tests/requirements.txt @@ -1,8 +1,8 @@ # This file was autogenerated by uv via the following command: -# uv pip compile pyproject.toml -o requirements.txt -aiohappyeyeballs==2.4.6 +# uv pip compile pyproject.toml +aiohappyeyeballs==2.6.1 # via aiohttp -aiohttp==3.11.12 +aiohttp==3.11.13 # via text-generation aiosignal==1.3.2 # via aiohttp @@ -12,7 +12,7 @@ anyio==4.8.0 # via # httpx # openai -attrs==25.1.0 +attrs==25.3.0 # via aiohttp certifi==2025.1.31 # via @@ -25,13 +25,13 @@ distro==1.9.0 # via openai docker==7.1.0 # via text-generation-integration-tests (pyproject.toml) -filelock==3.17.0 +filelock==3.18.0 # via huggingface-hub frozenlist==1.5.0 # via # aiohttp # aiosignal -fsspec==2025.2.0 +fsspec==2025.3.0 # via huggingface-hub h11==0.14.0 # via httpcore @@ -39,7 +39,7 @@ httpcore==1.0.7 # via httpx httpx==0.28.1 # via openai -huggingface-hub==0.29.0 +huggingface-hub==0.29.3 # via # text-generation-integration-tests (pyproject.toml) # text-generation @@ -51,7 +51,7 @@ idna==3.10 # yarl iniconfig==2.0.0 # via pytest -jiter==0.8.2 +jiter==0.9.0 # via openai multidict==6.1.0 # via @@ -59,15 +59,17 @@ multidict==6.1.0 # yarl numpy==2.2.3 # via text-generation-integration-tests (pyproject.toml) -openai==1.65.3 +openai==1.66.3 # via text-generation-integration-tests (pyproject.toml) packaging==24.2 # via # huggingface-hub # pytest +pillow==11.1.0 + # via text-generation-integration-tests (pyproject.toml) pluggy==1.5.0 # via pytest -propcache==0.2.1 +propcache==0.3.0 # via # aiohttp # yarl @@ -78,7 +80,7 @@ pydantic==2.10.6 # text-generation pydantic-core==2.27.2 # via pydantic -pytest==8.3.4 +pytest==8.3.5 # via # text-generation-integration-tests (pyproject.toml) # pytest-asyncio @@ -95,7 +97,7 @@ sniffio==1.3.1 # via # anyio # openai -syrupy==4.8.1 +syrupy==4.9.0 # via text-generation-integration-tests (pyproject.toml) text-generation==0.7.0 # via text-generation-integration-tests (pyproject.toml) diff --git a/integration-tests/uv.lock b/integration-tests/uv.lock index 9f3765b87..bad6aa8f2 100644 --- a/integration-tests/uv.lock +++ b/integration-tests/uv.lock @@ -97,6 +97,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, ] +[[package]] +name = "anyio" +version = "4.8.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "idna" }, + { name = "sniffio" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/73/199a98fc2dae33535d6b8e8e6ec01f8c1d76c9adb096c6b7d64823038cde/anyio-4.8.0.tar.gz", hash = 
"sha256:1d9fe889df5212298c0c0723fa20479d1b94883a2df44bd3897aa91083316f7a", size = 181126 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/eb/e7f063ad1fec6b3178a3cd82d1a3c4de82cccf283fc42746168188e1cdd5/anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a", size = 96041 }, +] + [[package]] name = "async-timeout" version = "5.0.1" @@ -181,6 +196,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277 }, +] + [[package]] name = "docker" version = "7.1.0" @@ -276,6 +300,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, ] +[[package]] +name = "h11" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/38/3af3d3633a34a3316095b39c8e8fb4853a28a536e55d347bd8d8e9a14b03/h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", size = 100418 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, +] + +[[package]] +name = "httpcore" +version = "1.0.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/41/d7d0a89eb493922c37d343b607bc1b5da7f5be7e383740b4753ad8943e90/httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c", size = 85196 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/f5/72347bc88306acb359581ac4d52f23c0ef445b57157adedb9aee0cd689d2/httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd", size = 78551 }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = 
"sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, +] + [[package]] name = "huggingface-hub" version = "0.29.0" @@ -312,6 +373,50 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] +[[package]] +name = "jiter" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/c2/e4562507f52f0af7036da125bb699602ead37a2332af0788f8e0a3417f36/jiter-0.9.0.tar.gz", hash = "sha256:aadba0964deb424daa24492abc3d229c60c4a31bfee205aedbf1acc7639d7893", size = 162604 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b0/82/39f7c9e67b3b0121f02a0b90d433626caa95a565c3d2449fea6bcfa3f5f5/jiter-0.9.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:816ec9b60fdfd1fec87da1d7ed46c66c44ffec37ab2ef7de5b147b2fce3fd5ad", size = 314540 }, + { url = "https://files.pythonhosted.org/packages/01/07/7bf6022c5a152fca767cf5c086bb41f7c28f70cf33ad259d023b53c0b858/jiter-0.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9b1d3086f8a3ee0194ecf2008cf81286a5c3e540d977fa038ff23576c023c0ea", size = 321065 }, + { url = "https://files.pythonhosted.org/packages/6c/b2/de3f3446ecba7c48f317568e111cc112613da36c7b29a6de45a1df365556/jiter-0.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1339f839b91ae30b37c409bf16ccd3dc453e8b8c3ed4bd1d6a567193651a4a51", size = 341664 }, + { url = "https://files.pythonhosted.org/packages/13/cf/6485a4012af5d407689c91296105fcdb080a3538e0658d2abf679619c72f/jiter-0.9.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ffba79584b3b670fefae66ceb3a28822365d25b7bf811e030609a3d5b876f538", size = 364635 }, + { url = "https://files.pythonhosted.org/packages/0d/f7/4a491c568f005553240b486f8e05c82547340572d5018ef79414b4449327/jiter-0.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cfc7d0a8e899089d11f065e289cb5b2daf3d82fbe028f49b20d7b809193958d", size = 406288 }, + { url = "https://files.pythonhosted.org/packages/d3/ca/f4263ecbce7f5e6bded8f52a9f1a66540b270c300b5c9f5353d163f9ac61/jiter-0.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e00a1a2bbfaaf237e13c3d1592356eab3e9015d7efd59359ac8b51eb56390a12", size = 397499 }, + { url = "https://files.pythonhosted.org/packages/ac/a2/522039e522a10bac2f2194f50e183a49a360d5f63ebf46f6d890ef8aa3f9/jiter-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1d9870561eb26b11448854dce0ff27a9a27cb616b632468cafc938de25e9e51", size = 352926 }, + { url = "https://files.pythonhosted.org/packages/b1/67/306a5c5abc82f2e32bd47333a1c9799499c1c3a415f8dde19dbf876f00cb/jiter-0.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9872aeff3f21e437651df378cb75aeb7043e5297261222b6441a620218b58708", size = 384506 }, + { url = "https://files.pythonhosted.org/packages/0f/89/c12fe7b65a4fb74f6c0d7b5119576f1f16c79fc2953641f31b288fad8a04/jiter-0.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1fd19112d1049bdd47f17bfbb44a2c0001061312dcf0e72765bfa8abd4aa30e5", size = 520621 }, + { url = "https://files.pythonhosted.org/packages/c4/2b/d57900c5c06e6273fbaa76a19efa74dbc6e70c7427ab421bf0095dfe5d4a/jiter-0.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:6ef5da104664e526836070e4a23b5f68dec1cc673b60bf1edb1bfbe8a55d0678", size = 512613 }, + { url = "https://files.pythonhosted.org/packages/89/05/d8b90bfb21e58097d5a4e0224f2940568366f68488a079ae77d4b2653500/jiter-0.9.0-cp310-cp310-win32.whl", hash = "sha256:cb12e6d65ebbefe5518de819f3eda53b73187b7089040b2d17f5b39001ff31c4", size = 206613 }, + { url = "https://files.pythonhosted.org/packages/2c/1d/5767f23f88e4f885090d74bbd2755518050a63040c0f59aa059947035711/jiter-0.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:c43ca669493626d8672be3b645dbb406ef25af3f4b6384cfd306da7eb2e70322", size = 208371 }, + { url = "https://files.pythonhosted.org/packages/23/44/e241a043f114299254e44d7e777ead311da400517f179665e59611ab0ee4/jiter-0.9.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6c4d99c71508912a7e556d631768dcdef43648a93660670986916b297f1c54af", size = 314654 }, + { url = "https://files.pythonhosted.org/packages/fb/1b/a7e5e42db9fa262baaa9489d8d14ca93f8663e7f164ed5e9acc9f467fc00/jiter-0.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8f60fb8ce7df529812bf6c625635a19d27f30806885139e367af93f6e734ef58", size = 320909 }, + { url = "https://files.pythonhosted.org/packages/60/bf/8ebdfce77bc04b81abf2ea316e9c03b4a866a7d739cf355eae4d6fd9f6fe/jiter-0.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51c4e1a4f8ea84d98b7b98912aa4290ac3d1eabfde8e3c34541fae30e9d1f08b", size = 341733 }, + { url = "https://files.pythonhosted.org/packages/a8/4e/754ebce77cff9ab34d1d0fa0fe98f5d42590fd33622509a3ba6ec37ff466/jiter-0.9.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f4c677c424dc76684fea3e7285a7a2a7493424bea89ac441045e6a1fb1d7b3b", size = 365097 }, + { url = "https://files.pythonhosted.org/packages/32/2c/6019587e6f5844c612ae18ca892f4cd7b3d8bbf49461ed29e384a0f13d98/jiter-0.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2221176dfec87f3470b21e6abca056e6b04ce9bff72315cb0b243ca9e835a4b5", size = 406603 }, + { url = "https://files.pythonhosted.org/packages/da/e9/c9e6546c817ab75a1a7dab6dcc698e62e375e1017113e8e983fccbd56115/jiter-0.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3c7adb66f899ffa25e3c92bfcb593391ee1947dbdd6a9a970e0d7e713237d572", size = 396625 }, + { url = "https://files.pythonhosted.org/packages/be/bd/976b458add04271ebb5a255e992bd008546ea04bb4dcadc042a16279b4b4/jiter-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98d27330fdfb77913c1097a7aab07f38ff2259048949f499c9901700789ac15", size = 351832 }, + { url = "https://files.pythonhosted.org/packages/07/51/fe59e307aaebec9265dbad44d9d4381d030947e47b0f23531579b9a7c2df/jiter-0.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eda3f8cc74df66892b1d06b5d41a71670c22d95a1ca2cbab73654745ce9d0419", size = 384590 }, + { url = "https://files.pythonhosted.org/packages/db/55/5dcd2693794d8e6f4889389ff66ef3be557a77f8aeeca8973a97a7c00557/jiter-0.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dd5ab5ddc11418dce28343123644a100f487eaccf1de27a459ab36d6cca31043", size = 520690 }, + { url = "https://files.pythonhosted.org/packages/54/d5/9f51dc90985e9eb251fbbb747ab2b13b26601f16c595a7b8baba964043bd/jiter-0.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42f8a68a69f047b310319ef8e2f52fdb2e7976fb3313ef27df495cf77bcad965", size = 512649 }, + { url = 
"https://files.pythonhosted.org/packages/a6/e5/4e385945179bcf128fa10ad8dca9053d717cbe09e258110e39045c881fe5/jiter-0.9.0-cp311-cp311-win32.whl", hash = "sha256:a25519efb78a42254d59326ee417d6f5161b06f5da827d94cf521fed961b1ff2", size = 206920 }, + { url = "https://files.pythonhosted.org/packages/4c/47/5e0b94c603d8e54dd1faab439b40b832c277d3b90743e7835879ab663757/jiter-0.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:923b54afdd697dfd00d368b7ccad008cccfeb1efb4e621f32860c75e9f25edbd", size = 210119 }, + { url = "https://files.pythonhosted.org/packages/af/d7/c55086103d6f29b694ec79156242304adf521577530d9031317ce5338c59/jiter-0.9.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:7b46249cfd6c48da28f89eb0be3f52d6fdb40ab88e2c66804f546674e539ec11", size = 309203 }, + { url = "https://files.pythonhosted.org/packages/b0/01/f775dfee50beb420adfd6baf58d1c4d437de41c9b666ddf127c065e5a488/jiter-0.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:609cf3c78852f1189894383cf0b0b977665f54cb38788e3e6b941fa6d982c00e", size = 319678 }, + { url = "https://files.pythonhosted.org/packages/ab/b8/09b73a793714726893e5d46d5c534a63709261af3d24444ad07885ce87cb/jiter-0.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d726a3890a54561e55a9c5faea1f7655eda7f105bd165067575ace6e65f80bb2", size = 341816 }, + { url = "https://files.pythonhosted.org/packages/35/6f/b8f89ec5398b2b0d344257138182cc090302854ed63ed9c9051e9c673441/jiter-0.9.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e89dc075c1fef8fa9be219e249f14040270dbc507df4215c324a1839522ea75", size = 364152 }, + { url = "https://files.pythonhosted.org/packages/9b/ca/978cc3183113b8e4484cc7e210a9ad3c6614396e7abd5407ea8aa1458eef/jiter-0.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:04e8ffa3c353b1bc4134f96f167a2082494351e42888dfcf06e944f2729cbe1d", size = 406991 }, + { url = "https://files.pythonhosted.org/packages/13/3a/72861883e11a36d6aa314b4922125f6ae90bdccc225cd96d24cc78a66385/jiter-0.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:203f28a72a05ae0e129b3ed1f75f56bc419d5f91dfacd057519a8bd137b00c42", size = 395824 }, + { url = "https://files.pythonhosted.org/packages/87/67/22728a86ef53589c3720225778f7c5fdb617080e3deaed58b04789418212/jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fca1a02ad60ec30bb230f65bc01f611c8608b02d269f998bc29cca8619a919dc", size = 351318 }, + { url = "https://files.pythonhosted.org/packages/69/b9/f39728e2e2007276806d7a6609cda7fac44ffa28ca0d02c49a4f397cc0d9/jiter-0.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:237e5cee4d5d2659aaf91bbf8ec45052cc217d9446070699441a91b386ae27dc", size = 384591 }, + { url = "https://files.pythonhosted.org/packages/eb/8f/8a708bc7fd87b8a5d861f1c118a995eccbe6d672fe10c9753e67362d0dd0/jiter-0.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:528b6b71745e7326eed73c53d4aa57e2a522242320b6f7d65b9c5af83cf49b6e", size = 520746 }, + { url = "https://files.pythonhosted.org/packages/95/1e/65680c7488bd2365dbd2980adaf63c562d3d41d3faac192ebc7ef5b4ae25/jiter-0.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9f48e86b57bc711eb5acdfd12b6cb580a59cc9a993f6e7dcb6d8b50522dcd50d", size = 512754 }, + { url = "https://files.pythonhosted.org/packages/78/f3/fdc43547a9ee6e93c837685da704fb6da7dba311fc022e2766d5277dfde5/jiter-0.9.0-cp312-cp312-win32.whl", hash = "sha256:699edfde481e191d81f9cf6d2211debbfe4bd92f06410e7637dffb8dd5dfde06", size = 207075 
}, + { url = "https://files.pythonhosted.org/packages/cd/9d/742b289016d155f49028fe1bfbeb935c9bf0ffeefdf77daf4a63a42bb72b/jiter-0.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:099500d07b43f61d8bd780466d429c45a7b25411b334c60ca875fa775f68ccb0", size = 207999 }, +] + [[package]] name = "multidict" version = "6.1.0" @@ -411,6 +516,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/17/7f/d322a4125405920401450118dbdc52e0384026bd669939484670ce8b2ab9/numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4", size = 12839607 }, ] +[[package]] +name = "openai" +version = "1.66.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a3/77/5172104ca1df35ed2ed8fb26dbc787f721c39498fc51d666c4db07756a0c/openai-1.66.3.tar.gz", hash = "sha256:8dde3aebe2d081258d4159c4cb27bdc13b5bb3f7ea2201d9bd940b9a89faf0c9", size = 397244 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/5a/e20182f7b6171642d759c548daa0ba20a1d3ac10d2bd0a13fd75704a9ac3/openai-1.66.3-py3-none-any.whl", hash = "sha256:a427c920f727711877ab17c11b95f1230b27767ba7a01e5b66102945141ceca9", size = 567400 }, +] + [[package]] name = "packaging" version = "24.2" @@ -420,6 +544,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, ] +[[package]] +name = "pillow" +version = "11.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/af/c097e544e7bd278333db77933e535098c259609c4eb3b85381109602fb5b/pillow-11.1.0.tar.gz", hash = "sha256:368da70808b36d73b4b390a8ffac11069f8a5c85f29eff1f1b01bcf3ef5b2a20", size = 46742715 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/50/1c/2dcea34ac3d7bc96a1fd1bd0a6e06a57c67167fec2cff8d95d88229a8817/pillow-11.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8", size = 3229983 }, + { url = "https://files.pythonhosted.org/packages/14/ca/6bec3df25e4c88432681de94a3531cc738bd85dea6c7aa6ab6f81ad8bd11/pillow-11.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192", size = 3101831 }, + { url = "https://files.pythonhosted.org/packages/d4/2c/668e18e5521e46eb9667b09e501d8e07049eb5bfe39d56be0724a43117e6/pillow-11.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a07dba04c5e22824816b2615ad7a7484432d7f540e6fa86af60d2de57b0fcee2", size = 4314074 }, + { url = "https://files.pythonhosted.org/packages/02/80/79f99b714f0fc25f6a8499ecfd1f810df12aec170ea1e32a4f75746051ce/pillow-11.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e267b0ed063341f3e60acd25c05200df4193e15a4a5807075cd71225a2386e26", size = 4394933 }, + { url = "https://files.pythonhosted.org/packages/81/aa/8d4ad25dc11fd10a2001d5b8a80fdc0e564ac33b293bdfe04ed387e0fd95/pillow-11.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd165131fd51697e22421d0e467997ad31621b74bfc0b75956608cb2906dda07", size = 4353349 }, + { url = 
"https://files.pythonhosted.org/packages/84/7a/cd0c3eaf4a28cb2a74bdd19129f7726277a7f30c4f8424cd27a62987d864/pillow-11.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:abc56501c3fd148d60659aae0af6ddc149660469082859fa7b066a298bde9482", size = 4476532 }, + { url = "https://files.pythonhosted.org/packages/8f/8b/a907fdd3ae8f01c7670dfb1499c53c28e217c338b47a813af8d815e7ce97/pillow-11.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:54ce1c9a16a9561b6d6d8cb30089ab1e5eb66918cb47d457bd996ef34182922e", size = 4279789 }, + { url = "https://files.pythonhosted.org/packages/6f/9a/9f139d9e8cccd661c3efbf6898967a9a337eb2e9be2b454ba0a09533100d/pillow-11.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:73ddde795ee9b06257dac5ad42fcb07f3b9b813f8c1f7f870f402f4dc54b5269", size = 4413131 }, + { url = "https://files.pythonhosted.org/packages/a8/68/0d8d461f42a3f37432203c8e6df94da10ac8081b6d35af1c203bf3111088/pillow-11.1.0-cp310-cp310-win32.whl", hash = "sha256:3a5fe20a7b66e8135d7fd617b13272626a28278d0e578c98720d9ba4b2439d49", size = 2291213 }, + { url = "https://files.pythonhosted.org/packages/14/81/d0dff759a74ba87715509af9f6cb21fa21d93b02b3316ed43bda83664db9/pillow-11.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6123aa4a59d75f06e9dd3dac5bf8bc9aa383121bb3dd9a7a612e05eabc9961a", size = 2625725 }, + { url = "https://files.pythonhosted.org/packages/ce/1f/8d50c096a1d58ef0584ddc37e6f602828515219e9d2428e14ce50f5ecad1/pillow-11.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:a76da0a31da6fcae4210aa94fd779c65c75786bc9af06289cd1c184451ef7a65", size = 2375213 }, + { url = "https://files.pythonhosted.org/packages/dd/d6/2000bfd8d5414fb70cbbe52c8332f2283ff30ed66a9cde42716c8ecbe22c/pillow-11.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e06695e0326d05b06833b40b7ef477e475d0b1ba3a6d27da1bb48c23209bf457", size = 3229968 }, + { url = "https://files.pythonhosted.org/packages/d9/45/3fe487010dd9ce0a06adf9b8ff4f273cc0a44536e234b0fad3532a42c15b/pillow-11.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96f82000e12f23e4f29346e42702b6ed9a2f2fea34a740dd5ffffcc8c539eb35", size = 3101806 }, + { url = "https://files.pythonhosted.org/packages/e3/72/776b3629c47d9d5f1c160113158a7a7ad177688d3a1159cd3b62ded5a33a/pillow-11.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3cd561ded2cf2bbae44d4605837221b987c216cff94f49dfeed63488bb228d2", size = 4322283 }, + { url = "https://files.pythonhosted.org/packages/e4/c2/e25199e7e4e71d64eeb869f5b72c7ddec70e0a87926398785ab944d92375/pillow-11.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f189805c8be5ca5add39e6f899e6ce2ed824e65fb45f3c28cb2841911da19070", size = 4402945 }, + { url = "https://files.pythonhosted.org/packages/c1/ed/51d6136c9d5911f78632b1b86c45241c712c5a80ed7fa7f9120a5dff1eba/pillow-11.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dd0052e9db3474df30433f83a71b9b23bd9e4ef1de13d92df21a52c0303b8ab6", size = 4361228 }, + { url = "https://files.pythonhosted.org/packages/48/a4/fbfe9d5581d7b111b28f1d8c2762dee92e9821bb209af9fa83c940e507a0/pillow-11.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:837060a8599b8f5d402e97197d4924f05a2e0d68756998345c829c33186217b1", size = 4484021 }, + { url = "https://files.pythonhosted.org/packages/39/db/0b3c1a5018117f3c1d4df671fb8e47d08937f27519e8614bbe86153b65a5/pillow-11.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:aa8dd43daa836b9a8128dbe7d923423e5ad86f50a7a14dc688194b7be5c0dea2", size = 4287449 }, + { url = 
"https://files.pythonhosted.org/packages/d9/58/bc128da7fea8c89fc85e09f773c4901e95b5936000e6f303222490c052f3/pillow-11.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0a2f91f8a8b367e7a57c6e91cd25af510168091fb89ec5146003e424e1558a96", size = 4419972 }, + { url = "https://files.pythonhosted.org/packages/5f/bb/58f34379bde9fe197f51841c5bbe8830c28bbb6d3801f16a83b8f2ad37df/pillow-11.1.0-cp311-cp311-win32.whl", hash = "sha256:c12fc111ef090845de2bb15009372175d76ac99969bdf31e2ce9b42e4b8cd88f", size = 2291201 }, + { url = "https://files.pythonhosted.org/packages/3a/c6/fce9255272bcf0c39e15abd2f8fd8429a954cf344469eaceb9d0d1366913/pillow-11.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd43429d0d7ed6533b25fc993861b8fd512c42d04514a0dd6337fb3ccf22761", size = 2625686 }, + { url = "https://files.pythonhosted.org/packages/c8/52/8ba066d569d932365509054859f74f2a9abee273edcef5cd75e4bc3e831e/pillow-11.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:f7955ecf5609dee9442cbface754f2c6e541d9e6eda87fad7f7a989b0bdb9d71", size = 2375194 }, + { url = "https://files.pythonhosted.org/packages/95/20/9ce6ed62c91c073fcaa23d216e68289e19d95fb8188b9fb7a63d36771db8/pillow-11.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2062ffb1d36544d42fcaa277b069c88b01bb7298f4efa06731a7fd6cc290b81a", size = 3226818 }, + { url = "https://files.pythonhosted.org/packages/b9/d8/f6004d98579a2596c098d1e30d10b248798cceff82d2b77aa914875bfea1/pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a85b653980faad27e88b141348707ceeef8a1186f75ecc600c395dcac19f385b", size = 3101662 }, + { url = "https://files.pythonhosted.org/packages/08/d9/892e705f90051c7a2574d9f24579c9e100c828700d78a63239676f960b74/pillow-11.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9409c080586d1f683df3f184f20e36fb647f2e0bc3988094d4fd8c9f4eb1b3b3", size = 4329317 }, + { url = "https://files.pythonhosted.org/packages/8c/aa/7f29711f26680eab0bcd3ecdd6d23ed6bce180d82e3f6380fb7ae35fcf3b/pillow-11.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fdadc077553621911f27ce206ffcbec7d3f8d7b50e0da39f10997e8e2bb7f6a", size = 4412999 }, + { url = "https://files.pythonhosted.org/packages/c8/c4/8f0fe3b9e0f7196f6d0bbb151f9fba323d72a41da068610c4c960b16632a/pillow-11.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:93a18841d09bcdd774dcdc308e4537e1f867b3dec059c131fde0327899734aa1", size = 4368819 }, + { url = "https://files.pythonhosted.org/packages/38/0d/84200ed6a871ce386ddc82904bfadc0c6b28b0c0ec78176871a4679e40b3/pillow-11.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:9aa9aeddeed452b2f616ff5507459e7bab436916ccb10961c4a382cd3e03f47f", size = 4496081 }, + { url = "https://files.pythonhosted.org/packages/84/9c/9bcd66f714d7e25b64118e3952d52841a4babc6d97b6d28e2261c52045d4/pillow-11.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3cdcdb0b896e981678eee140d882b70092dac83ac1cdf6b3a60e2216a73f2b91", size = 4296513 }, + { url = "https://files.pythonhosted.org/packages/db/61/ada2a226e22da011b45f7104c95ebda1b63dcbb0c378ad0f7c2a710f8fd2/pillow-11.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36ba10b9cb413e7c7dfa3e189aba252deee0602c86c309799da5a74009ac7a1c", size = 4431298 }, + { url = "https://files.pythonhosted.org/packages/e7/c4/fc6e86750523f367923522014b821c11ebc5ad402e659d8c9d09b3c9d70c/pillow-11.1.0-cp312-cp312-win32.whl", hash = "sha256:cfd5cd998c2e36a862d0e27b2df63237e67273f2fc78f47445b14e73a810e7e6", size = 2291630 }, + { url = 
"https://files.pythonhosted.org/packages/08/5c/2104299949b9d504baf3f4d35f73dbd14ef31bbd1ddc2c1b66a5b7dfda44/pillow-11.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:a697cd8ba0383bba3d2d3ada02b34ed268cb548b369943cd349007730c92bddf", size = 2626369 }, + { url = "https://files.pythonhosted.org/packages/37/f3/9b18362206b244167c958984b57c7f70a0289bfb59a530dd8af5f699b910/pillow-11.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:4dd43a78897793f60766563969442020e90eb7847463eca901e41ba186a7d4a5", size = 2375240 }, + { url = "https://files.pythonhosted.org/packages/fa/c5/389961578fb677b8b3244fcd934f720ed25a148b9a5cc81c91bdf59d8588/pillow-11.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8c730dc3a83e5ac137fbc92dfcfe1511ce3b2b5d7578315b63dbbb76f7f51d90", size = 3198345 }, + { url = "https://files.pythonhosted.org/packages/c4/fa/803c0e50ffee74d4b965229e816af55276eac1d5806712de86f9371858fd/pillow-11.1.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7d33d2fae0e8b170b6a6c57400e077412240f6f5bb2a342cf1ee512a787942bb", size = 3072938 }, + { url = "https://files.pythonhosted.org/packages/dc/67/2a3a5f8012b5d8c63fe53958ba906c1b1d0482ebed5618057ef4d22f8076/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d65b38173085f24bc07f8b6c505cbb7418009fa1a1fcb111b1f4961814a442", size = 3400049 }, + { url = "https://files.pythonhosted.org/packages/e5/a0/514f0d317446c98c478d1872497eb92e7cde67003fed74f696441e647446/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:015c6e863faa4779251436db398ae75051469f7c903b043a48f078e437656f83", size = 3422431 }, + { url = "https://files.pythonhosted.org/packages/cd/00/20f40a935514037b7d3f87adfc87d2c538430ea625b63b3af8c3f5578e72/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d44ff19eea13ae4acdaaab0179fa68c0c6f2f45d66a4d8ec1eda7d6cecbcc15f", size = 3446208 }, + { url = "https://files.pythonhosted.org/packages/28/3c/7de681727963043e093c72e6c3348411b0185eab3263100d4490234ba2f6/pillow-11.1.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d3d8da4a631471dfaf94c10c85f5277b1f8e42ac42bade1ac67da4b4a7359b73", size = 3509746 }, + { url = "https://files.pythonhosted.org/packages/41/67/936f9814bdd74b2dfd4822f1f7725ab5d8ff4103919a1664eb4874c58b2f/pillow-11.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4637b88343166249fe8aa94e7c4a62a180c4b3898283bb5d3d2fd5fe10d8e4e0", size = 2626353 }, +] + [[package]] name = "pluggy" version = "1.5.0" @@ -656,6 +828,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, +] + [[package]] name = "syrupy" version = "4.8.1" @@ -688,7 +869,10 @@ version = "2.0.1" source = { virtual = "." 
} dependencies = [ { name = "docker" }, + { name = "huggingface-hub" }, { name = "numpy" }, + { name = "openai" }, + { name = "pillow" }, { name = "pydantic" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -699,7 +883,10 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "docker", specifier = ">=7" }, + { name = "huggingface-hub", specifier = ">=0.29" }, { name = "numpy", specifier = ">=2.0" }, + { name = "openai", specifier = ">=1.65" }, + { name = "pillow", specifier = ">=11.1.0" }, { name = "pydantic", specifier = ">2,<3" }, { name = "pytest", specifier = ">=8.3.0" }, { name = "pytest-asyncio", specifier = ">=0.23.1" }, @@ -741,7 +928,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "platform_system == 'Windows'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ diff --git a/launcher/src/main.rs b/launcher/src/main.rs index fde1472f0..e3abb843d 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -97,11 +97,10 @@ fn get_config( let filename = if !path.exists() { // Assume it's a hub id - let mut builder = if let Ok(token) = std::env::var("HF_TOKEN") { + let mut builder = ApiBuilder::from_env(); + if let Ok(token) = std::env::var("HF_TOKEN") { // env variable has precedence over on file token. - ApiBuilder::new().with_token(Some(token)) - } else { - ApiBuilder::new() + builder = builder.with_token(Some(token)) }; if let Ok(origin) = env::var("HF_HUB_USER_AGENT_ORIGIN") { builder = builder.with_user_agent("origin", origin.as_str()); diff --git a/router/src/server.rs b/router/src/server.rs index 0346b1f19..45d2b9f3c 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -1522,7 +1522,7 @@ pub async fn run( // Shared API builder initialization let api_builder = || { - let mut builder = ApiBuilder::new().with_progress(false); + let mut builder = ApiBuilder::from_env().with_progress(false); if let Some(token) = authorization_token { builder = builder.with_token(Some(token)); } diff --git a/router/src/validation.rs b/router/src/validation.rs index 87b28eb74..1119347dc 100644 --- a/router/src/validation.rs +++ b/router/src/validation.rs @@ -699,7 +699,7 @@ fn image_tokens( // TODO: prefer using the config to determine the number of features let num_mm_soft_tokens_per_image = 256; format!( - "\n\n{:?}\n\n", + "\n\n{}\n\n", "".repeat(num_mm_soft_tokens_per_image) ) } diff --git a/server/text_generation_server/adapters/lora.py b/server/text_generation_server/adapters/lora.py index cdcfe91b1..782d66e4e 100644 --- a/server/text_generation_server/adapters/lora.py +++ b/server/text_generation_server/adapters/lora.py @@ -205,7 +205,6 @@ class LoraWeights(AdapterWeights): lora_a_list = [None] * nlayers lora_b_list = [None] * nlayers - # import ipdb; ipdb.set_trace() for layer_id in range(nlayers): key = (layer_id, layer_type) if key not in target_to_layer: diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index 4f25cc192..fb50dda68 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -38,6 +38,7 @@ def paged_attention( *, kv_scales: KVScales, softcap: 
diff --git a/server/text_generation_server/layers/attention/flashinfer.py b/server/text_generation_server/layers/attention/flashinfer.py
index d23451844..9479b6067 100644
--- a/server/text_generation_server/layers/attention/flashinfer.py
+++ b/server/text_generation_server/layers/attention/flashinfer.py
@@ -52,7 +52,6 @@ def use_prefill_with_paged_kv_state(
     page_size: int,
     kv_dtype: torch.dtype,
     q_dtype: torch.dtype,
-    window_left: int,
 ):
     """
     Context manager to set the active flashinfer prefill state to the given
@@ -95,7 +94,6 @@ def use_prefill_with_paged_kv_state(
             kv_data_type=kv_dtype,
             q_data_type=q_dtype,
             page_size=page_size,
-            window_left=-1 if window_left is None else window_left,
         )
         yield
     finally:
@@ -172,7 +170,6 @@ def use_decode_state(
     page_size: int,
     kv_cache_dtype: torch.dtype,
     q_dtype: torch.dtype,
-    window_left: int,
 ):
     """
     Context manager to set the active flashinfer decoding state to the given
@@ -209,7 +206,6 @@ def use_decode_state(
             page_size=page_size,
             data_type=kv_cache_dtype,
             q_data_type=q_dtype,
-            window_left=-1 if window_left is None else window_left,
         )
         yield
     finally:
diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py
index 54422308f..2b89060e9 100644
--- a/server/text_generation_server/layers/attention/ipex.py
+++ b/server/text_generation_server/layers/attention/ipex.py
@@ -78,6 +78,7 @@ def paged_attention(
     *,
     kv_scales: KVScales,
     softcap: Optional[float] = None,
+    window_size_left: Optional[int] = -1,
 ):
     if softcap is not None:
         raise NotImplementedError("softcap is not available in IPEX")
diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py
index 65f3ea414..518e55eee 100644
--- a/server/text_generation_server/layers/attention/rocm.py
+++ b/server/text_generation_server/layers/attention/rocm.py
@@ -59,6 +59,7 @@ def paged_attention(
     *,
     kv_scales: KVScales,
     softcap: Optional[float] = None,
+    window_size_left: Optional[int] = -1,
 ):
     # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py
    # Copyright 2023 The vLLM team. All rights
@@ -82,6 +83,8 @@ def paged_attention(
         max_k = max_s
         import flash_attn_2_cuda
 
+        window_size_right = -1 if window_size_left == -1 else 0
+
         if softcap is None:
             softcap = 0.0
         out = flash_attn_2_cuda.varlen_fwd(
@@ -101,8 +104,8 @@ def paged_attention(
             softmax_scale,
             False,  # zero_tensors
             True,  # causal
-            -1,  # Window_left
-            -1,  # Window right
+            window_size_left,  # Window_left
+            window_size_right,  # Window right
             softcap,
             False,  # return softmax
             None,  # generator
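Both the CUDA flashdecoding branch and this ROCm path derive the right window the same way before handing the pair to `varlen_fwd`. Spelled out as a tiny helper (illustrative only; the `4096` below is an arbitrary example value, not taken from any model config):

```python
def flash_attn_window(window_size_left: int) -> tuple[int, int]:
    """(Window_left, Window_right) pair as passed to flash_attn_2_cuda.varlen_fwd.

    The call sites above always set causal=True, so the right window is
    fully open (-1) only when windowing is disabled, and 0 otherwise:
    a query may never look ahead.
    """
    window_size_right = -1 if window_size_left == -1 else 0
    return window_size_left, window_size_right


assert flash_attn_window(-1) == (-1, -1)     # sliding window disabled
assert flash_attn_window(4096) == (4096, 0)  # windowed, still causal
```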
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
index 0fdc009ca..ab830b58b 100644
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -272,12 +272,12 @@ class ModelType(enum.Enum):
     GEMMA3 = {
         "type": "gemma3",
         "name": "Gemma3",
-        "url": "https://huggingface.co/collections/google/gemma-3",
+        "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
     }
     GEMMA3_TEXT = {
         "type": "gemma3_text",
         "name": "Gemma3 Text",
-        "url": "https://huggingface.co/collections/google/gemma-3",
+        "url": "https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d",
     }
     COHERE = {
         "type": "cohere",
diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
index ebf1b80eb..2554bd269 100644
--- a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
@@ -287,6 +287,7 @@ class FlashGemma2Attention(torch.nn.Module):
                 max_s,
                 softcap=self.softcap,
                 kv_scales=self.kv_scales,
+                window_size_left=self.window_size,
             )
 
         return self.o_proj(
diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
index 085f57ef1..70fe9a3db 100644
--- a/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py
@@ -281,22 +281,12 @@ class FlashGemma3Attention(torch.nn.Module):
             padded_query = padded_query.transpose(1, 2).contiguous()
             padded_key = padded_key.transpose(1, 2).contiguous()
             padded_value = padded_value.transpose(1, 2).contiguous()
-            zeros_to_add = torch.zeros(
-                padded_key.size(0),
-                self.num_key_value_heads,
-                1,
-                self.head_size,
-                dtype=padded_key.dtype,
-                device=padded_key.device,
-            )
-            key_states = torch.cat([padded_key, zeros_to_add], dim=2)
-            value_states = torch.cat([padded_value, zeros_to_add], dim=2)
 
             # Compute attention
             attn_output = F.scaled_dot_product_attention(
                 padded_query,
-                key_states,
-                value_states,
+                padded_key,
+                padded_value,
                 attn_mask=attention_mask,
                 scale=self.softmax_scale,
                 enable_gqa=self.enable_gqa,
@@ -327,6 +317,7 @@ class FlashGemma3Attention(torch.nn.Module):
                 max_s,
                 softcap=self.softcap,
                 kv_scales=self.kv_scales,
+                window_size_left=self.window_size,
             )
 
         return self.o_proj(
@@ -513,6 +504,7 @@ class FlashGemma3Model(torch.nn.Module):
         max_s: int,
         adapter_data: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        attention_mask_local: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         hidden_states = inputs_embeds
 
@@ -525,25 +517,6 @@ class FlashGemma3Model(torch.nn.Module):
                 position_ids, max_s, hidden_states.dtype
             )
 
-            # apply sliding window mask if needed
-            if layer.self_attn.window_size > 0 and attention_mask is not None:
-                min_dtype = torch.finfo(hidden_states.dtype).min
-                # prefill may be larger than sliding window
-                effective_seq_len = max(
-                    position_ids.shape[0], self.layers[i].self_attn.window_size
-                )
-                sliding_window_mask = torch.tril(
-                    torch.ones_like(attention_mask, dtype=torch.bool),
-                    diagonal=-self.layers[i].self_attn.window_size,
-                )
-                attention_mask = torch.where(
-                    sliding_window_mask, min_dtype, attention_mask
-                )
-                offset = max(0, position_ids.shape[0] - effective_seq_len)
-                attention_mask = attention_mask[
-                    :, :, offset : offset + effective_seq_len
-                ]
-
             hidden_states, residual = layer(
                 hidden_states,
                 residual,
@@ -556,7 +529,11 @@ class FlashGemma3Model(torch.nn.Module):
                 seqlen,
                 max_s,
                 adapter_data,
-                attention_mask,
+                (
+                    attention_mask
+                    if self.layers[i].self_attn.window_size == -1
+                    else attention_mask_local
+                ),
             )
 
         hidden_states, _ = self.norm(hidden_states, residual)
@@ -723,24 +700,6 @@ class Gemma3ForConditionalGeneration(nn.Module):
             config.pad_token_id if config.pad_token_id is not None else -1
         )
 
-    def get_image_token_mask(self, input_ids):
-        device = input_ids.device
-
-        start_token_id = self.config.boi_token_index
-        K = self.config.mm_tokens_per_image
-
-        mask = torch.zeros_like(input_ids, dtype=torch.bool, device=device)
-        start_positions = (input_ids == start_token_id).nonzero(as_tuple=True)[0]
-        mask_indices = start_positions.unsqueeze(1) + torch.arange(
-            1, K + 1, device=device
-        ).unsqueeze(0)
-
-        valid_mask = mask_indices < input_ids.size(0)
-        mask_indices = mask_indices[valid_mask]
-        mask[mask_indices] = True
-
-        return mask
-
     def get_attention_mask(
         self, input_ids, max_s, cu_seqlen_prefill, dtype, image_token_mask
     ):
@@ -751,7 +710,7 @@ class Gemma3ForConditionalGeneration(nn.Module):
 
         batch_size = len(lengths)
         sequence_length = max(lengths)
-        target_length = max_s
+        target_length = sequence_length
         # Create the padding mask from the computed lengths.
         # pad_mask: [batch, sequence_length] where True indicates valid tokens.
         seq_range = torch.arange(sequence_length, device=device).unsqueeze(0)
@@ -847,7 +806,7 @@ class Gemma3ForConditionalGeneration(nn.Module):
 
         #     # Determine the maximum sequence length (after padding) from query.
         #     sequence_length = max(lengths)
-        #     target_length = max_s
+        #     target_length = sequence_length
         #     # Create the padding mask from the computed lengths.
         #     # pad_mask: [batch, sequence_length] where True indicates valid tokens.
@@ -885,6 +844,26 @@ class Gemma3ForConditionalGeneration(nn.Module):
         #     input_ids.device
         # )
 
+        if attention_mask is not None:
+            min_dtype = torch.finfo(inputs_embeds.dtype).min
+            # prefill may be larger than sliding window
+            effective_seq_len = max(
+                position_ids.shape[0], self.config.text_config.sliding_window
+            )
+            sliding_window_mask = torch.tril(
+                torch.ones_like(attention_mask, dtype=torch.bool),
+                diagonal=-self.config.text_config.sliding_window,
+            )
+            attention_mask_local = torch.where(
+                sliding_window_mask, min_dtype, attention_mask
+            )
+            offset = max(0, position_ids.shape[0] - effective_seq_len)
+            attention_mask_local = attention_mask_local[
+                :, :, :, offset : offset + effective_seq_len
+            ]
+        else:
+            attention_mask_local = None
+
         hidden_states = self.text_model.model(
             inputs_embeds=inputs_embeds,
             position_ids=position_ids,
@@ -895,6 +874,7 @@ class Gemma3ForConditionalGeneration(nn.Module):
             seqlen=seqlen,
             max_s=max_s,
             attention_mask=attention_mask,
+            attention_mask_local=attention_mask_local,
         )
 
         if lm_head_indices is not None:
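The net effect of the flash_gemma3_modeling.py hunks: the sliding-window mask is no longer rebuilt inside the per-layer loop (where the old code also overwrote `attention_mask` in place, so every layer after the first local one saw the narrowed mask); it is now derived once and passed alongside the global mask, and each layer simply picks one based on its `window_size`. A self-contained sketch of that one-time derivation, with shapes reduced to `[1, 1, q_len, kv_len]` and `sliding_window` standing in for `config.text_config.sliding_window`:

```python
import torch


def build_local_mask(attention_mask: torch.Tensor, sliding_window: int, q_len: int) -> torch.Tensor:
    """Derive the local-attention variant of a dense additive mask.

    Mirrors the logic added to Gemma3ForConditionalGeneration above:
    keys more than `sliding_window - 1` positions behind the query are
    pushed to -inf, then the kv axis is cropped to the effective window.
    """
    min_dtype = torch.finfo(attention_mask.dtype).min
    # prefill may be larger than the sliding window
    effective_seq_len = max(q_len, sliding_window)
    sliding_window_mask = torch.tril(
        torch.ones_like(attention_mask, dtype=torch.bool),
        diagonal=-sliding_window,
    )
    local = torch.where(sliding_window_mask, min_dtype, attention_mask)
    offset = max(0, q_len - effective_seq_len)
    return local[:, :, :, offset : offset + effective_seq_len]


# toy example: 6 tokens, window of 2 -> each query sees itself and one
# previous token; everything further back is masked out
base = torch.zeros(1, 1, 6, 6)  # additive mask, 0 = visible
base = base.masked_fill(
    torch.triu(torch.ones(6, 6, dtype=torch.bool), 1), torch.finfo(base.dtype).min
)
print(build_local_mask(base, sliding_window=2, q_len=6)[0, 0])
```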
diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
index 0fa172d03..7ad294f4b 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -242,6 +242,7 @@ class MistralAttention(torch.nn.Module):
                 seqlen,
                 max_s,
                 kv_scales=self.kv_scales,
+                window_size_left=self.max_past,
             )
 
         return self.o_proj(
diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
index a45dd1e61..e2a3e5860 100644
--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -290,6 +290,7 @@ class MixtralAttention(torch.nn.Module):
                 seqlen,
                 max_s,
                 kv_scales=self.kv_scales,
+                window_size_left=self.max_past,
             )
 
         return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
diff --git a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
index 4ea604510..b1f89eff4 100644
--- a/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py
@@ -31,7 +31,7 @@ class PaliGemmaForConditionalGeneration(nn.Module):
         super().__init__()
         config.vision_config.quantize = config.quantize
         self.vision_tower = load_vision_model(
-            prefix="vision_model" if not prefix else f"{prefix}.vision_model",
+            prefix="vision_tower" if not prefix else f"{prefix}.vision_tower",
             config=config.vision_config,
             weights=weights,
         )
diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
index 9d9562222..f5e4e15ce 100644
--- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@@ -74,7 +74,7 @@ class Qwen2Attention(torch.nn.Module):
         weights,
     ):
         super().__init__()
-        self.max_past = (
+        self.window_size = (
             config.sliding_window if config.sliding_window is not None else -1
         )
         self.num_heads = config.num_attention_heads
@@ -172,7 +172,7 @@ class Qwen2Attention(torch.nn.Module):
                 seqlen=seqlen,
                 block_tables=block_tables,
                 softmax_scale=self.softmax_scale,
-                window_size_left=self.max_past,
+                window_size_left=self.window_size,
             )
         # Decode
         else:
@@ -185,6 +185,7 @@ class Qwen2Attention(torch.nn.Module):
                 seqlen,
                 max_s,
                 kv_scales=self.kv_scales,
+                window_size_left=self.window_size,
             )
 
         return self.o_proj(
@@ -405,10 +406,10 @@ class Qwen2ForCausalLM(torch.nn.Module):
             weights=weights,
         )
 
-        self.max_past = config.sliding_window
-        self.max_past_tensor = (
+        self.window_size = config.sliding_window
+        self.window_size_tensor = (
             torch.tensor(config.sliding_window, device=weights.device)
-            if self.max_past is not None
+            if self.window_size is not None
             else None
         )
 
@@ -430,10 +431,10 @@ class Qwen2ForCausalLM(torch.nn.Module):
         if prefill_cache_indices is not None:
             # Slots also need to be sliced as it has the same size as the whole kv tensor
             slots = slots[prefill_cache_indices]
-        elif self.max_past is not None:
+        elif self.window_size is not None:
             # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
             # kernel requires the true values
-            seqlen = seqlen.clamp(max=self.max_past_tensor)
+            seqlen = seqlen.clamp(max=self.window_size_tensor)
 
         inputs_embeds = self.embed_tokens(input_ids)
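The Qwen2 change is mostly a rename (`max_past` to `window_size`), but the clamp it preserves is easy to misread, so here is the regime it encodes, as a schematic helper (hypothetical code, not in the tree; the values in the asserts are arbitrary):

```python
from typing import Optional


def effective_kv_length(true_len: int, window_size: Optional[int], is_prefill: bool) -> int:
    """Prefill hands true lengths to the flash-attention kernel, which
    applies the window itself; decode clamps lengths to the window because
    paged attention must not index further back than the window allows."""
    if is_prefill or window_size is None:
        return true_len
    return min(true_len, window_size)


assert effective_kv_length(10_000, 4096, is_prefill=True) == 10_000
assert effective_kv_length(10_000, 4096, is_prefill=False) == 4096
```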
diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
index 5e090369b..9508cc4f8 100644
--- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
@@ -291,6 +291,7 @@ class Starcoder2Attention(torch.nn.Module):
                 seqlen,
                 max_s,
                 kv_scales=self.kv_scales,
+                window_size_left=self.max_past,
             )
 
         return self.o_proj(
diff --git a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
index 803d81ead..2972abeab 100644
--- a/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
+++ b/server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py
@@ -263,7 +263,7 @@ class Gemma3ImageProcessor(BaseImageProcessor):
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        do_convert_rgb: bool = None,
+        do_convert_rgb: bool = True,
         do_pan_and_scan: bool = None,
         pan_and_scan_min_crop_size: int = None,
         pan_and_scan_max_num_crops: int = None,
diff --git a/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py b/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py
index 08e39a7c6..6bdf35c63 100644
--- a/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py
+++ b/server/text_generation_server/models/custom_modeling/gemma3/processing_gemma3.py
@@ -82,7 +82,7 @@ class Gemma3Processor(ProcessorMixin):
             do_rescale=False,
             resample=PILImageResampling.BILINEAR,
         )
-        # import ipdb; ipdb.set_trace()
+
         self.image_token_id = tokenizer.image_token_id
         image_tokens_expanded = "".join(
             [tokenizer.image_token] * num_mm_soft_tokens_per_image
@@ -91,8 +91,6 @@ class Gemma3Processor(ProcessorMixin):
             f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
         )
 
-        # import ipdb; ipdb.set_trace()
-
         self.image_processor = image_processor
         self.tokenizer = tokenizer
         self.chat_template = chat_template
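The processor hunk above is the Python-side twin of the router's `image_tokens` string in validation.rs: one image expands to a begin-of-image marker, `num_mm_soft_tokens_per_image` soft tokens, and an end-of-image marker, wrapped in double newlines. A rough reconstruction (the token literals are the usual Gemma3 tokenizer values, assumed here rather than read from the tokenizer config):

```python
def gemma3_image_placeholder(
    boi_token: str = "<start_of_image>",
    eoi_token: str = "<end_of_image>",
    image_token: str = "<image_soft_token>",
    num_mm_soft_tokens_per_image: int = 256,
) -> str:
    """Build the per-image placeholder string inserted into the prompt,
    matching the f-string in Gemma3Processor above."""
    image_tokens_expanded = "".join([image_token] * num_mm_soft_tokens_per_image)
    return f"\n\n{boi_token}{image_tokens_expanded}{eoi_token}\n\n"


placeholder = gemma3_image_placeholder()
assert placeholder.count("<image_soft_token>") == 256
```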
diff --git a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
index e317c5b56..066de6a20 100644
--- a/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
+++ b/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py
@@ -633,7 +633,7 @@ class Qwen2_5VisionModel(nn.Module):
             config=config,
             weights=weights,
         )
-        # import ipdb; ipdb.set_trace()
+
         self.temporal_patch_size = config.temporal_patch_size
         self.spatial_patch_size = config.spatial_patch_size
         self.in_channels = config.in_channels
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index e268af8b4..d3a83e271 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -83,24 +83,11 @@ from text_generation_server.models.metadata_kernels import (
 
 tracer = trace.get_tracer(__name__)
 
-# Will be set in init
-SLIDING_WINDOW: Optional[int] = None
-
 
 def small_power_of_2(n: int):
     return 1 << ((n - 1).bit_length() - 1)
 
 
-def set_sliding_window(sliding_window: int):
-    global SLIDING_WINDOW
-    SLIDING_WINDOW = sliding_window
-
-
-def get_sliding_windows() -> int:
-    global SLIDING_WINDOW
-    return SLIDING_WINDOW
-
-
 def init_cpu_threads_env(rank_id: int, world_size: int):
     import importlib.util
 
@@ -1002,10 +989,8 @@ class FlashCausalLMBatch(Batch):
             self.slot_indices,
         )
 
-        sliding_window = get_sliding_windows()
         position_ids = []
         slot_indices = []
-        prefill_cache_indices = []
         all_prefill_logprobs = True
         no_prefill_logprobs = True
         prefill_cu_outlens = [0]
@@ -1064,14 +1049,6 @@ class FlashCausalLMBatch(Batch):
             # Update
             cumulative_slot_tokens += len(request_slots)
 
-            # Create tensor to slice into the kv tensor in prefill
-            if sliding_window is not None:
-                request_prefill_cache_indices = torch.arange(
-                    cumulative_length + max(0, input_length - sliding_window),
-                    cumulative_length + input_length,
-                    dtype=torch.int64,
-                )
-
             # Prefill logprobs is ignored if the request is done prefilling
             prefill_logprobs = r.prefill_logprobs and request_prefilling
 
@@ -1085,9 +1062,6 @@ class FlashCausalLMBatch(Batch):
                 prefill_cu_outlens.append(prefill_out_cumulative_length + 1)
                 prefill_out_cumulative_length += 1
 
-            if sliding_window is not None:
-                prefill_cache_indices.append(request_prefill_cache_indices)
-
             ADAPTER_TO_INDEX = get_adapter_to_index()
             if ADAPTER_TO_INDEX:
                 adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0)
@@ -1151,24 +1125,18 @@ class FlashCausalLMBatch(Batch):
             position_ids = torch.cat(position_ids)
             if slot_indices:
                 slot_indices = torch.cat(slot_indices)
-            if sliding_window is not None:
-                prefill_cache_indices = torch.cat(prefill_cache_indices)
         else:
             if position_ids:
                 position_ids = position_ids[0]
             if slot_indices:
                 slot_indices = slot_indices[0]
-            if sliding_window is not None:
-                prefill_cache_indices = prefill_cache_indices[0]
 
         if not has_triton():
             self.position_ids = position_ids.to(device)
             self.slot_indices = slot_indices.to(device)
 
         self.prefill_cu_outlens = prefill_cu_outlens
-        self.prefill_cache_indices = (
-            prefill_cache_indices.to(device) if sliding_window is not None else None
-        )
+        self.prefill_cache_indices = None
 
         if all_prefill_logprobs:
             prefill_head_indices = None
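For context on what was just deleted: `prefill_cache_indices` was how the batch trimmed KV writes to the last `sliding_window` positions of each prefill request. Since the attention kernels now apply the window themselves (and `window_size=None` in models/model.py below disables the block-manager windowing), the full KV cache is kept. For reference, what the removed tensor computed:

```python
import torch


def old_prefill_cache_indices(
    cumulative_length: int, input_length: int, sliding_window: int
) -> torch.Tensor:
    """Reproduces the removed per-request slice: only the last
    `sliding_window` prefill positions were written to the KV cache."""
    return torch.arange(
        cumulative_length + max(0, input_length - sliding_window),
        cumulative_length + input_length,
        dtype=torch.int64,
    )


# a 10-token request starting at offset 0 with a 4-token window kept slots 6..9
print(old_prefill_cache_indices(0, 10, 4))  # tensor([6, 7, 8, 9])
```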
@@ -1306,9 +1274,7 @@ class FlashCausalLM(Model):
         if text_config is not None:
             config = text_config
 
-        if getattr(config, "sliding_window", None) is not None:
-            set_sliding_window(config.sliding_window)
-        else:
+        if getattr(config, "sliding_window", None) is None:
             config.sliding_window = None
 
         self.num_layers = config.num_hidden_layers
@@ -2500,7 +2466,6 @@ class FlashCausalLM(Model):
                 page_size=BLOCK_SIZE,
                 kv_dtype=self.kv_cache_dtype,
                 q_dtype=self.dtype,
-                window_left=self.sliding_window,
             )
         else:
             assert input_lengths_tensor is not None
@@ -2514,5 +2479,4 @@ class FlashCausalLM(Model):
                 page_size=BLOCK_SIZE,
                 kv_cache_dtype=self.kv_cache_dtype,
                 q_dtype=self.dtype,
-                window_left=self.sliding_window,
             )
diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
index af4d1f082..da317a628 100644
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@@ -110,7 +110,7 @@ class Model(ABC):
             requires_padding=self.requires_padding,
             dtype=str(self.dtype),
             device_type=self.device.type,
-            window_size=self.sliding_window,
+            window_size=None,  # Setting this parameter to None disables the sliding-window block logic.
             speculate=self.speculate,
             support_chunking=self.support_chunking,
             use_prefix_caching=PREFIX_CACHING,