diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json new file mode 100644 index 00000000..5524bbad --- /dev/null +++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics.json @@ -0,0 +1,168 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4911, + "logprob": -5.015625, + "text": "User" + }, + { + "id": 29901, + "logprob": -0.006000519, + "text": ":" + }, + { + "id": 32000, + "logprob": -0.79248047, + "text": "" + }, + { + "id": 32001, + "logprob": -0.0001295805, + "text": "" + }, + { + "id": 32000, + "logprob": -1.1920929e-07, + "text": "" + }, + { + "id": 1815, + "logprob": -4.2734375, + "text": "Can" + }, + { + "id": 366, + "logprob": -0.013046265, + "text": "you" + }, + { + "id": 2649, + "logprob": -5.0234375, + "text": "tell" + }, + { + "id": 592, + "logprob": -0.30688477, + "text": "me" + }, + { + "id": 263, + "logprob": -3.5195312, + "text": "a" + }, + { + "id": 1407, + "logprob": -9.4375, + "text": "very" + }, + { + "id": 3273, + "logprob": -2.0410156, + "text": "short" + }, + { + "id": 5828, + "logprob": -0.28393555, + "text": "story" + }, + { + "id": 2729, + "logprob": -3.2636719, + "text": "based" + }, + { + "id": 373, + "logprob": -0.0007638931, + "text": "on" + }, + { + "id": 278, + "logprob": -0.14746094, + "text": "the" + }, + { + "id": 1967, + "logprob": -0.072387695, + "text": "image" + }, + { + "id": 29973, + "logprob": -0.1809082, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 32002, + "logprob": -0.007347107, + "special": true, + "text": "" + }, + { + "id": 29871, + "logprob": -6.6161156e-05, + "special": false, + "text": " " + }, + { + "id": 13, + "logprob": -2.69413e-05, + "special": false, + "text": "\n" + }, + { + "id": 7900, + "logprob": -3.8146973e-06, + "special": false, + "text": "Ass" + }, + { + "id": 22137, + "logprob": 0.0, + "special": false, + "text": "istant" + }, + { + "id": 29901, + "logprob": -2.861023e-06, + "special": false, + "text": ":" + }, + { + "id": 319, + "logprob": -0.90234375, + "special": false, + "text": " A" + }, + { + "id": 696, + "logprob": -1.4306641, + "special": false, + "text": " ro" + }, + { + "id": 15664, + "logprob": -0.0006227493, + "special": false, + "text": "oster" + }, + { + "id": 15028, + "logprob": -1.1425781, + "special": false, + "text": " stands" + } + ] + }, + "generated_text": "\nAssistant: A rooster stands" +} diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json new file mode 100644 index 00000000..9ee62354 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics_load.json @@ -0,0 +1,674 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4911, + "logprob": -5.015625, + "text": "User" + }, + { + "id": 29901, + "logprob": -0.006000519, + "text": ":" + }, + { + "id": 32000, + "logprob": -0.79248047, + "text": "" + }, + { + "id": 32001, + "logprob": -0.0001295805, + "text": "" + }, + { + "id": 32000, + "logprob": -1.1920929e-07, + "text": "" + }, + { + "id": 1815, + "logprob": -4.2734375, + "text": "Can" + }, + { + "id": 366, + "logprob": -0.013046265, + "text": "you" + }, + { + "id": 2649, + "logprob": -5.0234375, + "text": "tell" + }, + { + "id": 592, + "logprob": 
-0.30688477, + "text": "me" + }, + { + "id": 263, + "logprob": -3.5195312, + "text": "a" + }, + { + "id": 1407, + "logprob": -9.4375, + "text": "very" + }, + { + "id": 3273, + "logprob": -2.0410156, + "text": "short" + }, + { + "id": 5828, + "logprob": -0.28393555, + "text": "story" + }, + { + "id": 2729, + "logprob": -3.2636719, + "text": "based" + }, + { + "id": 373, + "logprob": -0.0007638931, + "text": "on" + }, + { + "id": 278, + "logprob": -0.14746094, + "text": "the" + }, + { + "id": 1967, + "logprob": -0.072387695, + "text": "image" + }, + { + "id": 29973, + "logprob": -0.1809082, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 32002, + "logprob": -0.007347107, + "special": true, + "text": "" + }, + { + "id": 29871, + "logprob": -6.604195e-05, + "special": false, + "text": " " + }, + { + "id": 13, + "logprob": -2.69413e-05, + "special": false, + "text": "\n" + }, + { + "id": 7900, + "logprob": -3.8146973e-06, + "special": false, + "text": "Ass" + }, + { + "id": 22137, + "logprob": 0.0, + "special": false, + "text": "istant" + }, + { + "id": 29901, + "logprob": -2.861023e-06, + "special": false, + "text": ":" + }, + { + "id": 319, + "logprob": -0.9111328, + "special": false, + "text": " A" + }, + { + "id": 696, + "logprob": -1.4238281, + "special": false, + "text": " ro" + }, + { + "id": 15664, + "logprob": -0.0006213188, + "special": false, + "text": "oster" + }, + { + "id": 15028, + "logprob": -1.1386719, + "special": false, + "text": " stands" + } + ] + }, + "generated_text": "\nAssistant: A rooster stands" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4911, + "logprob": -5.0195312, + "text": "User" + }, + { + "id": 29901, + "logprob": -0.006023407, + "text": ":" + }, + { + "id": 32000, + "logprob": -0.7915039, + "text": "" + }, + { + "id": 32001, + "logprob": -0.00013053417, + "text": "" + }, + { + 
"id": 32000, + "logprob": -1.1920929e-07, + "text": "" + }, + { + "id": 1815, + "logprob": -4.2773438, + "text": "Can" + }, + { + "id": 366, + "logprob": -0.013046265, + "text": "you" + }, + { + "id": 2649, + "logprob": -5.0234375, + "text": "tell" + }, + { + "id": 592, + "logprob": -0.3059082, + "text": "me" + }, + { + "id": 263, + "logprob": -3.5253906, + "text": "a" + }, + { + "id": 1407, + "logprob": -9.4375, + "text": "very" + }, + { + "id": 3273, + "logprob": -2.0410156, + "text": "short" + }, + { + "id": 5828, + "logprob": -0.28344727, + "text": "story" + }, + { + "id": 2729, + "logprob": -3.2617188, + "text": "based" + }, + { + "id": 373, + "logprob": -0.00075006485, + "text": "on" + }, + { + "id": 278, + "logprob": -0.14746094, + "text": "the" + }, + { + "id": 1967, + "logprob": -0.07305908, + "text": "image" + }, + { + "id": 29973, + "logprob": -0.18078613, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 32002, + "logprob": -0.007396698, + "special": true, + "text": "" + }, + { + "id": 29871, + "logprob": -6.663799e-05, + "special": false, + "text": " " + }, + { + "id": 13, + "logprob": -2.670288e-05, + "special": false, + "text": "\n" + }, + { + "id": 7900, + "logprob": -3.8146973e-06, + "special": false, + "text": "Ass" + }, + { + "id": 22137, + "logprob": 0.0, + "special": false, + "text": "istant" + }, + { + "id": 29901, + "logprob": -2.861023e-06, + "special": false, + "text": ":" + }, + { + "id": 319, + "logprob": -0.8979492, + "special": false, + "text": " A" + }, + { + "id": 696, + "logprob": -1.4296875, + "special": false, + "text": " ro" + }, + { + "id": 15664, + "logprob": -0.0006093979, + "special": false, + "text": "oster" + }, + { + "id": 15028, + "logprob": -1.1474609, + "special": false, + "text": " stands" + } + ] + }, + "generated_text": "\nAssistant: A rooster stands" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": 
null, + "text": "" + }, + { + "id": 4911, + "logprob": -5.0195312, + "text": "User" + }, + { + "id": 29901, + "logprob": -0.006023407, + "text": ":" + }, + { + "id": 32000, + "logprob": -0.7915039, + "text": "" + }, + { + "id": 32001, + "logprob": -0.00013029575, + "text": "" + }, + { + "id": 32000, + "logprob": -1.1920929e-07, + "text": "" + }, + { + "id": 1815, + "logprob": -4.2773438, + "text": "Can" + }, + { + "id": 366, + "logprob": -0.013046265, + "text": "you" + }, + { + "id": 2649, + "logprob": -5.0234375, + "text": "tell" + }, + { + "id": 592, + "logprob": -0.3059082, + "text": "me" + }, + { + "id": 263, + "logprob": -3.5253906, + "text": "a" + }, + { + "id": 1407, + "logprob": -9.4375, + "text": "very" + }, + { + "id": 3273, + "logprob": -2.0410156, + "text": "short" + }, + { + "id": 5828, + "logprob": -0.28344727, + "text": "story" + }, + { + "id": 2729, + "logprob": -3.2617188, + "text": "based" + }, + { + "id": 373, + "logprob": -0.00075006485, + "text": "on" + }, + { + "id": 278, + "logprob": -0.14746094, + "text": "the" + }, + { + "id": 1967, + "logprob": -0.07305908, + "text": "image" + }, + { + "id": 29973, + "logprob": -0.18078613, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 32002, + "logprob": -0.007396698, + "special": true, + "text": "" + }, + { + "id": 29871, + "logprob": -6.6399574e-05, + "special": false, + "text": " " + }, + { + "id": 13, + "logprob": -2.682209e-05, + "special": false, + "text": "\n" + }, + { + "id": 7900, + "logprob": -3.8146973e-06, + "special": false, + "text": "Ass" + }, + { + "id": 22137, + "logprob": 0.0, + "special": false, + "text": "istant" + }, + { + "id": 29901, + "logprob": -2.861023e-06, + "special": false, + "text": ":" + }, + { + "id": 319, + "logprob": -0.8979492, + "special": false, + "text": " A" + }, + { + "id": 696, + "logprob": -1.4296875, + "special": false, + "text": " ro" + }, + { + "id": 15664, + "logprob": -0.0006093979, + "special": false, + "text": "oster" + }, + { + "id": 15028, + "logprob": -1.1474609, + "special": false, + "text": " stands" + } + ] + }, + "generated_text": "\nAssistant: A rooster stands" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4911, + "logprob": -5.0195312, + "text": "User" + }, + { + "id": 29901, + "logprob": -0.006023407, + "text": ":" + }, + { + "id": 32000, + "logprob": -0.7915039, + "text": "" + }, + { + "id": 32001, + "logprob": -0.00013041496, + "text": "" + }, + { + "id": 32000, + "logprob": -1.1920929e-07, + "text": "" + }, + { + "id": 1815, + "logprob": -4.2773438, + "text": "Can" + }, + { + "id": 366, + "logprob": -0.013046265, + "text": "you" + }, + { + "id": 2649, + "logprob": -5.0234375, + "text": "tell" + }, + { + "id": 592, + "logprob": -0.3059082, + "text": "me" + }, + { + "id": 263, + "logprob": -3.5253906, + "text": "a" + }, + { + "id": 1407, + "logprob": -9.4375, + "text": "very" + }, + { + "id": 3273, + "logprob": -2.0410156, + "text": "short" + }, + { + "id": 5828, + "logprob": -0.28344727, + "text": "story" + }, + { + "id": 2729, + "logprob": 
"""Integration tests for the IDEFICS model served through the test launcher.

Both tests send the same image + text prompt and compare the responses with
the stored snapshots.
"""

import pytest

# Prompt shared by both tests: a markdown image reference followed by the
# user question. Kept in one place so the two tests cannot drift apart.
IDEFICS_PROMPT = (
    "User:![](https://temp-5681.s3.us-west-2.amazonaws.com/chicken_on_money.png)"
    "Can you tell me a very short story based on the image?"
)


@pytest.fixture(scope="module")
def idefics_handle(launcher):
    """Launch ``HuggingFaceM4/idefics-9b-instruct`` on one shard and yield its handle.

    The launcher context manager is held open for the whole module, so the
    handle is shared by every test in this file.
    """
    with launcher("HuggingFaceM4/idefics-9b-instruct", num_shard=1) as handle:
        yield handle


@pytest.fixture(scope="module")
async def idefics(idefics_handle):
    """Wait for the launched server's health check, then return its client."""
    # 300 is the argument forwarded to the handle's health check
    # (presumably a timeout in seconds -- confirm against the launcher).
    await idefics_handle.health(300)
    return idefics_handle.client


@pytest.mark.asyncio
async def test_idefics(idefics, response_snapshot):
    """Generate 10 tokens for the image prompt and compare with the snapshot."""
    response = await idefics.generate(
        IDEFICS_PROMPT,
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_idefics_load(idefics, generate_load, response_snapshot):
    """Issue the same request 4 times and check the generations agree.

    All four generated texts must be identical to each other, and the full
    list of responses must match the stored snapshot.
    """
    responses = await generate_load(
        idefics,
        IDEFICS_PROMPT,
        max_new_tokens=10,
        n=4,
    )

    generated_texts = [r.generated_text for r in responses]

    assert len(generated_texts) == 4
    # The original read `assert generated_texts, all([...])`, which uses the
    # `all(...)` result as the assertion *message* and only checks that the
    # list is non-empty. Assert the equality check itself.
    assert all(text == generated_texts[0] for text in generated_texts)

    assert responses == response_snapshot