Mirror of https://github.com/huggingface/text-generation-inference.git (synced 2025-04-21 14:52:20 +00:00)
* Attempt at automatic max batch prefill (sketched below).
* Taking into account number of shards.
* Adding more cards.
* Adding A100 + H100.
* Adding a few more cards.
* Logprobs cost too much.
* h100 better name, and keep factor of 2.
* Damn inflated sparse tflops.
* Typo in h100.
* Updated the flops calculation (checked with fvcore).
* chunking by default.
* Fix prefix caching for chat completion since we removed logprobs.
* More tests.
* Dropping all the prefill logprobs.
* Add a flag that enables users to get logprobs back.
* Repairing prompt token counting.
* Fixing a few tests.
* Remove some scaffolding.
* Attempting to reduce the issues (workarounds for now).
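The first few items describe deriving a default max-batch-prefill budget from each card's compute: dense (not sparse) TFLOPS, scaled by the number of shards, with a factor-of-2 FLOPs-per-parameter rule checked against fvcore. Below is a minimal sketch of that idea under those assumptions; the card table, the latency target, and every name in it are illustrative, not TGI's actual implementation.

# Hedged sketch: estimate a default prefill-token budget from GPU TFLOPS.
# All names and numbers here are illustrative assumptions, not TGI's code.

# Dense (non-sparse) BF16 TFLOPS per card; spec sheets often quote the
# inflated 2:4-sparsity figure, which the commit message calls out.
GPU_TFLOPS = {
    "a100": 312.0,
    "h100": 989.0,
}


def flops_per_token(num_params: float) -> float:
    # Rule of thumb: ~2 FLOPs per parameter per token in the forward pass
    # (one multiply-accumulate per weight); the commit mentions keeping
    # the factor of 2 and checking the result with fvcore.
    return 2.0 * num_params


def max_prefill_tokens(card: str, num_params: float, num_shards: int,
                       target_latency_s: float = 1.0) -> int:
    # The compute budget scales with the number of tensor-parallel shards.
    budget = GPU_TFLOPS[card] * 1e12 * num_shards * target_latency_s
    return int(budget / flops_per_token(num_params))


# Example: an 11B model on two H100s gives roughly
# max_prefill_tokens("h100", 11e9, num_shards=2) ~= 90_000 tokens.

Using the dense rather than the sparse figure matters here: the 2:4-sparsity number on vendor spec sheets is double what a dense forward pass can actually reach.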
96 lines · 2.9 KiB · Python
import asyncio

import pytest


@pytest.fixture(scope="module")
def mllama_handle(launcher):
    # Launch a TGI server for the vision checkpoint, sharded across 2 GPUs.
    with launcher(
        "meta-llama/Llama-3.2-11B-Vision-Instruct",
        num_shard=2,
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def mllama(mllama_handle):
    # Wait up to 300 seconds for the server to report healthy, then hand
    # out its client.
    await mllama_handle.health(300)
    return mllama_handle.client


@pytest.mark.asyncio
async def test_mllama_simpl(mllama, response_snapshot):
    response = await mllama.chat(
        max_tokens=10,
        temperature=0.0,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Can you tell me a very short story based on the image?",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                        },
                    },
                ],
            },
        ],
    )

    assert response.usage == {
        "completion_tokens": 10,
        "prompt_tokens": 50,
        "total_tokens": 60,
    }
    assert (
        response.choices[0].message.content
        == "In a bustling city, a chicken named Cluck"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
async def test_mllama_load(mllama, generate_load, response_snapshot):
    futures = [
        mllama.chat(
            max_tokens=10,
            temperature=0.0,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Can you tell me a very short story based on the image?",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": "https://raw.githubusercontent.com/huggingface/text-generation-inference/main/integration-tests/images/chicken_on_money.png"
                            },
                        },
                    ],
                },
            ],
        )
        # TODO: with v3, 4 concurrent requests break here, because nothing
        # accounts for the image VRAM; mllama is the only model doing its
        # own thing.
        for _ in range(2)
    ]
    responses = await asyncio.gather(*futures)

    _ = [response.choices[0].message.content for response in responses]

    # XXX: TODO: Fix this test.
    # assert generated_texts[0] == "In a bustling city, a chicken named Cluck"
    # assert len(generated_texts) == 4
    # assert generated_texts, all(
    #     [text == generated_texts[0] for text in generated_texts]
    # )
    # assert responses == response_snapshot