# text-generation-inference/integration-tests/models/test_completion_prompts.py


import pytest
import requests
from openai import OpenAI
from huggingface_hub import InferenceClient


@pytest.fixture(scope="module")
def flash_llama_completion_handle(launcher):
    with launcher(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def flash_llama_completion(flash_llama_completion_handle):
    await flash_llama_completion_handle.health(300)
    return flash_llama_completion_handle.client


# NOTE: since `v1/completions` is a deprecated interface/endpoint we do not provide a convenience
# method for it. Instead, we use the `requests` library to make the HTTP request directly.
@pytest.mark.release
def test_flash_llama_completion_single_prompt(
    flash_llama_completion, response_snapshot
):
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json={
            "model": "tgi",
            "prompt": "What is Deep Learning?",
            "max_tokens": 10,
            "temperature": 0.0,
        },
        headers=flash_llama_completion.headers,
        stream=False,
    )
    response = response.json()
    assert len(response["choices"]) == 1
    assert (
        response["choices"][0]["text"]
        == " A Beginners Guide\nDeep learning is a subset"
    )
    assert response == response_snapshot
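

# With `include_usage` enabled, the stream must deliver exactly one usage chunk;
# the same request without `stream_options` must deliver none.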
@pytest.mark.release
async def test_flash_llama_completion_stream_usage(
    flash_llama_completion, response_snapshot
):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[
            {
                "role": "user",
                "content": "What is Deep Learning?",
            }
        ],
        max_tokens=10,
        temperature=0.0,
        stream_options={"include_usage": True},
        stream=True,
    )
    string = ""
    chunks = []
    had_usage = False
    for chunk in stream:
        chunks.append(chunk)
        # The final usage chunk carries an empty `choices` list, so only
        # accumulate text from chunks that actually contain a choice.
        if len(chunk.choices) == 1:
            index = chunk.choices[0].index
            assert index == 0
            string += chunk.choices[0].delta.content
        if chunk.usage:
            assert not had_usage
            had_usage = True
    assert had_usage
    assert (
        string
        == "**Deep Learning: An Overview**\n=====================================\n\n"
    )
    assert chunks == response_snapshot

    stream = client.chat_completion(
        model="tgi",
        messages=[
            {
                "role": "user",
                "content": "What is Deep Learning?",
            }
        ],
        max_tokens=10,
        temperature=0.0,
        # No usage
        # stream_options={"include_usage": True},
        stream=True,
    )
    string = ""
    chunks = []
    for chunk in stream:
        chunks.append(chunk)
        assert chunk.usage is None
        assert len(chunk.choices) == 1
        assert chunk.choices[0].index == 0
        string += chunk.choices[0].delta.content
    assert (
        string
        == "**Deep Learning: An Overview**\n=====================================\n\n"
    )
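

# A batched `v1/completions` request should return one choice per prompt,
# indexed in prompt order.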
@pytest.mark.release
def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json={
            "model": "tgi",
            "prompt": [
                "What is Deep Learning?",
                "Is water wet?",
                "What is the capital of France?",
                "def mai",
            ],
            "max_tokens": 10,
            "seed": 0,
            "temperature": 0.0,
        },
        headers=flash_llama_completion.headers,
        stream=False,
    )
    response = response.json()
    assert len(response["choices"]) == 4

    all_indexes = [(choice["index"], choice["text"]) for choice in response["choices"]]
    all_indexes.sort()
    all_indices, all_strings = zip(*all_indexes)

    assert list(all_indices) == [0, 1, 2, 3]
    assert list(all_strings) == [
        " A Beginners Guide\nDeep learning is a subset",
        " This is a question that has puzzled many people for",
        " Paris\nWhat is the capital of France?\nThe",
        'usculas_minusculas(s):\n    """\n',
    ]
    assert response == response_snapshot
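

# The same batch of prompts, streamed through the OpenAI client: text deltas
# arrive interleaved across choices and must be reassembled per index.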
@pytest.mark.release
async def test_flash_llama_completion_many_prompts_stream(
    flash_llama_completion, response_snapshot
):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.completions.create(
        model="tgi",
        prompt=[
            "What is Deep Learning?",
            "Is water wet?",
            "What is the capital of France?",
            "def mai",
        ],
        max_tokens=10,
        seed=0,
        temperature=0.0,
        stream=True,
    )

    strings = [""] * 4
    chunks = []
    for chunk in stream:
        chunks.append(chunk)
        index = chunk.choices[0].index
        # Four prompts were sent, so valid choice indices are 0..3.
        assert 0 <= index < 4
        strings[index] += chunk.choices[0].text

    assert list(strings) == [
        " A Beginners Guide\nDeep learning is a subset",
        " This is a question that has puzzled many people for",
        " Paris\nWhat is the capital of France?\nThe",
        'usculas_minusculas(s):\n    """\n',
    ]
    assert chunks == response_snapshot
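

# With the OpenAI client and `include_usage`, every chunk except the last
# reports no usage, and the final chunk carries the usage totals.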
@pytest.mark.release
async def test_chat_openai_usage(flash_llama_completion, response_snapshot):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat.completions.create(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": True},
    )
    chunks = []
    for chunk in stream:
        chunks.append(chunk)

    for chunk in chunks[:-1]:
        assert chunk.usage is None
    for chunk in chunks[-1:]:
        assert chunk.usage is not None
    assert chunks == response_snapshot
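

# With `include_usage` explicitly disabled, no chunk should carry usage.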
@pytest.mark.release
async def test_chat_openai_nousage(flash_llama_completion, response_snapshot):
    client = OpenAI(api_key="xx", base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat.completions.create(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": False},
    )
    chunks = []
    for chunk in stream:
        assert chunk.usage is None
        chunks.append(chunk)
    assert chunks == response_snapshot
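

# The same usage contract as above, exercised through the huggingface_hub
# InferenceClient instead of the OpenAI client.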
@pytest.mark.release
async def test_chat_hfhub_usage(flash_llama_completion, response_snapshot):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": True},
    )
    chunks = []
    for chunk in stream:
        chunks.append(chunk)

    for chunk in chunks[:-1]:
        assert chunk.usage is None
    for chunk in chunks[-1:]:
        assert chunk.usage is not None
    assert chunks == response_snapshot
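

# And the huggingface_hub client with usage reporting disabled.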
@pytest.mark.release
async def test_chat_hfhub_nousage(flash_llama_completion, response_snapshot):
    client = InferenceClient(base_url=f"{flash_llama_completion.base_url}/v1")
    stream = client.chat_completion(
        model="tgi",
        messages=[{"role": "user", "content": "Say 'OK!'"}],
        stream=True,
        max_tokens=10,
        seed=42,
        stream_options={"include_usage": False},
    )
    chunks = []
    for chunk in stream:
        assert chunk.usage is None
        chunks.append(chunk)
    assert chunks == response_snapshot