text-generation-inference/integration-tests/neuron/test_generate.py

import pytest
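

# Fixture: starts a TGI service for the selected neuron model configuration and
# waits up to 600 seconds for it to report healthy before yielding it to tests.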
@pytest.fixture
async def tgi_service(neuron_launcher, neuron_model_config):
    model_name_or_path = neuron_model_config["neuron_model_path"]
    service_name = neuron_model_config["name"]
    with neuron_launcher(service_name, model_name_or_path) as tgi_service:
        await tgi_service.health(600)
        yield tgi_service
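

# Exercise a single client: greedy decoding with and without prompt echo,
# then sampling, then sampling with a stop sequence.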
@pytest.mark.asyncio
async def test_model_single_request(tgi_service):
    service_name = tgi_service.client.service_name
    prompt = "What is Deep Learning?"
    # Greedy bounded without input
    response = await tgi_service.client.text_generation(
        prompt, max_new_tokens=17, details=True, decoder_input_details=True
    )
    assert response.details.generated_tokens == 17
    greedy_expectations = {
        "gpt2": "\n\nDeep learning is a new field of research that has been around for a while",
        "llama": " and How Does it Work?\nDeep learning is a subset of machine learning that uses artificial",
        "mistral": "\nWhat is Deep Learning?\nDeep Learning is a type of machine learning that",
        "qwen2": " - Part 1\n\nDeep Learning is a subset of Machine Learning that is based on",
        "granite": "\n\nDeep Learning is a subset of Machine Learning, which is a branch of Art",
    }
    assert response.generated_text == greedy_expectations[service_name]
    # Greedy bounded with input
    response = await tgi_service.client.text_generation(
        "What is Deep Learning?",
        max_new_tokens=17,
        return_full_text=True,
        details=True,
        decoder_input_details=True,
    )
    assert response.details.generated_tokens == 17
    assert response.generated_text == prompt + greedy_expectations[service_name]
    # Sampling
    response = await tgi_service.client.text_generation(
        "What is Deep Learning?",
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        max_new_tokens=128,
        seed=42,
    )
    # The response must be different
    assert not response.startswith(greedy_expectations[service_name])
    # Sampling with stop sequence (using one of the words returned from the previous test)
    stop_sequence = response.split(" ")[-5]
    response = await tgi_service.client.text_generation(
        "What is Deep Learning?",
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        max_new_tokens=128,
        seed=42,
        stop_sequences=[stop_sequence],
    )
    assert response.endswith(stop_sequence)
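

# Send the same bounded greedy request several times through the
# neuron_generate_load helper and check every response against the
# expected continuation for the model under test.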
@pytest.mark.asyncio
async def test_model_multiple_requests(tgi_service, neuron_generate_load):
    num_requests = 4
    responses = await neuron_generate_load(
        tgi_service.client,
        "What is Deep Learning?",
        max_new_tokens=17,
        n=num_requests,
    )

    assert len(responses) == 4
    expectations = {
        "gpt2": "Deep learning is a new field of research that has been around for a while",
        "llama": "Deep learning is a subset of machine learning that uses artificial",
        "mistral": "Deep Learning is a type of machine learning that",
        "qwen2": "Deep Learning is a subset of Machine Learning that is based on",
        "granite": "Deep Learning is a subset of Machine Learning, which is a branch of Art",
    }
    expected = expectations[tgi_service.client.service_name]
    for r in responses:
        assert r.details.generated_tokens == 17
        assert expected in r.generated_text